From 7c88f2bf7840c0ea67ac2de2e293a653f62ea134 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jul 2017 07:05:28 +0200 Subject: tile: defconfig: Cleanup from old Kconfig options Remove old, dead Kconfig options (in order appearing in this commit): - CRYPTO_ZLIB: commit 110492183c4b ("crypto: compress - remove unused pcomp interface"); - IP_NF_TARGET_ULOG: commit d4da843e6fad ("netfilter: kill remnants of ulog targets"); Signed-off-by: Krzysztof Kozlowski Signed-off-by: Chris Metcalf --- arch/tile/configs/tilegx_defconfig | 1 - arch/tile/configs/tilepro_defconfig | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig index 0d925fa0f0c1..9f94435cc44f 100644 --- a/arch/tile/configs/tilegx_defconfig +++ b/arch/tile/configs/tilegx_defconfig @@ -409,5 +409,4 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=m CONFIG_CRYPTO_LZO=m diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig index 149d8e8eacb8..1c5bd4f8ffca 100644 --- a/arch/tile/configs/tilepro_defconfig +++ b/arch/tile/configs/tilepro_defconfig @@ -189,7 +189,6 @@ CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_TTL=m CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_ULOG=m CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m @@ -521,7 +520,6 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=m CONFIG_CRYPTO_LZO=m CONFIG_CRC_CCITT=m CONFIG_CRC7=m -- cgit From 637f23abca87d26e091e0d6647ec878d97d2c6cd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Jul 2017 10:33:02 +0300 Subject: tile: array underflow in setup_maxnodemem() My static checker correctly complains that we should have a lower bound on "node" to prevent an array underflow. Fixes: 867e359b97c9 ("arch/tile: core support for Tilera 32-bit chips.") Signed-off-by: Dan Carpenter Signed-off-by: Chris Metcalf --- arch/tile/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 443a70bccc1c..b1474e7d9afb 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -140,7 +140,7 @@ static int __init setup_maxnodemem(char *str) { char *endp; unsigned long long maxnodemem; - long node; + unsigned long node; node = str ? simple_strtoul(str, &endp, 0) : INT_MAX; if (node >= MAX_NUMNODES || *endp != ':') -- cgit From 472b46c352c9ff0b6fa57dbf85d77c51901a3368 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 6 Aug 2017 18:44:27 +0200 Subject: uapi linux/kfd_ioctl.h: only use __u32 and __u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include instead of which on Linux includes and on non-Linux platforms defines __u32 etc types. Fixes user space compilation errors like: linux/kfd_ioctl.h:33:2: error: unknown type name ‘uint32_t’ uint32_t major_version; /* from KFD */ ^~~~~~~~ Signed-off-by: Mikko Rapeli Acked-by: Arnd Bergmann Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 172 ++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7b4567bacfc2..26283fefdf5f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -23,15 +23,15 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include #define KFD_IOCTL_MAJOR_VERSION 1 #define KFD_IOCTL_MINOR_VERSION 1 struct kfd_ioctl_get_version_args { - uint32_t major_version; /* from KFD */ - uint32_t minor_version; /* from KFD */ + __u32 major_version; /* from KFD */ + __u32 minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -43,36 +43,36 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - uint64_t ring_base_address; /* to KFD */ - uint64_t write_pointer_address; /* from KFD */ - uint64_t read_pointer_address; /* from KFD */ - uint64_t doorbell_offset; /* from KFD */ - - uint32_t ring_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t queue_type; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ - uint32_t queue_id; /* from KFD */ - - uint64_t eop_buffer_address; /* to KFD */ - uint64_t eop_buffer_size; /* to KFD */ - uint64_t ctx_save_restore_address; /* to KFD */ - uint64_t ctx_save_restore_size; /* to KFD */ + __u64 ring_base_address; /* to KFD */ + __u64 write_pointer_address; /* from KFD */ + __u64 read_pointer_address; /* from KFD */ + __u64 doorbell_offset; /* from KFD */ + + __u32 ring_size; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 queue_type; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ + __u32 queue_id; /* from KFD */ + + __u64 eop_buffer_address; /* to KFD */ + __u64 eop_buffer_size; /* to KFD */ + __u64 ctx_save_restore_address; /* to KFD */ + __u64 ctx_save_restore_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - uint32_t queue_id; /* to KFD */ - uint32_t pad; + __u32 queue_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_update_queue_args { - uint64_t ring_base_address; /* to KFD */ + __u64 ring_base_address; /* to KFD */ - uint32_t queue_id; /* to KFD */ - uint32_t ring_size; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 ring_size; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -80,13 +80,13 @@ struct kfd_ioctl_update_queue_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - uint64_t alternate_aperture_base; /* to KFD */ - uint64_t alternate_aperture_size; /* to KFD */ + __u64 alternate_aperture_base; /* to KFD */ + __u64 alternate_aperture_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t default_policy; /* to KFD */ - uint32_t alternate_policy; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 default_policy; /* to KFD */ + __u32 alternate_policy; /* to KFD */ + __u32 pad; }; /* @@ -97,26 +97,26 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - uint64_t gpu_clock_counter; /* from KFD */ - uint64_t cpu_clock_counter; /* from KFD */ - uint64_t system_clock_counter; /* from KFD */ - uint64_t system_clock_freq; /* from KFD */ + __u64 gpu_clock_counter; /* from KFD */ + __u64 cpu_clock_counter; /* from KFD */ + __u64 system_clock_counter; /* from KFD */ + __u64 system_clock_freq; /* from KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; #define NUM_OF_SUPPORTED_GPUS 7 struct kfd_process_device_apertures { - uint64_t lds_base; /* from KFD */ - uint64_t lds_limit; /* from KFD */ - uint64_t scratch_base; /* from KFD */ - uint64_t scratch_limit; /* from KFD */ - uint64_t gpuvm_base; /* from KFD */ - uint64_t gpuvm_limit; /* from KFD */ - uint32_t gpu_id; /* from KFD */ - uint32_t pad; + __u64 lds_base; /* from KFD */ + __u64 lds_limit; /* from KFD */ + __u64 scratch_base; /* from KFD */ + __u64 scratch_limit; /* from KFD */ + __u64 gpuvm_base; /* from KFD */ + __u64 gpuvm_limit; /* from KFD */ + __u32 gpu_id; /* from KFD */ + __u32 pad; }; struct kfd_ioctl_get_process_apertures_args { @@ -124,8 +124,8 @@ struct kfd_ioctl_get_process_apertures_args { process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - uint32_t num_of_nodes; - uint32_t pad; + __u32 num_of_nodes; + __u32 pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -133,25 +133,25 @@ struct kfd_ioctl_get_process_apertures_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_unregister_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_address_watch_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; /* Matching HSA_EVENTTYPE */ @@ -172,44 +172,44 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_SIGNAL_EVENT_LIMIT 256 struct kfd_ioctl_create_event_args { - uint64_t event_page_offset; /* from KFD */ - uint32_t event_trigger_data; /* from KFD - signal events only */ - uint32_t event_type; /* to KFD */ - uint32_t auto_reset; /* to KFD */ - uint32_t node_id; /* to KFD - only valid for certain + __u64 event_page_offset; /* from KFD */ + __u32 event_trigger_data; /* from KFD - signal events only */ + __u32 event_type; /* to KFD */ + __u32 auto_reset; /* to KFD */ + __u32 node_id; /* to KFD - only valid for certain event types */ - uint32_t event_id; /* from KFD */ - uint32_t event_slot_index; /* from KFD */ + __u32 event_id; /* from KFD */ + __u32 event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_set_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_reset_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_memory_exception_failure { - uint32_t NotPresent; /* Page not present or supervisor privilege */ - uint32_t ReadOnly; /* Write access to a read-only page */ - uint32_t NoExecute; /* Execute access to a page marked NX */ - uint32_t pad; + __u32 NotPresent; /* Page not present or supervisor privilege */ + __u32 ReadOnly; /* Write access to a read-only page */ + __u32 NoExecute; /* Execute access to a page marked NX */ + __u32 pad; }; /* memory exception data*/ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - uint64_t va; - uint32_t gpu_id; - uint32_t pad; + __u64 va; + __u32 gpu_id; + __u32 pad; }; /* Event data*/ @@ -217,19 +217,19 @@ struct kfd_event_data { union { struct kfd_hsa_memory_exception_data memory_exception_data; }; /* From KFD */ - uint64_t kfd_event_data_ext; /* pointer to an extension structure + __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_wait_events_args { - uint64_t events_ptr; /* pointed to struct + __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - uint32_t num_events; /* to KFD */ - uint32_t wait_for_all; /* to KFD */ - uint32_t timeout; /* to KFD */ - uint32_t wait_result; /* from KFD */ + __u32 num_events; /* to KFD */ + __u32 wait_for_all; /* to KFD */ + __u32 timeout; /* to KFD */ + __u32 wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { -- cgit From a33b2d0359a0aae25d5ac7a26b85a5682485ebbb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Aug 2016 17:46:23 -0700 Subject: selftests/seccomp: Add tests for basic ptrace actions This adds tests for using only ptrace to perform syscall changes, just to validate matching behavior between seccomp events and ptrace events. Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 41 ++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 73f5ea6778ce..e61b963f011b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1262,6 +1262,13 @@ TEST_F(TRACE_poke, getpid_runs_normally) # error "Do not know how to find your architecture's registers and syscalls" #endif +/* When the syscall return can't be changed, stub out the tests for it. */ +#ifdef SYSCALL_NUM_RET_SHARE_REG +# define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(-1, action) +#else +# define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(val, action) +#endif + /* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux). */ @@ -1357,7 +1364,7 @@ void change_syscall(struct __test_metadata *_metadata, #ifdef SYSCALL_NUM_RET_SHARE_REG TH_LOG("Can't modify syscall return on this architecture"); #else - regs.SYSCALL_RET = 1; + regs.SYSCALL_RET = EPERM; #endif #ifdef HAVE_GETREGS @@ -1426,6 +1433,8 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, if (nr == __NR_getpid) change_syscall(_metadata, tracee, __NR_getppid); + if (nr == __NR_open) + change_syscall(_metadata, tracee, -1); } FIXTURE_DATA(TRACE_syscall) { @@ -1480,6 +1489,28 @@ FIXTURE_TEARDOWN(TRACE_syscall) free(self->prog.filter); } +TEST_F(TRACE_syscall, ptrace_syscall_redirected) +{ + /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ + teardown_trace_fixture(_metadata, self->tracer); + self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, + true); + + /* Tracer will redirect getpid to getppid. */ + EXPECT_NE(self->mypid, syscall(__NR_getpid)); +} + +TEST_F(TRACE_syscall, ptrace_syscall_dropped) +{ + /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ + teardown_trace_fixture(_metadata, self->tracer); + self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, + true); + + /* Tracer should skip the open syscall, resulting in EPERM. */ + EXPECT_SYSCALL_RETURN(EPERM, syscall(__NR_open)); +} + TEST_F(TRACE_syscall, syscall_allowed) { long ret; @@ -1520,13 +1551,8 @@ TEST_F(TRACE_syscall, syscall_dropped) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); ASSERT_EQ(0, ret); -#ifdef SYSCALL_NUM_RET_SHARE_REG - /* gettid has been skipped */ - EXPECT_EQ(-1, syscall(__NR_gettid)); -#else /* gettid has been skipped and an altered return value stored. */ - EXPECT_EQ(1, syscall(__NR_gettid)); -#endif + EXPECT_SYSCALL_RETURN(EPERM, syscall(__NR_gettid)); EXPECT_NE(self->mytid, syscall(__NR_gettid)); } @@ -1557,6 +1583,7 @@ TEST_F(TRACE_syscall, skip_after_RET_TRACE) ASSERT_EQ(0, ret); /* Tracer will redirect getpid to getppid, and we should see EPERM. */ + errno = 0; EXPECT_EQ(-1, syscall(__NR_getpid)); EXPECT_EQ(EPERM, errno); } -- cgit From 967d7ba8415139107033770a40c7ec7dd17fbcb1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Dec 2015 11:39:36 -0800 Subject: selftests/seccomp: Add simple seccomp overhead benchmark This attempts to produce a comparison between native getpid() and a RET_ALLOW-filtered getpid(), to measure the overhead cost of using seccomp(). Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/Makefile | 18 ++-- .../testing/selftests/seccomp/seccomp_benchmark.c | 99 ++++++++++++++++++++++ 2 files changed, 112 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/seccomp/seccomp_benchmark.c diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile index aeb0c805f3ca..553d870b4ca9 100644 --- a/tools/testing/selftests/seccomp/Makefile +++ b/tools/testing/selftests/seccomp/Makefile @@ -1,8 +1,16 @@ -TEST_GEN_PROGS := seccomp_bpf -CFLAGS += -Wl,-no-as-needed -Wall -LDFLAGS += -lpthread +all: include ../lib.mk -$(TEST_GEN_PROGS): seccomp_bpf.c ../kselftest_harness.h - $(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ +.PHONY: all clean + +BINARIES := seccomp_bpf seccomp_benchmark +CFLAGS += -Wl,-no-as-needed -Wall + +seccomp_bpf: seccomp_bpf.c ../kselftest_harness.h + $(CC) $(CFLAGS) $(LDFLAGS) -lpthread $< -o $@ + +TEST_PROGS += $(BINARIES) +EXTRA_CLEAN := $(BINARIES) + +all: $(BINARIES) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c new file mode 100644 index 000000000000..5838c8697ec3 --- /dev/null +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -0,0 +1,99 @@ +/* + * Strictly speaking, this is not a test. But it can report during test + * runs so relative performace can be measured. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +unsigned long long timing(clockid_t clk_id, unsigned long long samples) +{ + pid_t pid, ret; + unsigned long long i; + struct timespec start, finish; + + pid = getpid(); + assert(clock_gettime(clk_id, &start) == 0); + for (i = 0; i < samples; i++) { + ret = syscall(__NR_getpid); + assert(pid == ret); + } + assert(clock_gettime(clk_id, &finish) == 0); + + i = finish.tv_sec - start.tv_sec; + i *= 1000000000; + i += finish.tv_nsec - start.tv_nsec; + + printf("%lu.%09lu - %lu.%09lu = %llu\n", + finish.tv_sec, finish.tv_nsec, + start.tv_sec, start.tv_nsec, + i); + + return i; +} + +unsigned long long calibrate(void) +{ + unsigned long long i; + + printf("Calibrating reasonable sample size...\n"); + + for (i = 5; ; i++) { + unsigned long long samples = 1 << i; + + /* Find something that takes more than 5 seconds to run. */ + if (timing(CLOCK_REALTIME, samples) / 1000000000ULL > 5) + return samples; + } +} + +int main(int argc, char *argv[]) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + unsigned long long samples; + unsigned long long native, filtered; + + if (argc > 1) + samples = strtoull(argv[1], NULL, 0); + else + samples = calibrate(); + + printf("Benchmarking %llu samples...\n", samples); + + native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid native: %llu ns\n", native); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + assert(ret == 0); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + assert(ret == 0); + + filtered = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW: %llu ns\n", filtered); + + printf("Estimated seccomp overhead per syscall: %llu ns\n", + filtered - native); + + if (filtered == native) + printf("Trying running again with more samples.\n"); + + return 0; +} -- cgit From f3f6e30669c048f47d51ea59df9946a91f551c4c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Aug 2017 14:08:39 -0700 Subject: selftests/seccomp: Refactor RET_ERRNO tests This refactors the errno tests (since they all use the same pattern for their filter) and adds a RET_DATA field ordering test. Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- tools/testing/selftests/seccomp/seccomp_bpf.c | 95 ++++++++++++++++----------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e61b963f011b..2fb49d99588d 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -136,7 +136,7 @@ TEST(no_new_privs_support) } } -/* Tests kernel support by checking for a copy_from_user() fault on * NULL. */ +/* Tests kernel support by checking for a copy_from_user() fault on NULL. */ TEST(mode_filter_support) { long ret; @@ -541,26 +541,30 @@ TEST(arg_out_of_range) EXPECT_EQ(EINVAL, errno); } +#define ERRNO_FILTER(name, errno) \ + struct sock_filter _read_filter_##name[] = { \ + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, \ + offsetof(struct seccomp_data, nr)), \ + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), \ + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno), \ + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), \ + }; \ + struct sock_fprog prog_##name = { \ + .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \ + .filter = _read_filter_##name, \ + } + +/* Make sure basic errno values are correctly passed through a filter. */ TEST(ERRNO_valid) { - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; + ERRNO_FILTER(valid, E2BIG); long ret; pid_t parent = getppid(); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid); ASSERT_EQ(0, ret); EXPECT_EQ(parent, syscall(__NR_getppid)); @@ -568,26 +572,17 @@ TEST(ERRNO_valid) EXPECT_EQ(E2BIG, errno); } +/* Make sure an errno of zero is correctly handled by the arch code. */ TEST(ERRNO_zero) { - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; + ERRNO_FILTER(zero, 0); long ret; pid_t parent = getppid(); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero); ASSERT_EQ(0, ret); EXPECT_EQ(parent, syscall(__NR_getppid)); @@ -595,26 +590,21 @@ TEST(ERRNO_zero) EXPECT_EQ(0, read(0, NULL, 0)); } +/* + * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller. + * This tests that the errno value gets capped correctly, fixed by + * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO"). + */ TEST(ERRNO_capped) { - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; + ERRNO_FILTER(capped, 4096); long ret; pid_t parent = getppid(); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped); ASSERT_EQ(0, ret); EXPECT_EQ(parent, syscall(__NR_getppid)); @@ -622,6 +612,37 @@ TEST(ERRNO_capped) EXPECT_EQ(4095, errno); } +/* + * Filters are processed in reverse order: last applied is executed first. + * Since only the SECCOMP_RET_ACTION mask is tested for return values, the + * SECCOMP_RET_DATA mask results will follow the most recently applied + * matching filter return (and not the lowest or highest value). + */ +TEST(ERRNO_order) +{ + ERRNO_FILTER(first, 11); + ERRNO_FILTER(second, 13); + ERRNO_FILTER(third, 12); + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(-1, read(0, NULL, 0)); + EXPECT_EQ(12, errno); +} + FIXTURE_DATA(TRAP) { struct sock_fprog prog; }; -- cgit From deb4de8b31bc5bf21efb6ac31150a01a631cd647 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Aug 2017 15:00:40 -0700 Subject: seccomp: Provide matching filter for introspection Both the upcoming logging improvements and changes to RET_KILL will need to know which filter a given seccomp return value originated from. In order to delay logic processing of result until after the seccomp loop, this adds a single pointer assignment on matches. This will allow both log and RET_KILL logic to work off the filter rather than doing more expensive tests inside the time-critical run_filters loop. Running tight cycles of getpid() with filters attached shows no measurable difference in speed. Suggested-by: Tyler Hicks Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- kernel/seccomp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 98b59b5db90b..1f3347fc2605 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -171,10 +171,14 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters + * @match: stores struct seccomp_filter that resulted in the return value, + * unless filter returned SECCOMP_RET_ALLOW, in which case it will + * be unchanged. * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(const struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd, + struct seccomp_filter **match) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -198,8 +202,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { ret = cur_ret; + *match = f; + } } return ret; } @@ -566,6 +572,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_filter *match = NULL; int data; /* @@ -574,7 +581,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ rmb(); - filter_ret = seccomp_run_filters(sd); + filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION; @@ -638,6 +645,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_ALLOW: + /* + * Note that the "match" filter will always be NULL for + * this action since SECCOMP_RET_ALLOW is the starting + * state in seccomp_run_filters(). + */ return 0; case SECCOMP_RET_KILL: -- cgit From 8e5f1ad116df6b0de65eac458d5e7c318d1c05af Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:52 +0000 Subject: seccomp: Sysctl to display available actions This patch creates a read-only sysctl containing an ordered list of seccomp actions that the kernel supports. The ordering, from left to right, is the lowest action value (kill) to the highest action value (allow). Currently, a read of the sysctl file would return "kill trap errno trace allow". The contents of this sysctl file can be useful for userspace code as well as the system administrator. The path to the sysctl is: /proc/sys/kernel/seccomp/actions_avail libseccomp and other userspace code can easily determine which actions the current kernel supports. The set of actions supported by the current kernel may be different than the set of action macros found in kernel headers that were installed where the userspace code was built. In addition, this sysctl will allow system administrators to know which actions are supported by the kernel and make it easier to configure exactly what seccomp logs through the audit subsystem. Support for this level of logging configuration will come in a future patch. Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- Documentation/sysctl/kernel.txt | 1 + Documentation/userspace-api/seccomp_filter.rst | 16 ++++++++ kernel/seccomp.c | 51 ++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index bac23c198360..995c42cf86ba 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -74,6 +74,7 @@ show up in /proc/sys/kernel: - reboot-cmd [ SPARC only ] - rtsig-max - rtsig-nr +- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst - sem - sem_next_id [ sysv ipc ] - sg-big-buff [ generic SCSI device (sg) ] diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index f71eb5ef1f2d..35fc7cbf1d95 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -169,7 +169,23 @@ The ``samples/seccomp/`` directory contains both an x86-specific example and a more generic example of a higher level macro interface for BPF program generation. +Sysctls +======= + +Seccomp's sysctl files can be found in the ``/proc/sys/kernel/seccomp/`` +directory. Here's a description of each file in that directory: + +``actions_avail``: + A read-only ordered list of seccomp return values (refer to the + ``SECCOMP_RET_*`` macros above) in string form. The ordering, from + left-to-right, is the least permissive return value to the most + permissive return value. + The list represents the set of seccomp return values supported + by the kernel. A userspace program may use this list to + determine if the actions found in the ``seccomp.h``, when the + program was built, differs from the set of actions actually + supported in the current running kernel. Adding architecture support =========================== diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1f3347fc2605..5f19f41e4e50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include #include #include +#include #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include @@ -934,3 +936,52 @@ out: return ret; } #endif + +#ifdef CONFIG_SYSCTL + +/* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_TRAP_NAME "trap" +#define SECCOMP_RET_ERRNO_NAME "errno" +#define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_ALLOW_NAME "allow" + +static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_ALLOW_NAME; + +static struct ctl_path seccomp_sysctl_path[] = { + { .procname = "kernel", }, + { .procname = "seccomp", }, + { } +}; + +static struct ctl_table seccomp_sysctl_table[] = { + { + .procname = "actions_avail", + .data = (void *) &seccomp_actions_avail, + .maxlen = sizeof(seccomp_actions_avail), + .mode = 0444, + .proc_handler = proc_dostring, + }, + { } +}; + +static int __init seccomp_sysctl_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); + if (!hdr) + pr_warn("seccomp: sysctl registration failed\n"); + else + kmemleak_not_leak(hdr); + + return 0; +} + +device_initcall(seccomp_sysctl_init) + +#endif /* CONFIG_SYSCTL */ -- cgit From d612b1fd8010d0d67b5287fe146b8b55bcbb8655 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:53 +0000 Subject: seccomp: Operation for checking if an action is available Userspace code that needs to check if the kernel supports a given action may not be able to use the /proc/sys/kernel/seccomp/actions_avail sysctl. The process may be running in a sandbox and, therefore, sufficient filesystem access may not be available. This patch adds an operation to the seccomp(2) syscall that allows userspace code to ask the kernel if a given action is available. If the action is supported by the kernel, 0 is returned. If the action is not supported by the kernel, -1 is returned with errno set to -EOPNOTSUPP. If this check is attempted on a kernel that doesn't support this new operation, -1 is returned with errno set to -EINVAL meaning that userspace code will have the ability to differentiate between the two error cases. Signed-off-by: Tyler Hicks Suggested-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 5 ++-- kernel/seccomp.c | 26 +++++++++++++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 36 +++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0f238a43ff1e..aaad61cc46bc 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -11,8 +11,9 @@ #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ /* Valid operations for seccomp syscall. */ -#define SECCOMP_SET_MODE_STRICT 0 -#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_SET_MODE_STRICT 0 +#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_GET_ACTION_AVAIL 2 /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f19f41e4e50..7a6089f66fed 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -808,6 +808,27 @@ static inline long seccomp_set_mode_filter(unsigned int flags, } #endif +static long seccomp_get_action_avail(const char __user *uaction) +{ + u32 action; + + if (copy_from_user(&action, uaction, sizeof(action))) + return -EFAULT; + + switch (action) { + case SECCOMP_RET_KILL: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + case SECCOMP_RET_ALLOW: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, const char __user *uargs) @@ -819,6 +840,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); + case SECCOMP_GET_ACTION_AVAIL: + if (flags != 0) + return -EINVAL; + + return seccomp_get_action_avail(uargs); default: return -EINVAL; } diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 2fb49d99588d..1f2888f6678b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1731,6 +1731,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) #define SECCOMP_SET_MODE_FILTER 1 #endif +#ifndef SECCOMP_GET_ACTION_AVAIL +#define SECCOMP_GET_ACTION_AVAIL 2 +#endif + #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif @@ -2469,6 +2473,38 @@ TEST(syscall_restart) _metadata->passed = 0; } +TEST(get_action_avail) +{ + __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, + SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, + SECCOMP_RET_ALLOW }; + __u32 unknown_action = 0x10000000U; + int i; + long ret; + + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + ASSERT_NE(EINVAL, errno) { + TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!"); + } + EXPECT_EQ(ret, 0); + + for (i = 0; i < ARRAY_SIZE(actions); i++) { + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]); + EXPECT_EQ(ret, 0) { + TH_LOG("Expected action (0x%X) not available!", + actions[i]); + } + } + + /* Check that an unknown action is handled properly (EOPNOTSUPP) */ + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, EOPNOTSUPP); +} + /* * TODO: * - add microbenchmarks -- cgit From 0ddec0fc8900201c0897b87b762b7c420436662f Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:54 +0000 Subject: seccomp: Sysctl to configure actions that are allowed to be logged Adminstrators can write to this sysctl to set the seccomp actions that are allowed to be logged. Any actions not found in this sysctl will not be logged. For example, all SECCOMP_RET_KILL, SECCOMP_RET_TRAP, and SECCOMP_RET_ERRNO actions would be loggable if "kill trap errno" were written to the sysctl. SECCOMP_RET_TRACE actions would not be logged since its string representation ("trace") wasn't present in the sysctl value. The path to the sysctl is: /proc/sys/kernel/seccomp/actions_logged The actions_avail sysctl can be read to discover the valid action names that can be written to the actions_logged sysctl with the exception of "allow". SECCOMP_RET_ALLOW actions cannot be configured for logging. The default setting for the sysctl is to allow all actions to be logged except SECCOMP_RET_ALLOW. While only SECCOMP_RET_KILL actions are currently logged, an upcoming patch will allow applications to request additional actions to be logged. There's one important exception to this sysctl. If a task is specifically being audited, meaning that an audit context has been allocated for the task, seccomp will log all actions other than SECCOMP_RET_ALLOW despite the value of actions_logged. This exception preserves the existing auditing behavior of tasks with an allocated audit context. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if audit_enabled && task-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 18 +++ include/linux/audit.h | 6 +- kernel/seccomp.c | 171 ++++++++++++++++++++++++- 3 files changed, 187 insertions(+), 8 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 35fc7cbf1d95..2d1d8ab04ac5 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -187,6 +187,24 @@ directory. Here's a description of each file in that directory: program was built, differs from the set of actions actually supported in the current running kernel. +``actions_logged``: + A read-write ordered list of seccomp return values (refer to the + ``SECCOMP_RET_*`` macros above) that are allowed to be logged. Writes + to the file do not need to be in ordered form but reads from the file + will be ordered in the same way as the actions_avail sysctl. + + It is important to note that the value of ``actions_logged`` does not + prevent certain actions from being logged when the audit subsystem is + configured to audit a task. If the action is not found in + ``actions_logged`` list, the final decision on whether to audit the + action for that task is ultimately left up to the audit subsystem to + decide for all seccomp return values other than ``SECCOMP_RET_ALLOW``. + + The ``allow`` string is not accepted in the ``actions_logged`` sysctl + as it is not possible to log ``SECCOMP_RET_ALLOW`` actions. Attempting + to write ``allow`` to the sysctl will result in an EINVAL being + returned. + Adding architecture support =========================== diff --git a/include/linux/audit.h b/include/linux/audit.h index 2150bdccfbab..8c30f06d639d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -314,11 +314,7 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (!audit_enabled) - return; - - /* Force a record to be reported if a signal was delivered. */ - if (signr || unlikely(!audit_dummy_context())) + if (audit_enabled && unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7a6089f66fed..54357e361aea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -522,6 +522,45 @@ static void seccomp_send_sigsys(int syscall, int reason) } #endif /* CONFIG_SECCOMP_FILTER */ +/* For use with seccomp_actions_logged */ +#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_TRAP (1 << 2) +#define SECCOMP_LOG_ERRNO (1 << 3) +#define SECCOMP_LOG_TRACE (1 << 4) +#define SECCOMP_LOG_ALLOW (1 << 5) + +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + +static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +{ + bool log = false; + + switch (action) { + case SECCOMP_RET_ALLOW: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + break; + case SECCOMP_RET_KILL: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL; + } + + /* + * Force an audit message to be emitted when the action is RET_KILL and + * the action is allowed to be logged by the admin. + */ + if (log) + return __audit_seccomp(syscall, signr, action); + + /* + * Let the audit subsystem decide if the action should be audited based + * on whether the current task itself is being audited. + */ + return audit_seccomp(syscall, signr, action); +} + /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit @@ -547,7 +586,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); do_exit(SIGKILL); } @@ -656,7 +695,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - audit_seccomp(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -673,7 +712,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - audit_seccomp(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action); return -1; } #else @@ -978,6 +1017,127 @@ static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_ALLOW_NAME; +struct seccomp_log_name { + u32 log; + const char *name; +}; + +static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, + { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, + { } +}; + +static bool seccomp_names_from_actions_logged(char *names, size_t size, + u32 actions_logged) +{ + const struct seccomp_log_name *cur; + bool append_space = false; + + for (cur = seccomp_log_names; cur->name && size; cur++) { + ssize_t ret; + + if (!(actions_logged & cur->log)) + continue; + + if (append_space) { + ret = strscpy(names, " ", size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } else + append_space = true; + + ret = strscpy(names, cur->name, size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } + + return true; +} + +static bool seccomp_action_logged_from_name(u32 *action_logged, + const char *name) +{ + const struct seccomp_log_name *cur; + + for (cur = seccomp_log_names; cur->name; cur++) { + if (!strcmp(cur->name, name)) { + *action_logged = cur->log; + return true; + } + } + + return false; +} + +static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) +{ + char *name; + + *actions_logged = 0; + while ((name = strsep(&names, " ")) && *name) { + u32 action_logged = 0; + + if (!seccomp_action_logged_from_name(&action_logged, name)) + return false; + + *actions_logged |= action_logged; + } + + return true; +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + memset(names, 0, sizeof(names)); + + if (!write) { + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged)) + return -EINVAL; + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) { + u32 actions_logged; + + if (!seccomp_actions_logged_from_names(&actions_logged, + table.data)) + return -EINVAL; + + if (actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; + + seccomp_actions_logged = actions_logged; + } + + return 0; +} + static struct ctl_path seccomp_sysctl_path[] = { { .procname = "kernel", }, { .procname = "seccomp", }, @@ -992,6 +1152,11 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0444, .proc_handler = proc_dostring, }, + { + .procname = "actions_logged", + .mode = 0644, + .proc_handler = seccomp_actions_logged_handler, + }, { } }; -- cgit From 2b7ea5b5b5799f2878ed454bb48032bed6d101d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:55 +0000 Subject: seccomp: Selftest for detection of filter flag support Userspace needs to be able to reliably detect the support of a filter flag. A good way of doing that is by attempting to enter filter mode, with the flag bit(s) in question set, and a NULL pointer for the args parameter of seccomp(2). EFAULT indicates that the flag is valid and EINVAL indicates that the flag is invalid. This patch adds a selftest that can be used to test this method of detection in userspace. Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 60 +++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 1f2888f6678b..abf708e09892 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1835,6 +1835,66 @@ TEST(seccomp_syscall_mode_lock) } } +/* + * Test detection of known and unknown filter flags. Userspace needs to be able + * to check if a filter flag is supported by the current kernel and a good way + * of doing that is by attempting to enter filter mode, with the flag bit in + * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates + * that the flag is valid and EINVAL indicates that the flag is invalid. + */ +TEST(detect_seccomp_filter_flags) +{ + unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC }; + unsigned int flag, all_flags; + int i; + long ret; + + /* Test detection of known-good filter flags */ + for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) { + flag = flags[i]; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!", + flag); + } + + all_flags |= flag; + } + + /* Test detection of all known-good filter flags */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, all_flags, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!", + all_flags); + } + + /* Test detection of an unknown filter flag */ + flag = -1; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!", + flag); + } + + /* + * Test detection of an unknown filter flag that may simply need to be + * added to this test + */ + flag = flags[ARRAY_SIZE(flags) - 1] << 1; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?", + flag); + } +} + TEST(TSYNC_first) { struct sock_filter filter[] = { -- cgit From e66a39977985b1e69e17c4042cb290768eca9b02 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:56 +0000 Subject: seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW Add a new filter flag, SECCOMP_FILTER_FLAG_LOG, that enables logging for all actions except for SECCOMP_RET_ALLOW for the given filter. SECCOMP_RET_KILL actions are always logged, when "kill" is in the actions_logged sysctl, and SECCOMP_RET_ALLOW actions are never logged, regardless of this flag. This flag can be used to create noisy filters that result in all non-allowed actions to be logged. A process may have one noisy filter, which is loaded with this flag, as well as a quiet filter that's not loaded with this flag. This allows for the actions in a set of filters to be selectively conveyed to the admin. Since a system could have a large number of allocated seccomp_filter structs, struct packing was taken in consideration. On 64 bit x86, the new log member takes up one byte of an existing four byte hole in the struct. On 32 bit x86, the new log member creates a new four byte hole (unavoidable) and consumes one of those bytes. Unfortunately, the tests added for SECCOMP_FILTER_FLAG_LOG are not capable of inspecting the audit log to verify that the actions taken in the filter were logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- include/linux/seccomp.h | 3 +- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 26 +++++++--- tools/testing/selftests/seccomp/seccomp_bpf.c | 69 ++++++++++++++++++++++++++- 4 files changed, 91 insertions(+), 8 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index ecc296c137cd..c8bef436b61d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,7 +3,8 @@ #include -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC) +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_LOG) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index aaad61cc46bc..19a611d0712e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -17,6 +17,7 @@ /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 +#define SECCOMP_FILTER_FLAG_LOG 2 /* * All BPF programs must return a 32-bit value. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54357e361aea..ed9fde418fc4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,7 @@ * get/put helpers should be used when accessing an instance * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. + * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @@ -59,6 +60,7 @@ */ struct seccomp_filter { refcount_t usage; + bool log; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -452,6 +454,10 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Set log flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_LOG) + filter->log = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -532,15 +538,22 @@ static void seccomp_send_sigsys(int syscall, int reason) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; -static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +static inline void seccomp_log(unsigned long syscall, long signr, u32 action, + bool requested) { bool log = false; switch (action) { case SECCOMP_RET_ALLOW: + break; case SECCOMP_RET_TRAP: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; + break; case SECCOMP_RET_ERRNO: + log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; + break; case SECCOMP_RET_TRACE: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; case SECCOMP_RET_KILL: default: @@ -548,8 +561,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action) } /* - * Force an audit message to be emitted when the action is RET_KILL and - * the action is allowed to be logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL or + * the FILTER_FLAG_LOG bit was set and the action is allowed to be + * logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -586,7 +600,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); do_exit(SIGKILL); } @@ -695,7 +709,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - seccomp_log(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -712,7 +726,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - seccomp_log(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action, match ? match->log : false); return -1; } #else diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index abf708e09892..1c8c22ce7740 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1739,6 +1739,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif +#ifndef SECCOMP_FILTER_FLAG_LOG +#define SECCOMP_FILTER_FLAG_LOG 2 +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -1844,7 +1848,8 @@ TEST(seccomp_syscall_mode_lock) */ TEST(detect_seccomp_filter_flags) { - unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC }; + unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, + SECCOMP_FILTER_FLAG_LOG }; unsigned int flag, all_flags; int i; long ret; @@ -2533,6 +2538,67 @@ TEST(syscall_restart) _metadata->passed = 0; } +TEST_SIGNAL(filter_flag_log, SIGSYS) +{ + struct sock_filter allow_filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter kill_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog allow_prog = { + .len = (unsigned short)ARRAY_SIZE(allow_filter), + .filter = allow_filter, + }; + struct sock_fprog kill_prog = { + .len = (unsigned short)ARRAY_SIZE(kill_filter), + .filter = kill_filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */ + ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG, + &allow_prog); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + EXPECT_NE(0, ret) { + TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!"); + } + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!"); + } + + /* Verify that a simple, permissive filter can be added with no flags */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog); + EXPECT_EQ(0, ret); + + /* See if the same filter can be added with the FILTER_FLAG_LOG flag */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, + &allow_prog); + ASSERT_NE(EINVAL, errno) { + TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!"); + } + EXPECT_EQ(0, ret); + + /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, + &kill_prog); + EXPECT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + TEST(get_action_avail) { __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, @@ -2573,6 +2639,7 @@ TEST(get_action_avail) * - endianness checking when appropriate * - 64-bit arg prodding * - arch value testing (x86 modes especially) + * - verify that FILTER_FLAG_LOG filters generate log messages * - ... */ -- cgit From 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:57 +0000 Subject: seccomp: Action to log before allowing Add a new action, SECCOMP_RET_LOG, that logs a syscall before allowing the syscall. At the implementation level, this action is identical to the existing SECCOMP_RET_ALLOW action. However, it can be very useful when initially developing a seccomp filter for an application. The developer can set the default action to be SECCOMP_RET_LOG, maybe mark any obviously needed syscalls with SECCOMP_RET_ALLOW, and then put the application through its paces. A list of syscalls that triggered the default action (SECCOMP_RET_LOG) can be easily gleaned from the logs and that list can be used to build the syscall whitelist. Finally, the developer can change the default action to the desired value. This provides a more friendly experience than seeing the application get killed, then updating the filter and rebuilding the app, seeing the application get killed due to a different syscall, then updating the filter and rebuilding the app, etc. The functionality is similar to what's supported by the various LSMs. SELinux has permissive mode, AppArmor has complain mode, SMACK has bring-up mode, etc. SECCOMP_RET_LOG is given a lower value than SECCOMP_RET_ALLOW as allow while logging is slightly more restrictive than quietly allowing. Unfortunately, the tests added for SECCOMP_RET_LOG are not capable of inspecting the audit log to verify that the syscall was logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if action == RET_LOG && RET_LOG in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 9 +++ include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 23 ++++-- tools/testing/selftests/seccomp/seccomp_bpf.c | 98 +++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 6 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 2d1d8ab04ac5..f4977357daf2 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -141,6 +141,15 @@ In precedence order, they are: allow use of ptrace, even of other sandboxed processes, without extreme care; ptracers can use this mechanism to escape.) +``SECCOMP_RET_LOG``: + Results in the system call being executed after it is logged. This + should be used by application developers to learn which syscalls their + application needs without having to iterate through multiple test and + development cycles to build the list. + + This action will only be logged if "log" is present in the + actions_logged sysctl string. + ``SECCOMP_RET_ALLOW``: Results in the system call being executed. diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 19a611d0712e..f94433263e4b 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -31,6 +31,7 @@ #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ed9fde418fc4..59cde2ed3b92 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -533,10 +533,12 @@ static void seccomp_send_sigsys(int syscall, int reason) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) -#define SECCOMP_LOG_ALLOW (1 << 5) +#define SECCOMP_LOG_LOG (1 << 5) +#define SECCOMP_LOG_ALLOW (1 << 6) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | + SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) @@ -555,15 +557,18 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; + case SECCOMP_RET_LOG: + log = seccomp_actions_logged & SECCOMP_LOG_LOG; + break; case SECCOMP_RET_KILL: default: log = seccomp_actions_logged & SECCOMP_LOG_KILL; } /* - * Force an audit message to be emitted when the action is RET_KILL or - * the FILTER_FLAG_LOG bit was set and the action is allowed to be - * logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL, + * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is + * allowed to be logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -699,6 +704,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_LOG: + seccomp_log(this_syscall, 0, action, true); + return 0; + case SECCOMP_RET_ALLOW: /* * Note that the "match" filter will always be NULL for @@ -873,6 +882,7 @@ static long seccomp_get_action_avail(const char __user *uaction) case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: + case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: break; default: @@ -1023,12 +1033,14 @@ out: #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { @@ -1041,6 +1053,7 @@ static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, { } }; diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 1c8c22ce7740..7372958eccb5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -74,7 +74,12 @@ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#endif +#ifndef SECCOMP_RET_LOG +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#endif +#ifndef SECCOMP_RET_ACTION /* Masks for the return value sections. */ #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU @@ -342,6 +347,28 @@ TEST(empty_prog) EXPECT_EQ(EINVAL, errno); } +TEST(log_all) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + /* getppid() should succeed and be logged (no check for logging) */ + EXPECT_EQ(parent, syscall(__NR_getppid)); +} + TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS) { struct sock_filter filter[] = { @@ -756,6 +783,7 @@ TEST_F(TRAP, handler) FIXTURE_DATA(precedence) { struct sock_fprog allow; + struct sock_fprog log; struct sock_fprog trace; struct sock_fprog error; struct sock_fprog trap; @@ -767,6 +795,13 @@ FIXTURE_SETUP(precedence) struct sock_filter allow_insns[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; + struct sock_filter log_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG), + }; struct sock_filter trace_insns[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), @@ -803,6 +838,7 @@ FIXTURE_SETUP(precedence) memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \ self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns) FILTER_ALLOC(allow); + FILTER_ALLOC(log); FILTER_ALLOC(trace); FILTER_ALLOC(error); FILTER_ALLOC(trap); @@ -813,6 +849,7 @@ FIXTURE_TEARDOWN(precedence) { #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter) FILTER_FREE(allow); + FILTER_FREE(log); FILTER_FREE(trace); FILTER_FREE(error); FILTER_FREE(trap); @@ -830,6 +867,8 @@ TEST_F(precedence, allow_ok) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -854,6 +893,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -885,6 +926,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); @@ -906,6 +949,8 @@ TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -931,6 +976,8 @@ TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -952,6 +999,8 @@ TEST_F(precedence, errno_is_third) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -970,6 +1019,8 @@ TEST_F(precedence, errno_is_third_in_any_order) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); @@ -992,6 +1043,8 @@ TEST_F(precedence, trace_is_fourth) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); /* Should work just fine. */ @@ -1013,12 +1066,54 @@ TEST_F(precedence, trace_is_fourth_in_any_order) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); /* Should work just fine. */ EXPECT_EQ(parent, syscall(__NR_getppid)); /* No ptracer */ EXPECT_EQ(-1, syscall(__NR_getpid)); } +TEST_F(precedence, log_is_fifth) +{ + pid_t mypid, parent; + long ret; + + mypid = getpid(); + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* Should also work just fine */ + EXPECT_EQ(mypid, syscall(__NR_getpid)); +} + +TEST_F(precedence, log_is_fifth_in_any_order) +{ + pid_t mypid, parent; + long ret; + + mypid = getpid(); + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* Should also work just fine */ + EXPECT_EQ(mypid, syscall(__NR_getpid)); +} + #ifndef PTRACE_O_TRACESECCOMP #define PTRACE_O_TRACESECCOMP 0x00000080 #endif @@ -2603,7 +2698,7 @@ TEST(get_action_avail) { __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, - SECCOMP_RET_ALLOW }; + SECCOMP_RET_LOG, SECCOMP_RET_ALLOW }; __u32 unknown_action = 0x10000000U; int i; long ret; @@ -2640,6 +2735,7 @@ TEST(get_action_avail) * - 64-bit arg prodding * - arch value testing (x86 modes especially) * - verify that FILTER_FLAG_LOG filters generate log messages + * - verify that RET_LOG generates log messages * - ... */ -- cgit From fd76875ca289a3d4722f266fd2d5532a27083903 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 12:53:18 -0700 Subject: seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD In preparation for adding SECCOMP_RET_KILL_PROCESS, rename SECCOMP_RET_KILL to the more accurate SECCOMP_RET_KILL_THREAD. The existing selftest values are intentionally left as SECCOMP_RET_KILL just to be sure we're exercising the alias. Signed-off-by: Kees Cook --- Documentation/networking/filter.txt | 2 +- Documentation/userspace-api/seccomp_filter.rst | 4 +-- include/uapi/linux/seccomp.h | 3 +- kernel/seccomp.c | 39 ++++++++++++++------------ samples/seccomp/bpf-direct.c | 4 +-- samples/seccomp/bpf-helper.h | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 17 ++++++----- 7 files changed, 39 insertions(+), 32 deletions(-) diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index b69b205501de..73aa0f12156d 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -337,7 +337,7 @@ Examples for low-level BPF: jeq #14, good /* __NR_rt_sigprocmask */ jeq #13, good /* __NR_rt_sigaction */ jeq #35, good /* __NR_nanosleep */ - bad: ret #0 /* SECCOMP_RET_KILL */ + bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ The above example code can be placed into a file (here called "foo"), and diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index f4977357daf2..d76396f2d8ed 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -87,11 +87,11 @@ Return values A seccomp filter may return any of the following values. If multiple filters exist, the return value for the evaluation of a given system call will always use the highest precedent value. (For example, -``SECCOMP_RET_KILL`` will always take precedence.) +``SECCOMP_RET_KILL_THREAD`` will always take precedence.) In precedence order, they are: -``SECCOMP_RET_KILL``: +``SECCOMP_RET_KILL_THREAD``: Results in the task exiting immediately without executing the system call. The exit status of the task (``status & 0x7f``) will be ``SIGSYS``, not ``SIGKILL``. diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index f94433263e4b..5a03f699eb17 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -27,7 +27,8 @@ * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 59cde2ed3b92..95ac54cff00f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL; + return SECCOMP_RET_KILL_THREAD; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,15 +529,17 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 0) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | + SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | + SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, @@ -560,13 +562,13 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: - log = seccomp_actions_logged & SECCOMP_LOG_KILL; + log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; } /* - * Force an audit message to be emitted when the action is RET_KILL, + * Force an audit message to be emitted when the action is RET_KILL_*, * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is * allowed to be logged by the admin. */ @@ -605,7 +607,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -716,7 +718,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ return 0; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ @@ -878,7 +880,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: @@ -1029,19 +1031,20 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ -#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" -static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " - SECCOMP_RET_TRAP_NAME " " - SECCOMP_RET_ERRNO_NAME " " - SECCOMP_RET_TRACE_NAME " " - SECCOMP_RET_LOG_NAME " " - SECCOMP_RET_ALLOW_NAME; +static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_THREAD_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " + SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { u32 log; @@ -1049,7 +1052,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { - { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c index 151ec3f52189..235ce3c49ee9 100644 --- a/samples/seccomp/bpf-direct.c +++ b/samples/seccomp/bpf-direct.c @@ -129,7 +129,7 @@ static int install_filter(void) /* Check that read is only using stdin. */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), /* Check that write is only using stdout */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), @@ -139,7 +139,7 @@ static int install_filter(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), }; struct sock_fprog prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 1d8de9edd858..83dbe79cbe2c 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -44,7 +44,7 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define ALLOW \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define DENY \ - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD) #define JUMP(labels, label) \ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ JUMP_JT, JUMP_JF) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 7372958eccb5..a3ba39a32449 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -68,15 +68,18 @@ #define SECCOMP_MODE_FILTER 2 #endif +#ifndef SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#endif #ifndef SECCOMP_RET_KILL -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ -#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ #endif #ifndef SECCOMP_RET_LOG -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #endif #ifndef SECCOMP_RET_ACTION @@ -2696,7 +2699,7 @@ TEST_SIGNAL(filter_flag_log, SIGSYS) TEST(get_action_avail) { - __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, + __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, SECCOMP_RET_LOG, SECCOMP_RET_ALLOW }; __u32 unknown_action = 0x10000000U; -- cgit From 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:01:39 -0700 Subject: seccomp: Introduce SECCOMP_RET_KILL_PROCESS This introduces the BPF return value for SECCOMP_RET_KILL_PROCESS to kill an entire process. This cannot yet be reached by seccomp, but it changes the default-kill behavior (for unknown return values) from kill-thread to kill-process. Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 18 ++++++++++-------- kernel/seccomp.c | 22 ++++++++++++++++------ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 5a03f699eb17..7e77c92df78a 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -22,18 +22,20 @@ /* * All BPF programs must return a 32-bit value. * The bottom 16-bits are for optional return data. - * The upper 16-bits are ordered from least permissive values to most. + * The upper 16-bits are ordered from least permissive values to most, + * as a signed value (so 0x8000000 is negative). * * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ -#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ -#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ #define SECCOMP_RET_ACTION 0x7fff0000U diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 95ac54cff00f..5c7299b9d953 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL_THREAD; + return SECCOMP_RET_KILL_PROCESS; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,14 +529,16 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL_THREAD (1 << 0) +#define SECCOMP_LOG_KILL_PROCESS (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 1) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | + SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | @@ -563,8 +565,11 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; case SECCOMP_RET_KILL_THREAD: - default: log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; + break; + case SECCOMP_RET_KILL_PROCESS: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; } /* @@ -719,10 +724,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_KILL_PROCESS: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (get_nr_threads(current) == 1) { + if (action == SECCOMP_RET_KILL_PROCESS || + get_nr_threads(current) == 1) { siginfo_t info; /* Show the original registers in the dump. */ @@ -731,7 +738,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - do_exit(SIGSYS); + if (action == SECCOMP_RET_KILL_PROCESS) + do_group_exit(SIGSYS); + else + do_exit(SIGSYS); } unreachable(); -- cgit From 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:12:11 -0700 Subject: seccomp: Implement SECCOMP_RET_KILL_PROCESS action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, SECCOMP_RET_KILL_THREAD (neé SECCOMP_RET_KILL) kills the current thread. There have been a few requests for this to kill the entire process (the thread group). This cannot be just changed (discovered when adding coredump support since coredumping kills the entire process) because there are userspace programs depending on the thread-kill behavior. Instead, implement SECCOMP_RET_KILL_PROCESS, which is 0x80000000, and can be processed as "-1" by the kernel, below the existing RET_KILL that is ABI-set to "0". For userspace, SECCOMP_RET_ACTION_FULL is added to expand the mask to the signed bit. Old userspace using the SECCOMP_RET_ACTION mask will see SECCOMP_RET_KILL_PROCESS as 0 still, but this would only be visible when examining the siginfo in a core dump from a RET_KILL_*, where it will think it was thread-killed instead of process-killed. Attempts to introduce this behavior via other ways (filter flags, seccomp struct flags, masked RET_DATA bits) all come with weird side-effects and baggage. This change preserves the central behavioral expectations of the seccomp filter engine without putting too great a burden on changes needed in userspace to use the new action. The new action is discoverable by userspace through either the new actions_avail sysctl or through the SECCOMP_GET_ACTION_AVAIL seccomp operation. If used without checking for availability, old kernels will treat RET_KILL_PROCESS as RET_KILL_THREAD (since the old mask will produce RET_KILL_THREAD). Cc: Paul Moore Cc: Fabricio Voznika Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 7 ++++++- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 9 +++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index d76396f2d8ed..099c412951d6 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -87,10 +87,15 @@ Return values A seccomp filter may return any of the following values. If multiple filters exist, the return value for the evaluation of a given system call will always use the highest precedent value. (For example, -``SECCOMP_RET_KILL_THREAD`` will always take precedence.) +``SECCOMP_RET_KILL_PROCESS`` will always take precedence.) In precedence order, they are: +``SECCOMP_RET_KILL_PROCESS``: + Results in the entire process exiting immediately without executing + the system call. The exit status of the task (``status & 0x7f``) + will be ``SIGSYS``, not ``SIGKILL``. + ``SECCOMP_RET_KILL_THREAD``: Results in the task exiting immediately without executing the system call. The exit status of the task (``status & 0x7f``) will diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 7e77c92df78a..f6bc1dea3247 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -38,6 +38,7 @@ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ +#define SECCOMP_RET_ACTION_FULL 0xffff0000U #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5c7299b9d953..c24579dfa7a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -181,6 +181,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { @@ -206,7 +207,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { + if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } @@ -650,7 +651,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; - action = filter_ret & SECCOMP_RET_ACTION; + action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: @@ -890,6 +891,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { + case SECCOMP_RET_KILL_PROCESS: case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: @@ -1041,6 +1043,7 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" @@ -1049,6 +1052,7 @@ out: #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_PROCESS_NAME " " SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " @@ -1062,6 +1066,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, -- cgit From f3e1821d9e1cc3fb434d7763001791dcd6720c90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:20:33 -0700 Subject: selftests/seccomp: Test thread vs process killing This verifies that SECCOMP_RET_KILL_PROCESS is higher priority than SECCOMP_RET_KILL_THREAD. (This also moves a bunch of defines up earlier in the file to use them earlier.) Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- tools/testing/selftests/seccomp/seccomp_bpf.c | 228 +++++++++++++++++++------- 1 file changed, 168 insertions(+), 60 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index a3ba39a32449..0683cd543cd5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -68,7 +68,17 @@ #define SECCOMP_MODE_FILTER 2 #endif -#ifndef SECCOMP_RET_KILL_THREAD +#ifndef SECCOMP_RET_ALLOW +struct seccomp_data { + int nr; + __u32 arch; + __u64 instruction_pointer; + __u64 args[6]; +}; +#endif + +#ifndef SECCOMP_RET_KILL_PROCESS +#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */ #define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ #endif #ifndef SECCOMP_RET_KILL @@ -82,17 +92,53 @@ #define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #endif -#ifndef SECCOMP_RET_ACTION -/* Masks for the return value sections. */ -#define SECCOMP_RET_ACTION 0x7fff0000U -#define SECCOMP_RET_DATA 0x0000ffffU +#ifndef __NR_seccomp +# if defined(__i386__) +# define __NR_seccomp 354 +# elif defined(__x86_64__) +# define __NR_seccomp 317 +# elif defined(__arm__) +# define __NR_seccomp 383 +# elif defined(__aarch64__) +# define __NR_seccomp 277 +# elif defined(__hppa__) +# define __NR_seccomp 338 +# elif defined(__powerpc__) +# define __NR_seccomp 358 +# elif defined(__s390__) +# define __NR_seccomp 348 +# else +# warning "seccomp syscall number unknown for this architecture" +# define __NR_seccomp 0xffff +# endif +#endif -struct seccomp_data { - int nr; - __u32 arch; - __u64 instruction_pointer; - __u64 args[6]; -}; +#ifndef SECCOMP_SET_MODE_STRICT +#define SECCOMP_SET_MODE_STRICT 0 +#endif + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +#ifndef SECCOMP_GET_ACTION_AVAIL +#define SECCOMP_GET_ACTION_AVAIL 2 +#endif + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + +#ifndef SECCOMP_FILTER_FLAG_LOG +#define SECCOMP_FILTER_FLAG_LOG 2 +#endif + +#ifndef seccomp +int seccomp(unsigned int op, unsigned int flags, void *args) +{ + errno = 0; + return syscall(__NR_seccomp, op, flags, args); +} #endif #if __BYTE_ORDER == __LITTLE_ENDIAN @@ -550,6 +596,117 @@ TEST_SIGNAL(KILL_one_arg_six, SIGSYS) close(fd); } +/* This is a thread task to die via seccomp filter violation. */ +void *kill_thread(void *data) +{ + bool die = (bool)data; + + if (die) { + prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + return (void *)SIBLING_EXIT_FAILURE; + } + + return (void *)SIBLING_EXIT_UNKILLED; +} + +/* Prepare a thread that will kill itself or both of us. */ +void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process) +{ + pthread_t thread; + void *status; + /* Kill only when calling __NR_prctl. */ + struct sock_filter filter_thread[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog_thread = { + .len = (unsigned short)ARRAY_SIZE(filter_thread), + .filter = filter_thread, + }; + struct sock_filter filter_process[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog_process = { + .len = (unsigned short)ARRAY_SIZE(filter_process), + .filter = filter_process, + }; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, + kill_process ? &prog_process : &prog_thread)); + + /* + * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS + * flag cannot be downgraded by a new filter. + */ + ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread)); + + /* Start a thread that will exit immediately. */ + ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false)); + ASSERT_EQ(0, pthread_join(thread, &status)); + ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status); + + /* Start a thread that will die immediately. */ + ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true)); + ASSERT_EQ(0, pthread_join(thread, &status)); + ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status); + + /* + * If we get here, only the spawned thread died. Let the parent know + * the whole process didn't die (i.e. this thread, the spawner, + * stayed running). + */ + exit(42); +} + +TEST(KILL_thread) +{ + int status; + pid_t child_pid; + + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + kill_thread_or_group(_metadata, false); + _exit(38); + } + + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + + /* If only the thread was killed, we'll see exit 42. */ + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(42, WEXITSTATUS(status)); +} + +TEST(KILL_process) +{ + int status; + pid_t child_pid; + + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + kill_thread_or_group(_metadata, true); + _exit(38); + } + + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + + /* If the entire process was killed, we'll see SIGSYS. */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(SIGSYS, WTERMSIG(status)); +} + /* TODO(wad) add 64-bit versus 32-bit arg tests. */ TEST(arg_out_of_range) { @@ -1800,55 +1957,6 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) EXPECT_NE(self->mypid, syscall(__NR_getpid)); } -#ifndef __NR_seccomp -# if defined(__i386__) -# define __NR_seccomp 354 -# elif defined(__x86_64__) -# define __NR_seccomp 317 -# elif defined(__arm__) -# define __NR_seccomp 383 -# elif defined(__aarch64__) -# define __NR_seccomp 277 -# elif defined(__hppa__) -# define __NR_seccomp 338 -# elif defined(__powerpc__) -# define __NR_seccomp 358 -# elif defined(__s390__) -# define __NR_seccomp 348 -# else -# warning "seccomp syscall number unknown for this architecture" -# define __NR_seccomp 0xffff -# endif -#endif - -#ifndef SECCOMP_SET_MODE_STRICT -#define SECCOMP_SET_MODE_STRICT 0 -#endif - -#ifndef SECCOMP_SET_MODE_FILTER -#define SECCOMP_SET_MODE_FILTER 1 -#endif - -#ifndef SECCOMP_GET_ACTION_AVAIL -#define SECCOMP_GET_ACTION_AVAIL 2 -#endif - -#ifndef SECCOMP_FILTER_FLAG_TSYNC -#define SECCOMP_FILTER_FLAG_TSYNC 1 -#endif - -#ifndef SECCOMP_FILTER_FLAG_LOG -#define SECCOMP_FILTER_FLAG_LOG 2 -#endif - -#ifndef seccomp -int seccomp(unsigned int op, unsigned int flags, void *args) -{ - errno = 0; - return syscall(__NR_seccomp, op, flags, args); -} -#endif - TEST(seccomp_syscall) { struct sock_filter filter[] = { -- cgit From 6849243bf4c6155151b294e9f0e0dc9540d6f083 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Aug 2017 20:26:57 -0700 Subject: samples: Unrename SECCOMP_RET_KILL Since samples can still be built before header installs, avoid the cosmetic renaming of SECCOMP_RET_KILL to avoid build failures in -next. Cc: Stephen Rothwell Signed-off-by: Kees Cook --- samples/seccomp/bpf-direct.c | 4 ++-- samples/seccomp/bpf-helper.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c index 235ce3c49ee9..151ec3f52189 100644 --- a/samples/seccomp/bpf-direct.c +++ b/samples/seccomp/bpf-direct.c @@ -129,7 +129,7 @@ static int install_filter(void) /* Check that read is only using stdin. */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), /* Check that write is only using stdout */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), @@ -139,7 +139,7 @@ static int install_filter(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), }; struct sock_fprog prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 83dbe79cbe2c..1d8de9edd858 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -44,7 +44,7 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define ALLOW \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define DENY \ - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD) + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) #define JUMP(labels, label) \ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ JUMP_JT, JUMP_JF) -- cgit From a4e89ffb59235fd11d27107dea3efa4562ac0a12 Mon Sep 17 00:00:00 2001 From: Matt Weber Date: Wed, 28 Jun 2017 11:14:29 -0500 Subject: powerpc/e6500: Update machine check for L1D cache err This patch updates the machine check handler of Linux kernel to handle the e6500 architecture case. In e6500 core, L1 Data Cache Write Shadow Mode (DCWS) register is not implemented but L1 data cache always runs in write shadow mode. So, on L1 data cache parity errors, hardware will automatically invalidate the data cache but will still log a machine check interrupt. Signed-off-by: Ronak Desai Signed-off-by: Matthew Weber Signed-off-by: Scott Wood --- arch/powerpc/kernel/traps.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 675d5d2bfcde..410352acfa38 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -393,6 +393,7 @@ static inline int check_io_access(struct pt_regs *regs) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); + unsigned long pvr = mfspr(SPRN_PVR); unsigned long reason = mcsr; int recoverable = 1; @@ -434,8 +435,15 @@ int machine_check_e500mc(struct pt_regs *regs) * may still get logged and cause a machine check. We should * only treat the non-write shadow case as non-recoverable. */ - if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) - recoverable = 0; + /* On e6500 core, L1 DCWS (Data cache write shadow mode) bit + * is not implemented but L1 data cache always runs in write + * shadow mode. Hence on data cache parity errors HW will + * automatically invalidate the L1 Data Cache. + */ + if (PVR_VER(pvr) != PVR_VER_E6500) { + if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) + recoverable = 0; + } } if (reason & MCSR_L2MMU_MHIT) { -- cgit From 50dad5fb59d12dc9a5a0e96073ee1e5857ac45a2 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Wed, 30 Aug 2017 00:33:35 +0530 Subject: drm/amdkfd: remove memset before memcpy calling memcpy immediately after memset with the same region of memory makes memset redundant. Signed-off-by: Himanshu Jha Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 1cae95e2b13a..03bec765b03d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -143,7 +143,6 @@ int pqm_create_queue(struct process_queue_manager *pqm, int num_queues = 0; struct queue *cur; - memset(&q_properties, 0, sizeof(struct queue_properties)); memcpy(&q_properties, properties, sizeof(struct queue_properties)); q = NULL; kq = NULL; -- cgit From 1fabbf781167052f4480bdcc627378a27e78a559 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 2 Sep 2017 15:00:25 +0300 Subject: drm/amdkfd: pass queue's mqd when destroying mqd In VI, the destroy mqd function needs to inquire fields present in the mqd structure. That's why we need to pass it to that function instead of NULL. Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 681b639f5133..0649dd43e780 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -183,7 +183,7 @@ static void uninitialize(struct kernel_queue *kq) { if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, - NULL, + kq->queue->mqd, false, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, -- cgit From 3664847d95e60a9a943858b7800f8484669740fc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 25 Aug 2017 10:40:02 -0700 Subject: md/raid5: fix a race condition in stripe batch We have a race condition in below scenario, say have 3 continuous stripes, sh1, sh2 and sh3, sh1 is the stripe_head of sh2 and sh3: CPU1 CPU2 CPU3 handle_stripe(sh3) stripe_add_to_batch_list(sh3) -> lock(sh2, sh3) -> lock batch_lock(sh1) -> add sh3 to batch_list of sh1 -> unlock batch_lock(sh1) clear_batch_ready(sh1) -> lock(sh1) and batch_lock(sh1) -> clear STRIPE_BATCH_READY for all stripes in batch_list -> unlock(sh1) and batch_lock(sh1) ->clear_batch_ready(sh3) -->test_and_clear_bit(STRIPE_BATCH_READY, sh3) --->return 0 as sh->batch == NULL -> sh3->batch_head = sh1 -> unlock (sh2, sh3) In CPU1, handle_stripe will continue handle sh3 even it's in batch stripe list of sh1. By moving sh3->batch_head assignment in to batch_lock, we make it impossible to clear STRIPE_BATCH_READY before batch_head is set. Thanks Stephane for helping debug this tricky issue. Reported-and-tested-by: Stephane Thiell Cc: stable@vger.kernel.org (v4.1+) Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 049a958d3c1e..90414a4e0d49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -811,6 +811,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -818,8 +826,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; -- cgit From 184a09eb9a2fe425e49c9538f1604b05ed33cfef Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Wed, 6 Sep 2017 11:02:35 +0800 Subject: md/raid5: preserve STRIPE_ON_UNPLUG_LIST in break_stripe_batch_list In release_stripe_plug(), if a stripe_head has its STRIPE_ON_UNPLUG_LIST set, it indicates that this stripe_head is already in the raid5_plug_cb list and release_stripe() would be called instead to drop a reference count. Otherwise, the STRIPE_ON_UNPLUG_LIST bit would be set for this stripe_head and it will get queued into the raid5_plug_cb list. Since break_stripe_batch_list() did not preserve STRIPE_ON_UNPLUG_LIST, A stripe could be re-added to plug list while it is still on that list in the following situation. If stripe_head A is added to another stripe_head B's batch list, in this case A will have its batch_head != NULL and be added into the plug list. After that, stripe_head B gets handled and called break_stripe_batch_list() to reset all the batched stripe_head(including A which is still on the plug list)'s state and reset their batch_head to NULL. Before the plug list gets processed, if there is another write request comes in and get stripe_head A, A will have its batch_head == NULL (cleared by calling break_stripe_batch_list() on B) and be added to plug list once again. Signed-off-by: Dennis Yang Cc: stable@vger.kernel.org (v4.1+) Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 90414a4e0d49..34c01e83395a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4605,7 +4605,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; -- cgit From 01f5bbd17a8066b58dba9b5049fad504bce67322 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 7 Sep 2017 10:40:35 +0300 Subject: mmc: block: Fix incorrectly initialized requests mmc_init_request() depends on card->bouncesz so it must be calculated before blk_init_allocated_queue() starts allocating requests. Reported-by: Seraphime Kirkovski Fixes: 304419d8a7e9 ("mmc: core: Allocate per-request data using the..") Signed-off-by: Adrian Hunter Tested-by: Seraphime Kirkovski Cc: Signed-off-by: Ulf Hansson Tested-by: Pavel Machek --- drivers/mmc/core/queue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index affa7370ba82..74c663b1c0a7 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -242,6 +242,12 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; + /* + * mmc_init_request() depends on card->bouncesz so it must be calculated + * before blk_init_allocated_queue() starts allocating requests. + */ + card->bouncesz = mmc_queue_calc_bouncesz(host); + mq->card = card; mq->queue = blk_alloc_queue(GFP_KERNEL); if (!mq->queue) @@ -265,7 +271,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); - card->bouncesz = mmc_queue_calc_bouncesz(host); if (card->bouncesz) { blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); blk_queue_max_segments(mq->queue, card->bouncesz / 512); -- cgit From b4f146f5faf85d21f5cd414531c3ced9c1b61e3c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 5 Sep 2017 20:27:50 +0200 Subject: mmc: host: fix typo after MMC_DEBUG move MMC_DEBUG was moved and one letter got strangely capitalized. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 02179ed2a40d..8c15637178ff 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -5,7 +5,7 @@ comment "MMC/SD/SDIO Host Controller Drivers" config MMC_DEBUG - bool "MMC host drivers debugginG" + bool "MMC host drivers debugging" depends on MMC != n help This is an option for use by developers; most people should -- cgit From b917c6d18c031cfce11ec35139033845f205b1d0 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Thu, 7 Sep 2017 13:24:17 +0200 Subject: mmc: cavium: Fix use-after-free in of_platform_device_destroy KASAN reported the following: [ 19.338655] ================================================================== [ 19.345946] BUG: KASAN: use-after-free in of_platform_device_destroy+0x88/0x100 [ 19.345966] Read of size 8 at addr fffffe01aa6f1468 by task systemd-udevd/264 [ 19.345983] CPU: 1 PID: 264 Comm: systemd-udevd Not tainted 4.13.0-jang+ #737 [ 19.345989] Hardware name: Cavium ThunderX CN81XX board (DT) [ 19.345995] Call trace: [ 19.346013] [] dump_backtrace+0x0/0x368 [ 19.346026] [] show_stack+0x24/0x30 [ 19.346040] [] dump_stack+0xa4/0xc8 [ 19.346057] [] print_address_description+0x68/0x258 [ 19.346070] [] kasan_report+0x238/0x2f8 [ 19.346082] [] __asan_load8+0x88/0xb8 [ 19.346098] [] of_platform_device_destroy+0x88/0x100 [ 19.346131] [] thunder_mmc_probe+0x314/0x550 [thunderx_mmc] [ 19.346147] [] pci_device_probe+0x158/0x1f8 [ 19.346162] [] driver_probe_device+0x394/0x5f8 [ 19.346174] [] __driver_attach+0x154/0x158 [ 19.346185] [] bus_for_each_dev+0xdc/0x140 [ 19.346196] [] driver_attach+0x38/0x48 [ 19.346207] [] bus_add_driver+0x290/0x3c8 [ 19.346219] [] driver_register+0xbc/0x1a0 [ 19.346232] [] __pci_register_driver+0xc4/0xd8 [ 19.346260] [] thunder_mmc_driver_init+0x24/0x10000 [thunderx_mmc] [ 19.346273] [] do_one_initcall+0x98/0x1c0 [ 19.346289] [] do_init_module+0xe0/0x2cc [ 19.346303] [] load_module+0x3238/0x35c0 [ 19.346318] [] SyS_finit_module+0x190/0x1a0 [ 19.346329] [] __sys_trace_return+0x0/0x4 This is caused by: platform_device_register() -> platform_device_unregister(to_platform_device(dev)) freeing struct device -> of_node_clear_flag(dev->of_node, ...) writing to the freed device The issue is solved by increasing the reference count before calling of_platform_device_destroy() so freeing the device is postponed after the call. Fixes: 8fb83b142823 ("mmc: cavium: Fix probing race with regulator") Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-thunderx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-thunderx.c b/drivers/mmc/host/cavium-thunderx.c index b9cc95998799..eee08d81b242 100644 --- a/drivers/mmc/host/cavium-thunderx.c +++ b/drivers/mmc/host/cavium-thunderx.c @@ -7,6 +7,7 @@ * * Copyright (C) 2016 Cavium Inc. */ +#include #include #include #include @@ -149,8 +150,11 @@ error: for (i = 0; i < CAVIUM_MAX_MMC; i++) { if (host->slot[i]) cvm_mmc_of_slot_remove(host->slot[i]); - if (host->slot_pdev[i]) + if (host->slot_pdev[i]) { + get_device(&host->slot_pdev[i]->dev); of_platform_device_destroy(&host->slot_pdev[i]->dev, NULL); + put_device(&host->slot_pdev[i]->dev); + } } clk_disable_unprepare(host->clk); return ret; -- cgit From bfaa1ce809bbcd12b1399409ab1dbf0cdaba6e27 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Sep 2017 15:13:33 +0100 Subject: drm/amdkfd: check for null dev to avoid a null pointer dereference The call to kfd_device_by_id can potentially return null, so check that dev is null and return with -EINVAL to avoid a null pointer dereference. Detected by CoverityScan CID#1454629 ("Dereference null return value") Fixes: 5d71dbc3a588 ("drm/amdkfd: Implement image tiling mode support v2") Signed-off-by: Colin Ian King Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index e4a8c2e52cb2..660b3fbade41 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -892,6 +892,8 @@ static int kfd_ioctl_get_tile_config(struct file *filep, int err = 0; dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; dev->kfd2kgd->get_tile_config(dev->kgd, &config); -- cgit From b0e07da3f5c8d069d186a7983ff64eaebf2ea230 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 11 Sep 2017 11:39:50 +0200 Subject: qxl: fix primary surface handling The atomic conversion of the qxl driver didn't got the primary surface handling completely right. It works in the common simple cases, but fails for example when changing the display resolution using xrandr or in multihead setups. The rules are simple: There is one primary surface. Before defining a new one you have to destroy the old one. This patch makes qxl_primary_atomic_update() destroy the primary surface before defining a new one. It fixes is_primary flag updates. It adds is_primary checks so we don't try to update the primary surface in case it already has the state we want it being in. Fixes: 3538e80a869b ("drm: qxl: Atomic phase 1: Implement mode_set_nofb") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102338 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196777 Signed-off-by: Gerd Hoffmann Reviewed-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170911093950.22401-1-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_display.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index 03fe182203ce..7babdd8f20fd 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -512,23 +512,25 @@ static void qxl_primary_atomic_update(struct drm_plane *plane, .y2 = qfb->base.height }; - if (!old_state->fb) { - qxl_io_log(qdev, - "create primary fb: %dx%d,%d,%d\n", - bo->surf.width, bo->surf.height, - bo->surf.stride, bo->surf.format); + if (old_state->fb) { + qfb_old = to_qxl_framebuffer(old_state->fb); + bo_old = gem_to_qxl_bo(qfb_old->obj); + } else { + bo_old = NULL; + } - qxl_io_create_primary(qdev, 0, bo); - bo->is_primary = true; + if (bo == bo_old) return; - } else { - qfb_old = to_qxl_framebuffer(old_state->fb); - bo_old = gem_to_qxl_bo(qfb_old->obj); + if (bo_old && bo_old->is_primary) { + qxl_io_destroy_primary(qdev); bo_old->is_primary = false; } - bo->is_primary = true; + if (!bo->is_primary) { + qxl_io_create_primary(qdev, 0, bo); + bo->is_primary = true; + } qxl_draw_dirty_fb(qdev, qfb, bo, 0, 0, &norect, 1, 1); } @@ -537,13 +539,15 @@ static void qxl_primary_atomic_disable(struct drm_plane *plane, { struct qxl_device *qdev = plane->dev->dev_private; - if (old_state->fb) - { struct qxl_framebuffer *qfb = + if (old_state->fb) { + struct qxl_framebuffer *qfb = to_qxl_framebuffer(old_state->fb); struct qxl_bo *bo = gem_to_qxl_bo(qfb->obj); - qxl_io_destroy_primary(qdev); - bo->is_primary = false; + if (bo->is_primary) { + qxl_io_destroy_primary(qdev); + bo->is_primary = false; + } } } -- cgit From bf2afee14e07de16d3cafc67edbfc2a3cc65e4bc Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Sep 2017 10:37:35 +1000 Subject: cifs: check rsp for NULL before dereferencing in SMB2_open In SMB2_open there are several paths where the SendReceive2 call will return an error before it sets rsp_iov.iov_base thus leaving iov_base uninitialized. Thus we need to check rsp before we dereference it in the call to get_rfc1002_length(). A report of this issue was previously reported in http://www.spinics.net/lists/linux-cifs/msg12846.html RH-bugzilla : 1476151 Version 2 : * Lets properly initialize rsp_iov before we use it. Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky . Signed-off-by: Steve French Reported-by: Xiaoli Feng CC: Stable --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5531e7ee1210..69a751b038ab 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1634,7 +1634,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; struct kvec iov[4]; - struct kvec rsp_iov; + struct kvec rsp_iov = {NULL, 0}; int resp_buftype; int uni_path_len; __le16 *copy_path = NULL; @@ -1763,7 +1763,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); - if (err_buf) + if (err_buf && rsp) *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, GFP_KERNEL); goto creat_exit; -- cgit From 5a642e6bc49f59922e19ebd639e74f72753fc77b Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 8 Sep 2017 16:24:32 +0200 Subject: etnaviv: fix submit error path If the gpu submit fails, bail out to avoid accessing a potentially unititalized fence. CC: stable@vger.kernel.org #4.12+ Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index 6463fc2c736f..b95362186f9c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -445,8 +445,10 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, cmdbuf->user_size = ALIGN(args->stream_size, 8); ret = etnaviv_gpu_submit(gpu, submit, cmdbuf); - if (ret == 0) - cmdbuf = NULL; + if (ret) + goto out; + + cmdbuf = NULL; if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) { /* -- cgit From 518417525f3652c12fb5fad6da4ade66c0072fa3 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 11 Sep 2017 15:29:31 +0200 Subject: etnaviv: fix gem object list corruption All manipulations of the gem_object list need to be protected by the list mutex, as GEM objects can be created and freed in parallel. This fixes a kernel memory corruption. CC: stable@vger.kernel.org Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 9a3bea738330..87b95eeedd9e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -551,12 +551,15 @@ static const struct etnaviv_gem_ops etnaviv_gem_shmem_ops = { void etnaviv_gem_free_object(struct drm_gem_object *obj) { struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); + struct etnaviv_drm_private *priv = obj->dev->dev_private; struct etnaviv_vram_mapping *mapping, *tmp; /* object should not be active */ WARN_ON(is_active(etnaviv_obj)); + mutex_lock(&priv->gem_lock); list_del(&etnaviv_obj->gem_node); + mutex_unlock(&priv->gem_lock); list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list, obj_node) { -- cgit From fc3100d64f0ae383ae8d845989103da06d62763b Mon Sep 17 00:00:00 2001 From: Pu Hou Date: Tue, 5 Sep 2017 05:17:24 +0200 Subject: s390/perf: fix bug when creating per-thread event A per-thread event could not be created correctly like below: perf record --per-thread -e rB0000 -- sleep 1 Error: The sys_perf_event_open() syscall returned with 19 (No such device) for event (rB0000). /bin/dmesg may provide additional information. No CONFIG_PERF_EVENTS=y kernel support configured? This bug was introduced by: commit c311c797998c1e70eade463dd60b843da4f1a203 Author: Alexey Dobriyan Date: Mon May 8 15:56:15 2017 -0700 cpumask: make "nr_cpumask_bits" unsigned If a per-thread event is not attached to any CPU, the cpu field in struct perf_event is -1. The above commit converts the CPU number to unsigned int, which result in an illegal CPU number. Fixes: c311c797998c ("cpumask: make "nr_cpumask_bits" unsigned") Cc: # v4.12+ Cc: Alexey Dobriyan Acked-by: Heiko Carstens Signed-off-by: Pu Hou Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index c1bf75ffb875..7e1e40323b78 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,9 +823,12 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if ((unsigned int)event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) - return -ENODEV; + if (event->cpu >= 0) { + if ((unsigned int)event->cpu >= nr_cpumask_bits) + return -ENODEV; + if (!cpu_online(event->cpu)) + return -ENODEV; + } /* Force reset of idle/hv excludes regardless of what the * user requested. -- cgit From 673514aff5d797b0cdb3f09097ae0e253b5667c3 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 12 Sep 2017 17:22:42 +0200 Subject: s390/dasd: fix race during dasd initialization Fix a panic in blk_mq_hctx_has_pending() that is caused by a racy call to blk_mq_run_hw_queues in a dasd function that might get called with the request queue not yet initialized during initialization. Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index ea19b4ff87a2..29f35e29d480 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1644,7 +1644,9 @@ void dasd_generic_handle_state_change(struct dasd_device *device) dasd_schedule_device_bh(device); if (device->block) { dasd_schedule_block_bh(device->block); - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->request_queue) + blk_mq_run_hw_queues(device->block->request_queue, + true); } } EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); @@ -3759,7 +3761,9 @@ int dasd_generic_path_operational(struct dasd_device *device) dasd_schedule_device_bh(device); if (device->block) { dasd_schedule_block_bh(device->block); - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->request_queue) + blk_mq_run_hw_queues(device->block->request_queue, + true); } if (!device->stopped) @@ -4025,7 +4029,9 @@ int dasd_generic_restore_device(struct ccw_device *cdev) if (device->block) { dasd_schedule_block_bh(device->block); - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->request_queue) + blk_mq_run_hw_queues(device->block->request_queue, + true); } clear_bit(DASD_FLAG_SUSPENDED, &device->flags); -- cgit From 4cf97582b46f123a4b7cd88d999f1806c2eb4093 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 11 Sep 2017 17:43:56 +0200 Subject: drm/amdgpu: revert tile table update for oland MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several users have complained that the tile table update broke Oland support. Despite several attempts to fix it, the root cause is still unknown at this point and no solution is available. As it is not acceptable to leave a known regression breaking a major functionality in the kernel for several releases, let's just reverse this optimization for now. It can be implemented again later if and only if the breakage is understood and fixed. As there were no complaints for Hainan so far, only the Oland part of the offending commit is reverted. Optimization is preserved on Hainan, so this commit isn't an actual revert of the original. This fixes bug #194761: https://bugzilla.kernel.org/show_bug.cgi?id=194761 Reviewed-by: Marek Olšák Signed-off-by: Jean Delvare Fixes: f8d9422ef80c ("drm/amdgpu: update tile table for oland/hainan") Cc: Flora Cui Cc: Junwei Zhang Cc: Alex Deucher Cc: Marek Olšák Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 189 +++++++++++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index d228f5a99044..dbbe986f90f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -636,7 +636,194 @@ static void gfx_v6_0_tiling_mode_table_init(struct amdgpu_device *adev) NUM_BANKS(ADDR_SURF_2_BANK); for (reg_offset = 0; reg_offset < num_tile_mode_states; reg_offset++) WREG32(mmGB_TILE_MODE0 + reg_offset, tilemode[reg_offset]); - } else if (adev->asic_type == CHIP_OLAND || adev->asic_type == CHIP_HAINAN) { + } else if (adev->asic_type == CHIP_OLAND) { + tilemode[0] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[1] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_128B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[2] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[3] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_128B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[4] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[5] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[6] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[7] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[8] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_LINEAR_ALIGNED) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[9] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[10] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[11] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[12] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[13] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[14] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[15] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[16] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[17] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[21] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_2) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[22] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[23] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[24] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[25] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_1KB) | + NUM_BANKS(ADDR_SURF_8_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_1); + for (reg_offset = 0; reg_offset < num_tile_mode_states; reg_offset++) + WREG32(mmGB_TILE_MODE0 + reg_offset, tilemode[reg_offset]); + } else if (adev->asic_type == CHIP_HAINAN) { tilemode[0] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | ARRAY_MODE(ARRAY_2D_TILED_THIN1) | PIPE_CONFIG(ADDR_SURF_P2) | -- cgit From b468b6a4969f9bdddb31d484f151bfa03fbee767 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:36 +0200 Subject: scsi: scsi_transport_fc: fix NULL pointer dereference in fc_bsg_job_timeout bsg-lib now embeddeds the job structure into the request, and req->special can't be used anymore. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 3c6bc0081fcb..ba9d70f8a6a1 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3571,7 +3571,7 @@ fc_vport_sched_delete(struct work_struct *work) static enum blk_eh_timer_return fc_bsg_job_timeout(struct request *req) { - struct bsg_job *job = (void *) req->special; + struct bsg_job *job = blk_mq_rq_to_pdu(req); struct Scsi_Host *shost = fc_bsg_to_shost(job); struct fc_rport *rport = fc_bsg_to_rport(job); struct fc_internal *i = to_fc_internal(shost->transportt); -- cgit From fd536998314a9dc54f384fa16287321edb81602a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Sep 2017 22:00:57 +0200 Subject: scsi: acornscsi: fix build error A cleanup patch introduced a fatal typo from inbalanced curly braces: drivers/scsi/arm/acornscsi.c: In function 'acornscsi_host_reset': drivers/scsi/arm/acornscsi.c:2773:1: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] drivers/scsi/arm/acornscsi.c:2795:12: error: invalid storage class for function 'acornscsi_show_info' static int acornscsi_show_info(struct seq_file *m, struct Scsi_Host *instance) The same patch incorrectly changed the argument type of the reset handler, as shown by this warning: drivers/scsi/arm/acornscsi.c:2888:27: error: initialization of 'int (*)(struct scsi_cmnd *)' from incompatible pointer type 'int (*)(struct Scsi_Host *)' [-Werror=incompatible-pointer-types] .eh_host_reset_handler = acornscsi_host_reset, This removes one the extraneous opening brace and reverts the argument type change. [mkp: fixed checkpatch complaint] Fixes: 74fa80ee3fae ("scsi: acornscsi: move bus reset to host reset") Signed-off-by: Arnd Bergmann Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/arm/acornscsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/arm/acornscsi.c b/drivers/scsi/arm/acornscsi.c index 690816f3c6af..421fe869a11e 100644 --- a/drivers/scsi/arm/acornscsi.c +++ b/drivers/scsi/arm/acornscsi.c @@ -2725,9 +2725,9 @@ int acornscsi_abort(struct scsi_cmnd *SCpnt) * Params : SCpnt - command causing reset * Returns : one of SCSI_RESET_ macros */ -int acornscsi_host_reset(struct Scsi_Host *shpnt) +int acornscsi_host_reset(struct scsi_cmnd *SCpnt) { - AS_Host *host = (AS_Host *)shpnt->hostdata; + AS_Host *host = (AS_Host *)SCpnt->device->host->hostdata; struct scsi_cmnd *SCptr; host->stats.resets += 1; @@ -2741,7 +2741,7 @@ int acornscsi_host_reset(struct Scsi_Host *shpnt) printk(KERN_WARNING "acornscsi_reset: "); print_sbic_status(asr, ssr, host->scsi.phase); - for (devidx = 0; devidx < 9; devidx ++) { + for (devidx = 0; devidx < 9; devidx++) acornscsi_dumplog(host, devidx); } #endif -- cgit From e785fa0a164aa11001cba931367c7f94ffaff888 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 13 Sep 2017 00:21:21 +0200 Subject: nl80211: check for the required netlink attributes presence nl80211_set_rekey_data() does not check if the required attributes NL80211_REKEY_DATA_{REPLAY_CTR,KEK,KCK} are present when processing NL80211_CMD_SET_REKEY_OFFLOAD request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attributes presence. This patch is based on the patch by bo Zhang. This fixes CVE-2017-12153. References: https://bugzilla.redhat.com/show_bug.cgi?id=1491046 Fixes: e5497d766ad ("cfg80211/nl80211: support GTK rekey offload") Cc: # v3.1-rc1 Reported-by: bo Zhang Signed-off-by: Vladis Dronov Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8023f480b..fbd5593e88cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10903,6 +10903,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) -- cgit From f7f3dc00f61261cdc9ccd8b886f21bc4dffd6fd9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Sep 2017 19:08:21 +0200 Subject: x86/cpu/AMD: Fix erratum 1076 (CPB bit) CPUID Fn8000_0007_EDX[CPB] is wrongly 0 on models up to B1. But they do support CPB (AMD's Core Performance Boosting cpufreq CPU feature), so fix that. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sherry Hurwitz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170907170821.16021-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9862e2cd6d93..d58184b7cd44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -763,6 +763,16 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } } +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + /* + * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects + * all up to and including B1. + */ + if (c->x86_model <= 1 && c->x86_mask <= 1) + set_cpu_cap(c, X86_FEATURE_CPB); +} + static void init_amd(struct cpuinfo_x86 *c) { early_init_amd(c); @@ -791,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x17: init_amd_zn(c); break; } /* Enable workaround for FXSAVE leak */ -- cgit From 0998b7a0befdf6e734032895ee639a5e6f88cc3f Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Thu, 14 Sep 2017 08:01:38 +0200 Subject: objtool: Fix memory leak in elf_create_rela_section() Let's free the allocated char array 'relaname' before returning, in order to avoid leaking memory. Signed-off-by: Martin Kepplinger Acked-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mingo.kernel.org@gmail.com Link: http://lkml.kernel.org/r/20170914060138.26472-1-martink@posteo.de Signed-off-by: Ingo Molnar --- tools/objtool/elf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 6e9f980a7d26..1e89a5f8bfc9 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -508,6 +508,7 @@ struct section *elf_create_rela_section(struct elf *elf, struct section *base) strcat(relaname, base->name); sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0); + free(relaname); if (!sec) return NULL; -- cgit From df968c9329f6e5cf3596a0a54adb6f749747a746 Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Fri, 15 Sep 2017 02:15:05 -0500 Subject: objtool: Do not retrieve data from empty sections Binutils 2.29-9 in Debian return an error when elf_getdata is invoked on empty section (.note.GNU-stack in all kernel files), causing immediate failure of kernel build with: elf_getdata: can't manipulate null section As nothing is done with sections that have zero size, just do not retrieve their data at all. Signed-off-by: Petr Vandrovec Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2ce30a44349065b70d0f00e71e286dc0cbe745e6.1505459652.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/elf.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 1e89a5f8bfc9..b4cd8bc62521 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -175,19 +175,20 @@ static int read_sections(struct elf *elf) return -1; } - sec->data = elf_getdata(s, NULL); - if (!sec->data) { - WARN_ELF("elf_getdata"); - return -1; - } - - if (sec->data->d_off != 0 || - sec->data->d_size != sec->sh.sh_size) { - WARN("unexpected data attributes for %s", sec->name); - return -1; + if (sec->sh.sh_size != 0) { + sec->data = elf_getdata(s, NULL); + if (!sec->data) { + WARN_ELF("elf_getdata"); + return -1; + } + if (sec->data->d_off != 0 || + sec->data->d_size != sec->sh.sh_size) { + WARN("unexpected data attributes for %s", + sec->name); + return -1; + } } - - sec->len = sec->data->d_size; + sec->len = sec->sh.sh_size; } /* sanity check, one more call to elf_nextscn() should return NULL */ -- cgit From 97dab2ae7e8473a821f72a039ead0f36b12ba22d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 15 Sep 2017 02:17:11 -0500 Subject: objtool: Fix object file corruption Arnd Bergmann reported that a randconfig build was failing with the following link error: built-in.o: member arch/x86/kernel/time.o in archive is not an object It turns out the link failed because the time.o file had been corrupted by objtool: nm: arch/x86/kernel/time.o: File format not recognized In certain rare cases, when a .o file's ORC table is very small, the .data section size doesn't change because it's page aligned. Because all the existing sections haven't changed size, libelf doesn't detect any section header changes, and so it doesn't update the section header table properly. Instead it writes junk in the section header entries for the new ORC sections. Make sure libelf properly updates the section header table by setting the ELF_F_DIRTY flag in the top level elf struct. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 627fce14809b ("objtool: Add ORC unwind table generation") Link: http://lkml.kernel.org/r/e650fd0f2d8a209d1409a9785deb101fdaed55fb.1505459813.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/elf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index b4cd8bc62521..24460155c82c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -563,6 +563,7 @@ int elf_write(struct elf *elf) struct section *sec; Elf_Scn *s; + /* Update section headers for changed sections: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->changed) { s = elf_getscn(elf->elf, sec->idx); @@ -570,13 +571,17 @@ int elf_write(struct elf *elf) WARN_ELF("elf_getscn"); return -1; } - if (!gelf_update_shdr (s, &sec->sh)) { + if (!gelf_update_shdr(s, &sec->sh)) { WARN_ELF("gelf_update_shdr"); return -1; } } } + /* Make sure the new section header entries get updated properly. */ + elf_flagelf(elf->elf, ELF_C_SET, ELF_F_DIRTY); + + /* Write all changes to the file. */ if (elf_update(elf->elf, ELF_C_WRITE) < 0) { WARN_ELF("elf_update"); return -1; -- cgit From 820608548737e315c6f93e3099b4e65bde062334 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 15 Sep 2017 11:55:27 -0400 Subject: drm/radeon: disable hard reset in hibernate for APUs Fixes a hibernation regression on APUs. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=191571 Fixes: 274ad65c9d02bdc (drm/radeon: hard reset r600 and newer GPU when hibernating.) Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 997131d58c7f..ffc10cadcf34 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1663,7 +1663,7 @@ int radeon_suspend_kms(struct drm_device *dev, bool suspend, radeon_agp_suspend(rdev); pci_save_state(dev->pdev); - if (freeze && rdev->family >= CHIP_CEDAR) { + if (freeze && rdev->family >= CHIP_CEDAR && !(rdev->flags & RADEON_IS_IGP)) { rdev->asic->asic_reset(rdev, true); pci_restore_state(dev->pdev); } else if (suspend) { -- cgit From 9c95be0163a0a699ed7eda4727f485f8453580ef Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 13 Sep 2017 16:29:46 +0200 Subject: scsi: sd: Remove unnecessary condition in sd_read_block_limits() After series of changes around WRITE_SAME and UNMAP setup we ended up with leftover unnecessary condition. Remove it. Signed-off-by: Lukas Czerner Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 11c1738c2100..fb9f8b5f4673 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2915,8 +2915,6 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sd_config_discard(sdkp, SD_LBP_WS16); else if (sdkp->lbpws10) sd_config_discard(sdkp, SD_LBP_WS10); - else if (sdkp->lbpu && sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); else sd_config_discard(sdkp, SD_LBP_DISABLE); } -- cgit From 4759df905a474d245752c9dc94288e779b8734dd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:15 +0200 Subject: scsi: sg: factor out sg_fill_request_table() Factor out sg_fill_request_table() for better readability. [mkp: typos, applied by hand] Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 61 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index cf0e71db9e51..1acdb6e03999 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -828,6 +828,40 @@ static int max_sectors_bytes(struct request_queue *q) return max_sectors << 9; } +static void +sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) +{ + Sg_request *srp; + int val; + unsigned int ms; + + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { + if (val > SG_MAX_QUEUE) + break; + memset(&rinfo[val], 0, SZ_SG_REQ_INFO); + rinfo[val].req_state = srp->done + 1; + rinfo[val].problem = + srp->header.masked_status & + srp->header.host_status & + srp->header.driver_status; + if (srp->done) + rinfo[val].duration = + srp->header.duration; + else { + ms = jiffies_to_msecs(jiffies); + rinfo[val].duration = + (ms > srp->header.duration) ? + (ms - srp->header.duration) : 0; + } + rinfo[val].orphan = srp->orphan; + rinfo[val].sg_io_owned = srp->sg_io_owned; + rinfo[val].pack_id = srp->header.pack_id; + rinfo[val].usr_ptr = srp->header.usr_ptr; + val++; + } +} + static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) { @@ -1012,38 +1046,13 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return -EFAULT; else { sg_req_info_t *rinfo; - unsigned int ms; rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); - val = 0; - list_for_each_entry(srp, &sfp->rq_list, entry) { - if (val >= SG_MAX_QUEUE) - break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); - rinfo[val].req_state = srp->done + 1; - rinfo[val].problem = - srp->header.masked_status & - srp->header.host_status & - srp->header.driver_status; - if (srp->done) - rinfo[val].duration = - srp->header.duration; - else { - ms = jiffies_to_msecs(jiffies); - rinfo[val].duration = - (ms > srp->header.duration) ? - (ms - srp->header.duration) : 0; - } - rinfo[val].orphan = srp->orphan; - rinfo[val].sg_io_owned = srp->sg_io_owned; - rinfo[val].pack_id = srp->header.pack_id; - rinfo[val].usr_ptr = srp->header.usr_ptr; - val++; - } + sg_fill_request_table(sfp, rinfo); read_unlock_irqrestore(&sfp->rq_list_lock, iflags); result = __copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); -- cgit From 3e0097499839e0fe3af380410eababe5a47c4cf9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:16 +0200 Subject: scsi: sg: fixup infoleak when using SG_GET_REQUEST_TABLE When calling SG_GET_REQUEST_TABLE ioctl only a half-filled table is returned; the remaining part will then contain stale kernel memory information. This patch zeroes out the entire table to avoid this issue. Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Eric Dumazet Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 1acdb6e03999..0419c2298eab 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -839,7 +839,6 @@ sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) list_for_each_entry(srp, &sfp->rq_list, entry) { if (val > SG_MAX_QUEUE) break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); rinfo[val].req_state = srp->done + 1; rinfo[val].problem = srp->header.masked_status & @@ -1047,8 +1046,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) else { sg_req_info_t *rinfo; - rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, - GFP_KERNEL); + rinfo = kzalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, + GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); -- cgit From 6c92f7dbf25c36f35320e4ae0b508676410bac04 Mon Sep 17 00:00:00 2001 From: Dave Carroll Date: Fri, 15 Sep 2017 11:04:28 -0600 Subject: scsi: aacraid: Fix 2T+ drives on SmartIOC-2000 The logic for supporting large drives was previously tied to 4Kn support for SmartIOC-2000. As SmartIOC-2000 does not support volumes using 4Kn drives, use the intended option flag AAC_OPT_NEW_COMM_64 to determine support for volumes greater than 2T. Cc: Signed-off-by: Dave Carroll Reviewed-by: Christoph Hellwig Reviewed-by: Raghava Aditya Renukunta Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aachba.c | 12 ++++++------ drivers/scsi/aacraid/aacraid.h | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index a64285ab0728..af3e4d3f9735 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -699,13 +699,13 @@ static void _aac_probe_container1(void * context, struct fib * fibptr) int status; dresp = (struct aac_mount *) fib_data(fibptr); - if (!(fibptr->dev->supplement_adapter_info.supported_options2 & - AAC_OPTION_VARIABLE_BLOCK_SIZE)) + if (!aac_supports_2T(fibptr->dev)) { dresp->mnt[0].capacityhigh = 0; - if ((le32_to_cpu(dresp->status) != ST_OK) || - (le32_to_cpu(dresp->mnt[0].vol) != CT_NONE)) { - _aac_probe_container2(context, fibptr); - return; + if ((le32_to_cpu(dresp->status) == ST_OK) && + (le32_to_cpu(dresp->mnt[0].vol) != CT_NONE)) { + _aac_probe_container2(context, fibptr); + return; + } } scsicmd = (struct scsi_cmnd *) context; diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h index 92fabf2b0c24..403a639574e5 100644 --- a/drivers/scsi/aacraid/aacraid.h +++ b/drivers/scsi/aacraid/aacraid.h @@ -2701,6 +2701,11 @@ static inline int aac_is_src(struct aac_dev *dev) return 0; } +static inline int aac_supports_2T(struct aac_dev *dev) +{ + return (dev->adapter_info.options & AAC_OPT_NEW_COMM_64); +} + char * get_container_type(unsigned type); extern int numacb; extern char aac_driver_version[]; -- cgit From 51ae253870f55e14ba2854ce9577ac2920efef0c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 21:29:13 +0200 Subject: xen: x86: mark xen_find_pt_base as __init gcc-4.6 causes a harmless link-time warning: WARNING: vmlinux.o(.text.unlikely+0x48e): Section mismatch in reference from the function xen_find_pt_base() to the function .init.text:m2p() The function xen_find_pt_base() references the function __init m2p(). This is often because xen_find_pt_base lacks a __init annotation or the annotation of m2p is wrong. Newer compilers inline this function, so it never shows up, but marking it __init is the right way to avoid the warning. Fixes: 70e61199559a ("xen: move p2m list if conflicting with e820 map") Signed-off-by: Arnd Bergmann Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 0422ee7e70b3..ddfeebc2b125 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2221,7 +2221,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) * not the first page table in the page table pool. * Iterate through the initial page tables to find the real page table base. */ -static phys_addr_t xen_find_pt_base(pmd_t *pmd) +static phys_addr_t __init xen_find_pt_base(pmd_t *pmd) { phys_addr_t pt_base, paddr; unsigned pmdidx; -- cgit From fd7d56270b526ca3ed0c224362e3c64a0f86687a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 14 Sep 2017 11:42:17 +0200 Subject: fs/proc: Report eip/esp in /prod/PID/stat for coredumping Commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") stopped reporting eip/esp because it is racy and dangerous for executing tasks. The comment adds: As far as I know, there are no use programs that make any material use of these fields, so just get rid of them. However, existing userspace core-dump-handler applications (for example, minicoredumper) are using these fields since they provide an excellent cross-platform interface to these valuable pointers. So that commit introduced a user space visible regression. Partially revert the change and make the readout possible for tasks with the proper permissions and only if the target task has the PF_DUMPCORE flag set. Fixes: 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in> /proc/PID/stat") Reported-by: Marco Felsch Signed-off-by: John Ogness Reviewed-by: Andy Lutomirski Cc: Tycho Andersen Cc: Kees Cook Cc: Peter Zijlstra Cc: Brian Gerst Cc: stable@vger.kernel.org Cc: Tetsuo Handa Cc: Borislav Petkov Cc: Al Viro Cc: Linux API Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/87poatfwg6.fsf@linutronix.de Signed-off-by: Thomas Gleixner --- fs/proc/array.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/proc/array.c b/fs/proc/array.c index 88c355574aa0..525157ca25cb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -421,7 +422,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). + * + * The only exception is if the task is core dumping because + * a program is not able to use ptrace(2) in that case. It is + * safe because the task has stopped executing permanently. */ + if (permitted && (task->flags & PF_DUMPCORE)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } } get_task_comm(tcomm, task); -- cgit From 5c756065e47dc3e84b00577bd109f0a8e69903d7 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 6 Sep 2017 11:02:56 +0200 Subject: scsi: lpfc: Don't return internal MBXERR_ERROR code from probe function Internal error codes happen to be positive, thus the PCI driver core won't treat them as failure, but we do. This would cause a crash later on as lpfc_pci_remove_one() is called (e.g. as shutdown function). Fixes: 6d368e532168 ("[SCSI] lpfc 8.3.24: Add resource extent support") Signed-off-by: Stefano Brivio Reviewed-by: Johannes Thumshirn Acked-by: Dick Kennedy Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 7e7ae786121b..100bc4c8798d 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -6131,6 +6131,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) "Extents and RPI headers enabled.\n"); } mempool_free(mboxq, phba->mbox_mem_pool); + rc = -EIO; goto out_free_bsmbx; } -- cgit From 4cb433e856bce5974ea035181cc8eb406496dccc Mon Sep 17 00:00:00 2001 From: Nikola Pajkovsky Date: Wed, 13 Sep 2017 10:46:17 +0200 Subject: scsi: aacraid: error: testing array offset 'bus' after use Fix possible indexing array of bound for &aac->hba_map[bus][cid], where bus and cid boundary check happens later. Fixes: 0d643ff3c353 ("scsi: aacraid: use aac_tmf_callback for reset fib") Signed-off-by: Nikola Pajkovsky Reviewed-by: Dave Carroll Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/linit.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 87cc4a93e637..62beb2596466 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -906,12 +906,14 @@ static int aac_eh_dev_reset(struct scsi_cmnd *cmd) bus = aac_logical_to_phys(scmd_channel(cmd)); cid = scmd_id(cmd); - info = &aac->hba_map[bus][cid]; - if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS || - info->devtype != AAC_DEVTYPE_NATIVE_RAW) + + if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS) return FAILED; - if (info->reset_state > 0) + info = &aac->hba_map[bus][cid]; + + if (info->devtype != AAC_DEVTYPE_NATIVE_RAW && + info->reset_state > 0) return FAILED; pr_err("%s: Host adapter reset request. SCSI hang ?\n", @@ -962,12 +964,14 @@ static int aac_eh_target_reset(struct scsi_cmnd *cmd) bus = aac_logical_to_phys(scmd_channel(cmd)); cid = scmd_id(cmd); - info = &aac->hba_map[bus][cid]; - if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS || - info->devtype != AAC_DEVTYPE_NATIVE_RAW) + + if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS) return FAILED; - if (info->reset_state > 0) + info = &aac->hba_map[bus][cid]; + + if (info->devtype != AAC_DEVTYPE_NATIVE_RAW && + info->reset_state > 0) return FAILED; pr_err("%s: Host adapter reset request. SCSI hang ?\n", -- cgit From 9cb067ef8a10bb13112e4d1c0ea996ec96527422 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:03 +0200 Subject: genirq: Fix cpumask check in __irq_startup_managed() The result of cpumask_any_and() is invalid when result greater or equal nr_cpu_ids. The current check is checking for greater only. Fix it. Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()") Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Tony Luck Cc: Chen Yu Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: stable@vger.kernel.org Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: http://lkml.kernel.org/r/20170913213152.272283444@linutronix.de --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f51b7b6d2451..6fc89fd93824 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) irqd_clr_managed_shutdown(d); - if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { + if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt -- cgit From 6d57339890c9a9dfcd4ba4f8931b911b962968e3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 17:08:16 +0200 Subject: dma-coherent: fix rmem_dma_device_init regression My recent bug fix introduced another bug, which caused rmem_dma_device_init to always fail, as rmem->priv is never set to anything. This restores the previous behavior, calling dma_init_coherent_memory() whenever ->priv is NULL. Fixes: d35b0996fef3 ("dma-coherent: fix dma_declare_coherent_memory() logic error") Reported-by: Roy Pledge Tested-by: Roy Pledge Signed-off-by: Arnd Bergmann Signed-off-by: Christoph Hellwig --- drivers/base/dma-coherent.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index a39b2166b145..744f64f43454 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -348,16 +348,15 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) struct dma_coherent_mem *mem = rmem->priv; int ret; - if (!mem) - return -ENODEV; - - ret = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); - - if (ret) { - pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return ret; + if (!mem) { + ret = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, + DMA_MEMORY_EXCLUSIVE, &mem); + if (ret) { + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return ret; + } } mem->use_dev_dma_pfn_offset = true; rmem->priv = mem; -- cgit From ec11653b531099ddc08a8c7eb495ab83cae84e19 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Sep 2017 14:51:20 -0500 Subject: CIFS/SMB3: Update documentation to reflect SMB3 and various changes Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky --- Documentation/filesystems/cifs/AUTHORS | 5 ++ Documentation/filesystems/cifs/README | 81 ++++++++++++++++----------------- Documentation/filesystems/cifs/TODO | 72 ++++++++++++++--------------- Documentation/filesystems/cifs/cifs.txt | 24 ++++++---- 4 files changed, 91 insertions(+), 91 deletions(-) diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS index c98800df677f..9f4f87e16240 100644 --- a/Documentation/filesystems/cifs/AUTHORS +++ b/Documentation/filesystems/cifs/AUTHORS @@ -41,6 +41,11 @@ Igor Mammedov (DFS support) Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) Scott Lovenberg Pavel Shilovsky (for great work adding SMB2 support, and various SMB3 features) +Aurelien Aptel (for DFS SMB3 work and some key bug fixes) +Ronnie Sahlberg (for SMB3 xattr work and bug fixes) +Shirish Pargaonkar (for many ACL patches over the years) +Sachin Prabhu (many bug fixes, including for reconnect, copy offload and security) + Test case and Bug Report contributors ------------------------------------- diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README index a54788405429..a9da51553ba3 100644 --- a/Documentation/filesystems/cifs/README +++ b/Documentation/filesystems/cifs/README @@ -1,10 +1,14 @@ -The CIFS VFS support for Linux supports many advanced network filesystem -features such as hierarchical dfs like namespace, hardlinks, locking and more. +This module supports the SMB3 family of advanced network protocols (as well +as older dialects, originally called "CIFS" or SMB1). + +The CIFS VFS module for Linux supports many advanced network filesystem +features such as hierarchical DFS like namespace, hardlinks, locking and more. It was designed to comply with the SNIA CIFS Technical Reference (which supersedes the 1992 X/Open SMB Standard) as well as to perform best practice practical interoperability with Windows 2000, Windows XP, Samba and equivalent servers. This code was developed in participation with the Protocol Freedom -Information Foundation. +Information Foundation. CIFS and now SMB3 has now become a defacto +standard for interoperating between Macs and Windows and major NAS appliances. Please see http://protocolfreedom.org/ and @@ -15,30 +19,11 @@ for more details. For questions or bug reports please contact: sfrench@samba.org (sfrench@us.ibm.com) +See the project page at: https://wiki.samba.org/index.php/LinuxCIFS_utils + Build instructions: ================== -For Linux 2.4: -1) Get the kernel source (e.g.from http://www.kernel.org) -and download the cifs vfs source (see the project page -at http://us1.samba.org/samba/Linux_CIFS_client.html) -and change directory into the top of the kernel directory -then patch the kernel (e.g. "patch -p1 < cifs_24.patch") -to add the cifs vfs to your kernel configure options if -it has not already been added (e.g. current SuSE and UL -users do not need to apply the cifs_24.patch since the cifs vfs is -already in the kernel configure menu) and then -mkdir linux/fs/cifs and then copy the current cifs vfs files from -the cifs download to your kernel build directory e.g. - - cp /fs/cifs/* to /fs/cifs - -2) make menuconfig (or make xconfig) -3) select cifs from within the network filesystem choices -4) save and exit -5) make dep -6) make modules (or "make" if CIFS VFS not to be built as a module) - -For Linux 2.6: +For Linux: 1) Download the kernel (e.g. from http://www.kernel.org) and change directory into the top of the kernel directory tree (e.g. /usr/src/linux-2.5.73) @@ -61,16 +46,13 @@ would simply type "make install"). If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on the CIFS VFS web site) copy it to the same directory in which mount.smbfs and similar files reside (usually /sbin). Although the helper software is not -required, mount.cifs is recommended. Eventually the Samba 3.0 utility program -"net" may also be helpful since it may someday provide easier mount syntax for -users who are used to Windows e.g. - net use +required, mount.cifs is recommended. Most distros include a "cifs-utils" +package that includes this utility so it is recommended to install this. + Note that running the Winbind pam/nss module (logon service) on all of your Linux clients is useful in mapping Uids and Gids consistently across the domain to the proper network user. The mount.cifs mount helper can be -trivially built from Samba 3.0 or later source e.g. by executing: - - gcc samba/source/client/mount.cifs.c -o mount.cifs +found at cifs-utils.git on git.samba.org If cifs is built as a module, then the size and number of network buffers and maximum number of simultaneous requests to one server can be configured. @@ -79,6 +61,18 @@ Changing these from their defaults is not recommended. By executing modinfo on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made at module initialization time (by running insmod cifs.ko) can be seen. +Recommendations +=============== +To improve security the SMB2.1 dialect or later (usually will get SMB3) is now +the new default. To use old dialects (e.g. to mount Windows XP) use "vers=1.0" +on mount (or vers=2.0 for Windows Vista). Note that the CIFS (vers=1.0) is +much older and less secure than the default dialect SMB3 which includes +many advanced security features such as downgrade attack detection +and encrypted shares and stronger signing and authentication algorithms. +There are additional mount options that may be helpful for SMB3 to get +improved POSIX behavior (NB: can use vers=3.0 to force only SMB3, never 2.1): + "mfsymlinks" and "cifsacl" and "idsfromsid" + Allowing User Mounts ==================== To permit users to mount and unmount over directories they own is possible @@ -98,9 +92,7 @@ and execution of suid programs on the remote target would be enabled by default. This can be changed, as with nfs and other filesystems, by simply specifying "nosuid" among the mount options. For user mounts though to be able to pass the suid flag to mount requires rebuilding -mount.cifs with the following flag: - - gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs +mount.cifs with the following flag: CIFS_ALLOW_USR_SUID There is a corresponding manual page for cifs mounting in the Samba 3.0 and later source tree in docs/manpages/mount.cifs.8 @@ -189,18 +181,18 @@ applications running on the same server as Samba. Use instructions: ================ Once the CIFS VFS support is built into the kernel or installed as a module -(cifs.o), you can use mount syntax like the following to access Samba or Windows -servers: +(cifs.ko), you can use mount syntax like the following to access Samba or +Mac or Windows servers: - mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword + mount -t cifs //9.53.216.11/e$ /mnt -o username=myname,password=mypassword Before -o the option -v may be specified to make the mount.cifs mount helper display the mount steps more verbosely. After -o the following commonly used cifs vfs specific options are supported: - user= - pass= + username= + password= domain= Other cifs mount options are described below. Use of TCP names (in addition to @@ -246,13 +238,16 @@ the Server's registry. Samba starting with version 3.10 will allow such filenames (ie those which contain valid Linux characters, which normally would be forbidden for Windows/CIFS semantics) as long as the server is configured for Unix Extensions (and the client has not disabled -/proc/fs/cifs/LinuxExtensionsEnabled). - +/proc/fs/cifs/LinuxExtensionsEnabled). In addition the mount option +"mapposix" can be used on CIFS (vers=1.0) to force the mapping of +illegal Windows/NTFS/SMB characters to a remap range (this mount parm +is the default for SMB3). This remap ("mapposix") range is also +compatible with Mac (and "Services for Mac" on some older Windows). CIFS VFS Mount Options ====================== A partial list of the supported mount options follows: - user The user name to use when trying to establish + username The user name to use when trying to establish the CIFS session. password The user password. If the mount helper is installed, the user will be prompted for password diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO index 066ffddc3964..396ecfd6ff4a 100644 --- a/Documentation/filesystems/cifs/TODO +++ b/Documentation/filesystems/cifs/TODO @@ -1,4 +1,4 @@ -Version 2.03 August 1, 2014 +Version 2.04 September 13, 2017 A Partial List of Missing Features ================================== @@ -8,73 +8,69 @@ for visible, important contributions to this module. Here is a partial list of the known problems and missing features: a) SMB3 (and SMB3.02) missing optional features: - - RDMA + - RDMA (started) - multichannel (started) - directory leases (improved metadata caching) - T10 copy offload (copy chunk is only mechanism supported) - - encrypted shares b) improved sparse file support c) Directory entry caching relies on a 1 second timer, rather than -using FindNotify or equivalent. - (started) +using Directory Leases d) quota support (needs minor kernel change since quota calls to make it to network filesystems or deviceless filesystems) -e) improve support for very old servers (OS/2 and Win9x for example) -Including support for changing the time remotely (utimes command). +e) Better optimize open to reduce redundant opens (using reference +counts more) and to improve use of compounding in SMB3 to reduce +number of roundtrips. -f) hook lower into the sockets api (as NFS/SunRPC does) to avoid the -extra copy in/out of the socket buffers in some cases. - -g) Better optimize open (and pathbased setfilesize) to reduce the -oplock breaks coming from windows srv. Piggyback identical file -opens on top of each other by incrementing reference count rather -than resending (helps reduce server resource utilization and avoid -spurious oplock breaks). - -h) Add support for storing symlink info to Windows servers -in the Extended Attribute format their SFU clients would recognize. - -i) Finish inotify support so kde and gnome file list windows +f) Finish inotify support so kde and gnome file list windows will autorefresh (partially complete by Asser). Needs minor kernel vfs change to support removing D_NOTIFY on a file. -j) Add GUI tool to configure /proc/fs/cifs settings and for display of +g) Add GUI tool to configure /proc/fs/cifs settings and for display of the CIFS statistics (started) -k) implement support for security and trusted categories of xattrs +h) implement support for security and trusted categories of xattrs (requires minor protocol extension) to enable better support for SELINUX -l) Implement O_DIRECT flag on open (already supported on mount) +i) Implement O_DIRECT flag on open (already supported on mount) -m) Create UID mapping facility so server UIDs can be mapped on a per +j) Create UID mapping facility so server UIDs can be mapped on a per mount or a per server basis to client UIDs or nobody if no mapping -exists. This is helpful when Unix extensions are negotiated to -allow better permission checking when UIDs differ on the server -and client. Add new protocol request to the CIFS protocol -standard for asking the server for the corresponding name of a -particular uid. +exists. Also better integration with winbind for resolving SID owners + +k) Add tools to take advantage of more smb3 specific ioctls and features + +l) encrypted file support + +m) improved stats gathering, tools (perhaps integration with nfsometer?) -n) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too) +n) allow setting more NTFS/SMB3 file attributes remotely (currently limited to compressed +file attribute via chflags) and improve user space tools for managing and +viewing them. -o) mount check for unmatched uids +o) mount helper GUI (to simplify the various configuration options on mount) -p) Add support for new vfs entry point for fallocate +p) autonegotiation of dialects (offering more than one dialect ie SMB3.02, +SMB3, SMB2.1 not just SMB3). -q) Add tools to take advantage of cifs/smb3 specific ioctls and features -such as "CopyChunk" (fast server side file copy) +q) Allow mount.cifs to be more verbose in reporting errors with dialect +or unsupported feature errors. -r) encrypted file support +r) updating cifs documentation, and user guid. -s) improved stats gathering, tools (perhaps integration with nfsometer?) +s) Addressing bugs found by running a broader set of xfstests in standard +file system xfstest suite. -t) allow setting more NTFS/SMB3 file attributes remotely (currently limited to compressed -file attribute via chflags) +t) split cifs and smb3 support into separate modules so legacy (and less +secure) CIFS dialect can be disabled in environments that don't need it +and simplify the code. -u) mount helper GUI (to simplify the various configuration options on mount) +u) Finish up SMB3.1.1 dialect support +v) POSIX Extensions for SMB3.1.1 KNOWN BUGS ==================================== diff --git a/Documentation/filesystems/cifs/cifs.txt b/Documentation/filesystems/cifs/cifs.txt index 2fac91ac96cf..67756607246e 100644 --- a/Documentation/filesystems/cifs/cifs.txt +++ b/Documentation/filesystems/cifs/cifs.txt @@ -1,24 +1,28 @@ - This is the client VFS module for the Common Internet File System - (CIFS) protocol which is the successor to the Server Message Block + This is the client VFS module for the SMB3 NAS protocol as well + older dialects such as the Common Internet File System (CIFS) + protocol which was the successor to the Server Message Block (SMB) protocol, the native file sharing mechanism for most early PC operating systems. New and improved versions of CIFS are now called SMB2 and SMB3. These dialects are also supported by the CIFS VFS module. CIFS is fully supported by network - file servers such as Windows 2000, 2003, 2008 and 2012 + file servers such as Windows 2000, 2003, 2008, 2012 and 2016 as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems), so + server support for Linux and many other operating systems), Apple + systems, as well as most Network Attached Storage vendors, so this network filesystem client can mount to a wide variety of servers. The intent of this module is to provide the most advanced network - file system function for CIFS compliant servers, including better - POSIX compliance, secure per-user session establishment, high - performance safe distributed caching (oplock), optional packet + file system function for SMB3 compliant servers, including advanced + security features, excellent parallelized high performance i/o, better + POSIX compliance, secure per-user session establishment, encryption, + high performance safe distributed caching (leases/oplocks), optional packet signing, large files, Unicode support and other internationalization improvements. Since both Samba server and this filesystem client support - the CIFS Unix extensions, the combination can provide a reasonable - alternative to NFSv4 for fileserving in some Linux to Linux environments, - not just in Linux to Windows environments. + the CIFS Unix extensions (and in the future SMB3 POSIX extensions), + the combination can provide a reasonable alternative to other network and + cluster file systems for fileserving in some Linux to Linux environments, + not just in Linux to Windows (or Linux to Mac) environments. This filesystem has an mount utility (mount.cifs) that can be obtained from -- cgit From 47061a24e2ee5bd8a40d473d47a5bd823fa0081f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:48 -0700 Subject: x86/mm: Factor out CR3-building code Current, the code that assembles a value to load into CR3 is open-coded everywhere. Factor it out into helpers build_cr3() and build_cr3_noflush(). This makes one semantic change: __get_current_cr3_fast() was wrong on SME systems. No one noticed because the only caller is in the VMX code, and there are no CPUs with both SME and VMX. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/ce350cf11e93e2842d14d0b95b0199c7d881f527.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 15 +++++++++++---- arch/x86/mm/tlb.c | 11 +++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 7ae318c340d9..a999ba6b721f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -286,6 +286,15 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } +static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) +{ + return __sme_pa(mm->pgd) | asid; +} + +static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) +{ + return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; +} /* * This can be used from process context to figure out what the value of @@ -296,10 +305,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, */ static inline unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); - - if (static_cpu_has(X86_FEATURE_PCID)) - cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1ab3821f9e26..93fe97cce581 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -126,8 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != - (__sme_pa(real_prev->pgd) | prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -172,7 +171,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, next_tlb_gen); - write_cr3(__sme_pa(next->pgd) | prev_asid); + write_cr3(build_cr3(next, prev_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } @@ -216,12 +215,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(__sme_pa(next->pgd) | new_asid); + write_cr3(build_cr3(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + write_cr3(build_cr3_noflush(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -265,7 +264,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ - write_cr3(cr3 & ~CR3_PCID_MASK); + write_cr3(build_cr3(mm, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); -- cgit From 52a2af400c1075219b3f0ce5c96fc961da44018a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:49 -0700 Subject: x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code Putting the logical ASID into CR3's PCID bits directly means that we have two cases to consider separately: ASID == 0 and ASID != 0. This means that bugs that only hit in one of these cases trigger nondeterministically. There were some bugs like this in the past, and I think there's still one in current kernels. In particular, we have a number of ASID-unware code paths that save CR3, write some special value, and then restore CR3. This includes suspend/resume, hibernate, kexec, EFI, and maybe other things I've missed. This is currently dangerous: if ASID != 0, then this code sequence will leave garbage in the TLB tagged for ASID 0. We could potentially see corruption when switching back to ASID 0. In principle, an initialize_tlbstate_and_flush() call after these sequences would solve the problem, but EFI, at least, does not call this. (And it probably shouldn't -- initialize_tlbstate_and_flush() is rather expensive.) Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cdc14bbe5d3c3ef2a562be09a6368ffe9bd947a6.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index a999ba6b721f..c120b5db178a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -286,14 +286,31 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } +/* + * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID + * bits. This serves two purposes. It prevents a nasty situation in + * which PCID-unaware code saves CR3, loads some other value (with PCID + * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if + * the saved ASID was nonzero. It also means that any bugs involving + * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger + * deterministically. + */ + static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) { - return __sme_pa(mm->pgd) | asid; + if (static_cpu_has(X86_FEATURE_PCID)) { + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(mm->pgd) | (asid + 1); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(mm->pgd); + } } static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) { - return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; } /* -- cgit From b8b7abaed7a49b350f8ba659ddc264b04931d581 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:50 -0700 Subject: x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier Otherwise we might have the PCID feature bit set during cpu_init(). This is just for robustness. I haven't seen any actual bugs here. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cba4671af755 ("x86/mm: Disable PCID on 32-bit kernels") Link: http://lkml.kernel.org/r/b16dae9d6b0db5d9801ddbebbfd83384097c61f3.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/bugs.c | 8 -------- arch/x86/kernel/cpu/common.c | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index db684880d74a..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,14 +21,6 @@ void __init check_bugs(void) { -#ifdef CONFIG_X86_32 - /* - * Regardless of whether PCID is enumerated, the SDM says - * that it can't be enabled in 32-bit mode. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); -#endif - identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 775f10100d7f..c9176bae7fd8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -904,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif } void __init early_cpu_init(void) -- cgit From 4ba55e65f471d011d3ba2ac2022180ea0877d68e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:51 -0700 Subject: x86/mm/32: Load a sane CR3 before cpu_init() on secondary CPUs For unknown historical reasons (i.e. Borislav doesn't recall), 32-bit kernels invoke cpu_init() on secondary CPUs with initial_page_table loaded into CR3. Then they set current->active_mm to &init_mm and call enter_lazy_tlb() before fixing CR3. This means that the x86 TLB code gets invoked while CR3 is inconsistent, and, with the improved PCID sanity checks I added, we warn. Fix it by loading swapper_pg_dir (i.e. init_mm.pgd) earlier. Reported-by: Paul Menzel Reported-by: Pavel Machek Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 72c0098d92ce ("x86/mm: Reinitialize TLB state on hotplug and resume") Link: http://lkml.kernel.org/r/30cdfea504682ba3b9012e77717800a91c22097f.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0854ff169274..ad59edd84de7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -232,12 +232,6 @@ static void notrace start_secondary(void *unused) */ if (boot_cpu_has(X86_FEATURE_PCID)) __write_cr4(__read_cr4() | X86_CR4_PCIDE); - cpu_init(); - x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); - smp_callin(); - - enable_start_cpu0 = 0; #ifdef CONFIG_X86_32 /* switch away from the initial page table */ @@ -245,6 +239,13 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); #endif + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); + preempt_disable(); + smp_callin(); + + enable_start_cpu0 = 0; + /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* -- cgit From bf29ed1567b67854dc13504f685c45a2ea9b2081 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:44 -0700 Subject: syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check Use CHECK_DATA_CORRUPTION instead of BUG_ON to provide more flexibility on address limit failures. By default, send a SIGKILL signal to kill the current process preventing exploitation of a bad address limit. Make the TIF_FSCHECK flag optional so ARM can use this function. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-2-git-send-email-keescook@chromium.org --- include/linux/syscalls.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 95606a2d556f..a78186d826d7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -221,21 +221,25 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#ifdef TIF_FSCHECK /* * Called before coming back to user-mode. Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. */ static inline void addr_limit_user_check(void) { - +#ifdef TIF_FSCHECK if (!test_thread_flag(TIF_FSCHECK)) return; +#endif - BUG_ON(!segment_eq(get_fs(), USER_DS)); + if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS), + "Invalid address limit on user-mode return")) + force_sig(SIGKILL, current); + +#ifdef TIF_FSCHECK clear_thread_flag(TIF_FSCHECK); -} #endif +} asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); -- cgit From 2404269bc4e77a67875c8db6667be34c9913c96e Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:45 -0700 Subject: Revert "arm/syscalls: Check address limit on user-mode return" This reverts commit 73ac5d6a2b6ac3ae8d1e1818f3e9946f97489bc9. The work pending loop can call set_fs after addr_limit_user_check removed the _TIF_FSCHECK flag. This may happen at anytime based on how ARM handles alignment exceptions. It leads to an infinite loop condition. After discussion, it has been agreed that the generic approach is not tailored to the ARM architecture and any fix might not be complete. This patch will be replaced by an architecture specific implementation. The work flag approach will be kept for other architectures. Reported-by: Leonard Crestez Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-3-git-send-email-keescook@chromium.org --- arch/arm/include/asm/thread_info.h | 15 ++++++--------- arch/arm/include/asm/uaccess.h | 2 -- arch/arm/kernel/entry-common.S | 9 ++------- arch/arm/kernel/signal.c | 5 ----- 4 files changed, 8 insertions(+), 23 deletions(-) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 1d468b527b7b..776757d1604a 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -139,11 +139,10 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */ -#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ +#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -154,7 +153,6 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) @@ -168,9 +166,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_FSCHECK) +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 87936dd5d151..0bf2347495f1 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -70,8 +70,6 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); } #define segment_eq(a, b) ((a) == (b)) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index ca3614dc6938..0b60adf4a5d9 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -49,9 +49,7 @@ ret_fast_syscall: UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK - bne fast_work_pending - tst r1, #_TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending /* perform architecture specific actions before user return */ @@ -77,15 +75,12 @@ ret_fast_syscall: str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK - bne fast_work_pending - tst r1, #_TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending UNWIND(.fnend ) ENDPROC(ret_fast_syscall) /* Slower path - fall through to work_pending */ -fast_work_pending: #endif tst r1, #_TIF_SYSCALL_WORK diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index e2de50bf8742..5814298ef0b7 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -614,10 +613,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) * Update the trace code with the current status. */ trace_hardirqs_off(); - - /* Check valid user FS if needed */ - addr_limit_user_check(); - do { if (likely(thread_flags & _TIF_NEED_RESCHED)) { schedule(); -- cgit From e33f8d32677fa4f4f8996ef46748f86aac81ccff Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:46 -0700 Subject: arm/syscalls: Optimize address limit check Disable the generic address limit check in favor of an architecture specific optimized implementation. The generic implementation using pending work flags did not work well with ARM and alignment faults. The address limit is checked on each syscall return path to user-mode path as well as the irq user-mode return function. If the address limit was changed, a function is called to report data corruption (stopping the kernel or process based on configuration). The address limit check has to be done before any pending work because they can reset the address limit and the process is killed using a SIGKILL signal. For example the lkdtm address limit check does not work because the signal to kill the process will reset the user-mode address limit. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Tested-by: Kees Cook Tested-by: Leonard Crestez Reviewed-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-4-git-send-email-keescook@chromium.org --- arch/arm/kernel/entry-common.S | 11 +++++++++++ arch/arm/kernel/signal.c | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 0b60adf4a5d9..99c908226065 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_AEABI #include #endif @@ -48,10 +49,14 @@ ret_fast_syscall: UNWIND(.fnstart ) UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending + /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr @@ -74,6 +79,9 @@ ret_fast_syscall: UNWIND(.cantunwind ) str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending @@ -106,6 +114,9 @@ ENTRY(ret_to_user) ret_slow_syscall: disable_irq_notrace @ disable interrupts ENTRY(ret_to_user_from_irq) + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] tst r1, #_TIF_WORK_MASK bne slow_work_pending diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 5814298ef0b7..b67ae12503f3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -673,3 +674,9 @@ struct page *get_signal_page(void) return page; } + +/* Defer to generic check */ +asmlinkage void addr_limit_check_failed(void) +{ + addr_limit_user_check(); +} -- cgit From a2048e34d4655c06d31400646ae495bbfeb16b27 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:47 -0700 Subject: arm64/syscalls: Move address limit check in loop A bug was reported on ARM where set_fs might be called after it was checked on the work pending function. ARM64 is not affected by this bug but has a similar construct. In order to avoid any similar problems in the future, the addr_limit_user_check function is moved at the beginning of the loop. Fixes: cf7de27ab351 ("arm64/syscalls: Check address limit on user-mode return") Reported-by: Leonard Crestez Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-5-git-send-email-keescook@chromium.org --- arch/arm64/kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c45214f8fb54..0bdc96c61bc0 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -751,10 +751,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, */ trace_hardirqs_off(); - /* Check valid user FS if needed */ - addr_limit_user_check(); - do { + /* Check valid user FS if needed */ + addr_limit_user_check(); + if (thread_flags & _TIF_NEED_RESCHED) { schedule(); } else { -- cgit From 9764c02fcbad40001fd3f63558d918e4d519bb75 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Sep 2017 10:41:35 -0500 Subject: SMB3: Add support for multidialect negotiate (SMB2.1 and later) With the need to discourage use of less secure dialect, SMB1 (CIFS), we temporarily upgraded the dialect to SMB3 in 4.13, but since there are various servers which only support SMB2.1 (2.1 is more secure than CIFS/SMB1) but not optimal for a default dialect - add support for multidialect negotiation. cifs.ko will now request SMB2.1 or later (ie SMB2.1 or SMB3.0, SMB3.02) and the server will pick the latest most secure one it can support. In addition since we are sending multidialect negotiate, add support for secure negotiate to validate that a man in the middle didn't downgrade us. Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky CC: Stable # 4.13+ --- fs/cifs/cifsglob.h | 6 ++++ fs/cifs/connect.c | 24 ++++++++++----- fs/cifs/smb2ops.c | 40 +++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++------- fs/cifs/smb2pdu.h | 2 +- 5 files changed, 139 insertions(+), 18 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 808486c29f0d..de5b2e1fcce5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -188,6 +188,8 @@ enum smb_version { #ifdef CONFIG_CIFS_SMB311 Smb_311, #endif /* SMB311 */ + Smb_3any, + Smb_default, Smb_version_err }; @@ -1701,6 +1703,10 @@ extern struct smb_version_values smb20_values; #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; +#define SMBDEFAULT_VERSION_STRING "default" +extern struct smb_version_values smbdefault_values; +#define SMB3ANY_VERSION_STRING "3" +extern struct smb_version_values smb3any_values; #define SMB30_VERSION_STRING "3.0" extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5aa2d278ca84..8d38b22afb2b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -301,6 +301,8 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_311, SMB311_VERSION_STRING }, { Smb_311, ALT_SMB311_VERSION_STRING }, #endif /* SMB311 */ + { Smb_3any, SMB3ANY_VERSION_STRING }, + { Smb_default, SMBDEFAULT_VERSION_STRING }, { Smb_version_err, NULL } }; @@ -1148,6 +1150,14 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->vals = &smb311_values; break; #endif /* SMB311 */ + case Smb_3any: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb3any_values; + break; + case Smb_default: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smbdefault_values; + break; default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); return 1; @@ -1274,9 +1284,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->actimeo = CIFS_DEF_ACTIMEO; - /* FIXME: add autonegotiation for SMB3 or later rather than just SMB3 */ - vol->ops = &smb30_operations; /* both secure and accepted widely */ - vol->vals = &smb30_values; + /* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */ + vol->ops = &smb30_operations; + vol->vals = &smbdefault_values; vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; @@ -1988,11 +1998,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (got_version == false) pr_warn("No dialect specified on mount. Default has changed to " - "a more secure dialect, SMB3 (vers=3.0), from CIFS " + "a more secure dialect, SMB2.1 or later (e.g. SMB3), from CIFS " "(SMB1). To use the less secure SMB1 dialect to access " - "old servers which do not support SMB3 specify vers=1.0" - " on mount. For somewhat newer servers such as Windows " - "7 try vers=2.1.\n"); + "old servers which do not support SMB3 (or SMB2.1) specify vers=1.0" + " on mount.\n"); kfree(mountdata_copy); return 0; @@ -2133,6 +2142,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (vol->nosharesock) return 0; + /* BB update this for smb3any and default case */ if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index fb2934b9b97c..405a9bf13aac 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3110,6 +3110,46 @@ struct smb_version_values smb21_values = { .create_lease_size = sizeof(struct create_lease), }; +struct smb_version_values smb3any_values = { + .version_string = SMB3ANY_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; + +struct smb_version_values smbdefault_values = { + .version_string = SMBDEFAULT_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; + struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 69a751b038ab..5c16591a128e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -491,10 +491,25 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->hdr.sync_hdr.SessionId = 0; - req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); - - req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */ - inc_rfc1001_len(req, 2); + if (strcmp(ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) { + req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + req->DialectCount = cpu_to_le16(2); + inc_rfc1001_len(req, 4); + } else if (strcmp(ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); + req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + req->DialectCount = cpu_to_le16(3); + inc_rfc1001_len(req, 6); + } else { + /* otherwise send specific dialect */ + req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); + req->DialectCount = cpu_to_le16(1); + inc_rfc1001_len(req, 2); + } /* only one of SMB2 signing flags may be set in SMB2 request */ if (ses->sign) @@ -528,16 +543,42 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) */ if (rc == -EOPNOTSUPP) { cifs_dbg(VFS, "Dialect not supported by server. Consider " - "specifying vers=1.0 or vers=2.1 on mount for accessing" + "specifying vers=1.0 or vers=2.0 on mount for accessing" " older servers\n"); goto neg_exit; } else if (rc != 0) goto neg_exit; + if (strcmp(ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) { + if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { + cifs_dbg(VFS, + "SMB2 dialect returned but not requested\n"); + return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { + cifs_dbg(VFS, + "SMB2.1 dialect returned but not requested\n"); + return -EIO; + } + } else if (strcmp(ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { + cifs_dbg(VFS, + "SMB2 dialect returned but not requested\n"); + return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { + /* ops set to 3.0 by default for default so update */ + ses->server->ops = &smb21_operations; + } + } else if (rsp->DialectRevision != ses->server->vals->protocol_id) { + /* if requested single dialect ensure returned dialect matched */ + cifs_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", + cpu_to_le16(rsp->DialectRevision)); + return -EIO; + } + cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); - /* BB we may eventually want to match the negotiated vs. requested - dialect, even though we are only requesting one at a time */ if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) cifs_dbg(FYI, "negotiated smb2.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) @@ -558,6 +599,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } server->dialect = le16_to_cpu(rsp->DialectRevision); + /* BB: add check that dialect was valid given dialect(s) we asked for */ + /* SMB2 only has an extended negflavor */ server->negflavor = CIFS_NEGFLAVOR_EXTENDED; /* set it to the maximum buffer size value we can send with 1 credit */ @@ -606,6 +649,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) struct validate_negotiate_info_req vneg_inbuf; struct validate_negotiate_info_rsp *pneg_rsp; u32 rsplen; + u32 inbuflen; /* max of 4 dialects */ cifs_dbg(FYI, "validate negotiate\n"); @@ -634,9 +678,30 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) else vneg_inbuf.SecurityMode = 0; - vneg_inbuf.DialectCount = cpu_to_le16(1); - vneg_inbuf.Dialects[0] = - cpu_to_le16(tcon->ses->server->vals->protocol_id); + + if (strcmp(tcon->ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) { + vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + vneg_inbuf.DialectCount = cpu_to_le16(2); + /* structure is big enough for 3 dialects, sending only 2 */ + inbuflen = sizeof(struct validate_negotiate_info_req) - 2; + } else if (strcmp(tcon->ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID); + vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + vneg_inbuf.DialectCount = cpu_to_le16(3); + /* structure is big enough for 3 dialects */ + inbuflen = sizeof(struct validate_negotiate_info_req); + } else { + /* otherwise specific dialect was requested */ + vneg_inbuf.Dialects[0] = + cpu_to_le16(tcon->ses->server->vals->protocol_id); + vneg_inbuf.DialectCount = cpu_to_le16(1); + /* structure is big enough for 3 dialects, sending only 1 */ + inbuflen = sizeof(struct validate_negotiate_info_req) - 4; + } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 393ed5f4e1b6..6c9653a130c8 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -716,7 +716,7 @@ struct validate_negotiate_info_req { __u8 Guid[SMB2_CLIENT_GUID_SIZE]; __le16 SecurityMode; __le16 DialectCount; - __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ + __le16 Dialects[3]; /* BB expand this if autonegotiate > 3 dialects */ } __packed; struct validate_negotiate_info_rsp { -- cgit From 1368f155a972eab037f81b5dea82afd544227552 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 11:24:15 +0200 Subject: cifs: hide unused functions The newly added SMB2+ attribute support causes unused function warnings when CONFIG_CIFS_XATTR is disabled: fs/cifs/smb2ops.c:563:1: error: 'smb2_set_ea' defined but not used [-Werror=unused-function] smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, fs/cifs/smb2ops.c:513:1: error: 'smb2_query_eas' defined but not used [-Werror=unused-function] smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, This adds another #ifdef around the affected functions. Fixes: 5517554e4313 ("cifs: Add support for writing attributes on SMB2+") Fixes: 95907fea4fd8 ("cifs: Add support for reading attributes on SMB2+") Signed-off-by: Arnd Bergmann Acked-by: Geert Uytterhoeven Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 405a9bf13aac..0dafdbae1f8c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -426,6 +426,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_XATTR static ssize_t move_smb2_ea_to_cifs(char *dst, size_t dst_size, struct smb2_file_full_ea_info *src, size_t src_size, @@ -613,6 +614,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +#endif static bool smb2_can_echo(struct TCP_Server_Info *server) -- cgit From 94a9daeaece415ce37ccb413b49a580e68e3477b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Sep 2017 18:13:11 -0500 Subject: Update version of cifs module Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 30bf89b1fd9a..5a10e566f0e6 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.09" +#define CIFS_VERSION "2.10" #endif /* _CIFSFS_H */ -- cgit From 8fce3dc5c5d6f6301f67311fa79f333902b58cea Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 21:42:59 +0200 Subject: clocksource/integrator: Fix section mismatch warning gcc-4.6 and older fail to inline integrator_clocksource_init, so they end up showing a harmless warning: WARNING: vmlinux.o(.text+0x4aa94c): Section mismatch in reference from the function integrator_clocksource_init() to the function .init.text:clocksource_mmio_init() The function integrator_clocksource_init() references the function __init clocksource_mmio_init(). This is often because integrator_clocksource_init lacks a __init annotation or the annotation of clocksource_mmio_init is wrong. Add the missing __init annotation that makes it build cleanly with all compilers. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Thierry Reding Cc: Linus Walleij Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/20170915194310.1170514-1-arnd@arndb.de --- drivers/clocksource/timer-integrator-ap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c index 2ff64d9d4fb3..62d24690ba02 100644 --- a/drivers/clocksource/timer-integrator-ap.c +++ b/drivers/clocksource/timer-integrator-ap.c @@ -36,8 +36,8 @@ static u64 notrace integrator_read_sched_clock(void) return -readl(sched_clk_base + TIMER_VALUE); } -static int integrator_clocksource_init(unsigned long inrate, - void __iomem *base) +static int __init integrator_clocksource_init(unsigned long inrate, + void __iomem *base) { u32 ctrl = TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC; unsigned long rate = inrate; -- cgit From b8f3911610529ba531b99d1e109c7a40fb071f0a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 12 Sep 2017 15:10:35 +0200 Subject: mtd: spi-nor: Check consistency of the memory size extracted from the SFDP One field of the flash parameter table contains information about the flash device size. Most of the time the data extracted from this field is valid, but sometimes the BFPT section of the SFDP table is corrupted or invalid and this field is set to 0xffffffff, thus resulting in an integer overflow when setting params->size. Since NOR devices are anayway always smaller than 2^64 bytes, we can easily stop the BFPT parsing if the size reported in this table is invalid. Fixes: f384b352cbf0 ("mtd: spi-nor: parse Serial Flash Discoverable Parameters (SFDP) tables") Reported-by: Geert Uytterhoeven Signed-off-by: Boris Brezillon Tested-by: Geert Uytterhoeven Acked-by: Cyrille Pitchen --- drivers/mtd/spi-nor/spi-nor.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index cf1d4a15e10a..4425b0283725 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -2127,6 +2127,15 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor, params->size = bfpt.dwords[BFPT_DWORD(2)]; if (params->size & BIT(31)) { params->size &= ~BIT(31); + + /* + * Prevent overflows on params->size. Anyway, a NOR of 2^64 + * bits is unlikely to exist so this error probably means + * the BFPT we are reading is corrupted/wrong. + */ + if (params->size > 63) + return -EINVAL; + params->size = 1ULL << params->size; } else { params->size++; -- cgit From bfa4133795e5a0badd402dd3f58b13b3cec64a4b Mon Sep 17 00:00:00 2001 From: Cyrille Pitchen Date: Wed, 6 Sep 2017 23:45:02 +0200 Subject: mtd: spi-nor: fix DMA unsafe buffer issue in spi_nor_read_sfdp() spi_nor_read_sfdp() calls nor->read() to read the SFDP data. When the m25p80 driver is used (pretty common case), nor->read() is then implemented by the m25p80_read() function, which is likely to initialize a 'struct spi_transfer' from its buf argument before appending this structure inside the 'struct spi_message' argument of spi_sync(). Besides the SPI sub-system states that both .tx_buf and .rx_buf members of 'struct spi_transfer' must point into dma-safe memory. However, two of the three calls of spi_nor_read_sfdp() were given pointers to stack allocated memory as buf argument, hence not in a dma-safe area. Hopefully, the third and last call of spi_nor_read_sfdp() was already given a kmalloc'ed buffer argument, hence dma-safe. So this patch fixes this issue by introducing a spi_nor_read_sfdp_dma_unsafe() function which simply wraps the existing spi_nor_read_sfdp() function and uses some kmalloc'ed memory as a bounce buffer. Fixes: f384b352cbf0 ("mtd: spi-nor: parse Serial Flash Discoverable Parameters (SFDP) tables") Reported-by: Geert Uytterhoeven Signed-off-by: Cyrille Pitchen Signed-off-by: Boris Brezillon --- drivers/mtd/spi-nor/spi-nor.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 4425b0283725..19c000722cbc 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1784,7 +1784,7 @@ spi_nor_set_pp_settings(struct spi_nor_pp_command *pp, * @nor: pointer to a 'struct spi_nor' * @addr: offset in the SFDP area to start reading data from * @len: number of bytes to read - * @buf: buffer where the SFDP data are copied into + * @buf: buffer where the SFDP data are copied into (dma-safe memory) * * Whatever the actual numbers of bytes for address and dummy cycles are * for (Fast) Read commands, the Read SFDP (5Ah) instruction is always @@ -1829,6 +1829,36 @@ read_err: return ret; } +/** + * spi_nor_read_sfdp_dma_unsafe() - read Serial Flash Discoverable Parameters. + * @nor: pointer to a 'struct spi_nor' + * @addr: offset in the SFDP area to start reading data from + * @len: number of bytes to read + * @buf: buffer where the SFDP data are copied into + * + * Wrap spi_nor_read_sfdp() using a kmalloc'ed bounce buffer as @buf is now not + * guaranteed to be dma-safe. + * + * Return: -ENOMEM if kmalloc() fails, the return code of spi_nor_read_sfdp() + * otherwise. + */ +static int spi_nor_read_sfdp_dma_unsafe(struct spi_nor *nor, u32 addr, + size_t len, void *buf) +{ + void *dma_safe_buf; + int ret; + + dma_safe_buf = kmalloc(len, GFP_KERNEL); + if (!dma_safe_buf) + return -ENOMEM; + + ret = spi_nor_read_sfdp(nor, addr, len, dma_safe_buf); + memcpy(buf, dma_safe_buf, len); + kfree(dma_safe_buf); + + return ret; +} + struct sfdp_parameter_header { u8 id_lsb; u8 minor; @@ -2101,7 +2131,7 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor, bfpt_header->length * sizeof(u32)); addr = SFDP_PARAM_HEADER_PTP(bfpt_header); memset(&bfpt, 0, sizeof(bfpt)); - err = spi_nor_read_sfdp(nor, addr, len, &bfpt); + err = spi_nor_read_sfdp_dma_unsafe(nor, addr, len, &bfpt); if (err < 0) return err; @@ -2252,7 +2282,7 @@ static int spi_nor_parse_sfdp(struct spi_nor *nor, int i, err; /* Get the SFDP header. */ - err = spi_nor_read_sfdp(nor, 0, sizeof(header), &header); + err = spi_nor_read_sfdp_dma_unsafe(nor, 0, sizeof(header), &header); if (err < 0) return err; -- cgit From 17b694a5476de09e119c517dcc3b71eff6835bdf Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 Sep 2017 13:58:31 +0200 Subject: mtd: nand: lpc32xx_mlc: Fix an error handling path in lpc32xx_nand_probe() If 'clk_prepare_enable()' fails, we must 'put' the corresponding clock. Fixes: 4d26f012ab59 ("mtd: nand: lpc32xx_mlc: Handle return value of clk_prepare_enable.") Signed-off-by: Christophe JAILLET Signed-off-by: Boris Brezillon --- drivers/mtd/nand/lpc32xx_mlc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/lpc32xx_mlc.c b/drivers/mtd/nand/lpc32xx_mlc.c index c3bb358ef01e..5796468db653 100644 --- a/drivers/mtd/nand/lpc32xx_mlc.c +++ b/drivers/mtd/nand/lpc32xx_mlc.c @@ -707,7 +707,7 @@ static int lpc32xx_nand_probe(struct platform_device *pdev) } res = clk_prepare_enable(host->clk); if (res) - goto err_exit1; + goto err_put_clk; nand_chip->cmd_ctrl = lpc32xx_nand_cmd_ctrl; nand_chip->dev_ready = lpc32xx_nand_device_ready; @@ -814,6 +814,7 @@ err_exit3: dma_release_channel(host->dma_chan); err_exit2: clk_disable_unprepare(host->clk); +err_put_clk: clk_put(host->clk); err_exit1: lpc32xx_wp_enable(host); -- cgit From e580b8bc4316cbb8bbffb5ed7bf1e477064755ed Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 18 Sep 2017 09:40:12 +0100 Subject: arm64: efi: Don't include EFI fpsimd save/restore code in non-EFI kernels __efi_fpsimd_begin()/__efi_fpsimd_end() are for use when making EFI calls only, so using them in non-EFI kernels is not allowed. This patch compiles them out if CONFIG_EFI is not set. Acked-by: Ard Biesheuvel Signed-off-by: Dave Martin Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 3a68cf38a6b3..f444f374bd7b 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -321,6 +321,8 @@ void kernel_neon_end(void) } EXPORT_SYMBOL(kernel_neon_end); +#ifdef CONFIG_EFI + static DEFINE_PER_CPU(struct fpsimd_state, efi_fpsimd_state); static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); @@ -370,6 +372,8 @@ void __efi_fpsimd_end(void) kernel_neon_end(); } +#endif /* CONFIG_EFI */ + #endif /* CONFIG_KERNEL_MODE_NEON */ #ifdef CONFIG_CPU_PM -- cgit From c73cc120a33e12e4e254b4b42bc613204ccb923b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 11:20:19 +0100 Subject: arm64: relax assembly code alignment from 16 byte to 4 byte Aarch64 instructions must be word aligned. The current 16 byte alignment is more than enough. Relax it into 4 byte alignment. Signed-off-by: Masahiro Yamada Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/linkage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index 636c1bced7d4..1b266292f0be 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -1,7 +1,7 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H -#define __ALIGN .align 4 -#define __ALIGN_STR ".align 4" +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" #endif -- cgit From 3d6a7b99e3fa29b92d6288487e057e0a596bd2b0 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Mon, 18 Sep 2017 11:20:20 +0100 Subject: arm64: ensure the kernel is compiled for LP64 The kernel needs to be compiled as a LP64 binary for ARM64, even when using a compiler that defaults to code-generation for the ILP32 ABI. Consequently, we need to explicitly pass '-mabi=lp64' (supported on gcc-4.9 and newer). Signed-off-by: Andrew Pinski Signed-off-by: Philipp Tomsich Signed-off-by: Christoph Muellner Signed-off-by: Yury Norov Reviewed-by: David Daney Signed-off-by: Catalin Marinas --- arch/arm64/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 9b41f1e3b1a0..939b310913cf 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -50,17 +50,22 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) +KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) +KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) + ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ AS += -EB LD += -EB +LDFLAGS += -maarch64linuxb UTS_MACHINE := aarch64_be else KBUILD_CPPFLAGS += -mlittle-endian CHECKFLAGS += -D__AARCH64EL__ AS += -EL LD += -EL +LDFLAGS += -maarch64linux UTS_MACHINE := aarch64 endif -- cgit From 0a51fb7174f2b7866b4d7a4a5c23b685b674beb6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 9 Sep 2017 12:19:22 +0300 Subject: quota: add missing lock into __dquot_transfer() Lock dq_dqb_lock around dquot_decr_inodes() Signed-off-by: Konstantin Khlebnikov Fixes: 7b9ca4c61bc2 ("quota: Reduce contention on dq_data_lock") Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8381db9db6d9..50b0556a124f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1980,7 +1980,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0, &warn_to[cnt]); if (ret) { + spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); + spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } -- cgit From 0ab0b271bf75073cb254b5ea0593aceae5a42bd3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Sep 2017 22:32:12 +0200 Subject: isofs: fix build regression The new isofs_show_options() function fails to build when CONFIG_NLS is disabled: fs/isofs/inode.c: In function 'isofs_show_options': fs/isofs/inode.c:518:44: error: 'CONFIG_NLS_DEFAULT' undeclared (first use in this function) fs/isofs/inode.c:518:44: note: each undeclared identifier is reported only once for each function it appears in This adds a check for CONFIG_JOLIET (which selects NLS), matching the other uses of the iocharset handling in this file. Fixes: 6fecb86a44f5 ("isofs: Implement show_options") Signed-off-by: Arnd Bergmann Signed-off-by: Jan Kara --- fs/isofs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index db692f554158..447a24d77b89 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -514,9 +514,11 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_fmode != ISOFS_INVALID_MODE) seq_printf(m, ",fmode=%o", sbi->s_fmode); +#ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset && strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); +#endif return 0; } -- cgit From 74378c5c8cdaf0ce9f65e67cbd0613286f2c3bad Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 5 Sep 2017 20:16:27 +0200 Subject: driver core: Fix link to device power management documentation Correct location as of commit 2728b2d2e5be4b82 (PM / core / docs: Convert sleep states API document to reST). Fixes: 2728b2d2e5be4b82 (PM / core / docs: Convert sleep states API document to reST) Signed-off-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/device.h b/include/linux/device.h index c6f27207dbe8..1d2607923a24 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -838,7 +838,7 @@ struct dev_links_info { * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. - * See Documentation/power/admin-guide/devices.rst for details. + * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. -- cgit From 096a2c610df279d001a8f61faa3eb7f98ca6196b Mon Sep 17 00:00:00 2001 From: Rafael Wysocki Date: Fri, 8 Sep 2017 01:20:54 +0200 Subject: ACPI / PMIC: Add code reviewers to MAINTAINERS Andy and Mika review code changes under drivers/acpi/pmic/ on a regular basis and I rely on their help with that, so add them as code reviwewers for that part of the kernel. Signed-off-by: Rafael J. Wysocki Acked-by: Lee Jones Reviewed-by: Mika Westerberg Reviewed-by: Andy Shevchenko --- MAINTAINERS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..fa4e916b72e9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -352,6 +352,18 @@ L: linux-acpi@vger.kernel.org S: Maintained F: drivers/acpi/arm64 +ACPI PMIC DRIVERS +M: "Rafael J. Wysocki" +M: Len Brown +R: Andy Shevchenko +R: Mika Westerberg +L: linux-acpi@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-acpi/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm +B: https://bugzilla.kernel.org +S: Supported +F: drivers/acpi/pmic/ + ACPI THERMAL DRIVER M: Zhang Rui L: linux-acpi@vger.kernel.org -- cgit From 41ba8bd0829f5c210715ece8b0f699bed0da8eb5 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Tue, 5 Sep 2017 23:14:29 +0200 Subject: PM / QoS: Use the correct variable to check the QoS request type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the actual function argument for the validation of the request type, instead of the type field in a fresh (supposedly zero-initialized) request structure. Signed-off-by: Jan H. Schönherr Signed-off-by: Rafael J. Wysocki --- drivers/base/power/qos.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index f850daeffba4..277d43a83f53 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -277,11 +277,11 @@ void dev_pm_qos_constraints_destroy(struct device *dev) mutex_unlock(&dev_pm_qos_sysfs_mtx); } -static bool dev_pm_qos_invalid_request(struct device *dev, - struct dev_pm_qos_request *req) +static bool dev_pm_qos_invalid_req_type(struct device *dev, + enum dev_pm_qos_req_type type) { - return !req || (req->type == DEV_PM_QOS_LATENCY_TOLERANCE - && !dev->power.set_latency_tolerance); + return type == DEV_PM_QOS_LATENCY_TOLERANCE && + !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, @@ -290,7 +290,7 @@ static int __dev_pm_qos_add_request(struct device *dev, { int ret = 0; - if (!dev || dev_pm_qos_invalid_request(dev, req)) + if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), -- cgit From 73600b619bf8b9803a0610624a0e11f5ed8f761e Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Sat, 2 Sep 2017 10:49:38 +0200 Subject: mtd: nand: remove unused blockmask variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fix the following build warning: drivers/mtd/nand/nand_base.c:2671:30: attention : variable ‘blockmask’ set but not used [-Wunused-but-set-variable] Fixes: 0b4773fd1649 ("mtd: nand: Drop unused cached programming support") Signed-off-by: Corentin Labbe Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index bcc8cef1c615..12edaae17d81 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2668,7 +2668,7 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len, static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { - int chipnr, realpage, page, blockmask, column; + int chipnr, realpage, page, column; struct nand_chip *chip = mtd_to_nand(mtd); uint32_t writelen = ops->len; @@ -2704,7 +2704,6 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, realpage = (int)(to >> chip->page_shift); page = realpage & chip->pagemask; - blockmask = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1; /* Invalidate the page cache, when we write to the cached page */ if (to <= ((loff_t)chip->pagebuf << chip->page_shift) && -- cgit From fe9c1c9555a529bf678148f719c1b9f1730f68a3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 14 Sep 2017 14:38:58 +0200 Subject: xen: don't compile pv-specific parts if XEN_PV isn't configured xenbus_client.c contains some functions specific for pv guests. Enclose them with #ifdef CONFIG_XEN_PV to avoid compiling them when they are not needed (e.g. on ARM). Signed-off-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus_client.c | 130 +++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 63 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 82a8866758ee..a1c17000129b 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -519,64 +519,6 @@ static int __xenbus_map_ring(struct xenbus_device *dev, return err; } -static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, - grant_ref_t *gnt_refs, - unsigned int nr_grefs, - void **vaddr) -{ - struct xenbus_map_node *node; - struct vm_struct *area; - pte_t *ptes[XENBUS_MAX_RING_GRANTS]; - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - int err = GNTST_okay; - int i; - bool leaked; - - *vaddr = NULL; - - if (nr_grefs > XENBUS_MAX_RING_GRANTS) - return -EINVAL; - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - - area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); - if (!area) { - kfree(node); - return -ENOMEM; - } - - for (i = 0; i < nr_grefs; i++) - phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; - - err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, - phys_addrs, - GNTMAP_host_map | GNTMAP_contains_pte, - &leaked); - if (err) - goto failed; - - node->nr_handles = nr_grefs; - node->pv.area = area; - - spin_lock(&xenbus_valloc_lock); - list_add(&node->next, &xenbus_valloc_pages); - spin_unlock(&xenbus_valloc_lock); - - *vaddr = area->addr; - return 0; - -failed: - if (!leaked) - free_vm_area(area); - else - pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); - - kfree(node); - return err; -} - struct map_ring_valloc_hvm { unsigned int idx; @@ -725,6 +667,65 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) } EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); +#ifdef CONFIG_XEN_PV +static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node; + struct vm_struct *area; + pte_t *ptes[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + int err = GNTST_okay; + int i; + bool leaked; + + *vaddr = NULL; + + if (nr_grefs > XENBUS_MAX_RING_GRANTS) + return -EINVAL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); + if (!area) { + kfree(node); + return -ENOMEM; + } + + for (i = 0; i < nr_grefs; i++) + phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; + + err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, + phys_addrs, + GNTMAP_host_map | GNTMAP_contains_pte, + &leaked); + if (err) + goto failed; + + node->nr_handles = nr_grefs; + node->pv.area = area; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = area->addr; + return 0; + +failed: + if (!leaked) + free_vm_area(area); + else + pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); + + kfree(node); + return err; +} + static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; @@ -788,6 +789,12 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) return err; } +static const struct xenbus_ring_ops ring_ops_pv = { + .map = xenbus_map_ring_valloc_pv, + .unmap = xenbus_unmap_ring_vfree_pv, +}; +#endif + struct unmap_ring_vfree_hvm { unsigned int idx; @@ -916,11 +923,6 @@ enum xenbus_state xenbus_read_driver_state(const char *path) } EXPORT_SYMBOL_GPL(xenbus_read_driver_state); -static const struct xenbus_ring_ops ring_ops_pv = { - .map = xenbus_map_ring_valloc_pv, - .unmap = xenbus_unmap_ring_vfree_pv, -}; - static const struct xenbus_ring_ops ring_ops_hvm = { .map = xenbus_map_ring_valloc_hvm, .unmap = xenbus_unmap_ring_vfree_hvm, @@ -928,8 +930,10 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { +#ifdef CONFIG_XEN_PV if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else +#endif ring_ops = &ring_ops_hvm; } -- cgit From b0ade85165b3caeb0cd908cffe5921a39f25c243 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Sep 2017 13:41:41 +0200 Subject: netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If no spinlock debugging options (CONFIG_GENERIC_LOCKBREAK, CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_LOCK_ALLOC) are enabled on a UP platform (e.g. m68k defconfig), arch_spinlock_t is an empty struct, hence using ARRAY_SIZE(nf_nat_locks) causes a division by zero: net/netfilter/nf_nat_core.c: In function ‘nf_nat_setup_info’: net/netfilter/nf_nat_core.c:432: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘__nf_nat_cleanup_conntrack’: net/netfilter/nf_nat_core.c:535: warning: division by zero net/netfilter/nf_nat_core.c:537: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘nf_nat_init’: net/netfilter/nf_nat_core.c:810: warning: division by zero net/netfilter/nf_nat_core.c:811: warning: division by zero net/netfilter/nf_nat_core.c:824: warning: division by zero Fix this by using the CONNTRACK_LOCKS definition instead. Suggested-by: Florian Westphal Fixes: 8073e960a03bf7b5 ("netfilter: nat: use keyed locks") Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f393a7086025..af8345fc4fbd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct, srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); @@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) unsigned int h; h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) @@ -807,8 +807,8 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) - nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + if (nf_nat_htable_size < CONNTRACK_LOCKS) + nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -821,7 +821,7 @@ static int __init nf_nat_init(void) return ret; } - for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); nf_ct_helper_expectfn_register(&follow_master_nat); -- cgit From 7f4f7dd4417d9efd038b14d39c70170db2e0baa0 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 11 Sep 2017 21:52:40 +0200 Subject: netfilter: ipset: ipset list may return wrong member count for set with timeout Simple testcase: $ ipset create test hash:ip timeout 5 $ ipset add test 1.2.3.4 $ ipset add test 1.2.2.2 $ sleep 5 $ ipset l Name: test Type: hash:ip Revision: 5 Header: family inet hashsize 1024 maxelem 65536 timeout 5 Size in memory: 296 References: 0 Number of entries: 2 Members: We return "Number of entries: 2" but no members are listed. That is because mtype_list runs "ip_set_timeout_expired" and does not list the expired entries, but set->elements is never upated (until mtype_gc cleans it up later). Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f236c0bc7b3f..51063d9ed0f7 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1041,12 +1041,24 @@ out: static int mtype_head(struct ip_set *set, struct sk_buff *skb) { - const struct htype *h = set->data; + struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u8 htable_bits; + /* If any members have expired, set->elements will be wrong + * mytype_expire function will update it with the right count. + * we do not hold set->lock here, so grab it first. + * set->elements can still be incorrect in the case of a huge set, + * because elements might time out during the listing. + */ + if (SET_WITH_TIMEOUT(set)) { + spin_lock_bh(&set->lock); + mtype_expire(set, h); + spin_unlock_bh(&set->lock); + } + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t) + set->ext_size; -- cgit From 64cfcaed7b25f69d8b7a091a23961f50c6788a66 Mon Sep 17 00:00:00 2001 From: Daniel Díaz Date: Fri, 7 Jul 2017 10:27:06 -0500 Subject: selftests: net: More graceful finding of `ip'. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ip tool might be provided by another package (such as Busybox), not necessarily implementing the -Version switch. Trying an actual usage (`ip link show') might be a better test that would work with all implementations of `ip'. Signed-off-by: Daniel Díaz Signed-off-by: Shuah Khan --- tools/testing/selftests/net/netdevice.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index 4e00568d70c2..90cb903c3381 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -178,7 +178,7 @@ if [ "$(id -u)" -ne 0 ];then exit 0 fi -ip -Version 2>/dev/null >/dev/null +ip link show 2>/dev/null >/dev/null if [ $? -ne 0 ];then echo "SKIP: Could not run test without the ip tool" exit 0 -- cgit From 96e5335859e30fa2bb2da8befc86292c3c1a73dd Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 11 Sep 2017 12:49:01 +0200 Subject: tools: fix testing/selftests/sigaltstack for s390x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On s390x the compilation of the file sas.c in directory tools/testing/selftests/sigaltstack fails with this error message: root@s35lp76 testing]# make selftests/sigaltstack/sas cc selftests/sigaltstack/sas.c -o selftests/sigaltstack/sas selftests/sigaltstack/sas.c: In function ‘my_usr1’: selftests/sigaltstack/sas.c:42:25: error: invalid register name for ‘sp’ register unsigned long sp asm("sp"); ^~ : recipe for target 'selftests/sigaltstack/sas' failed make: *** [selftests/sigaltstack/sas] Error 1 [root@s35lp76 testing]# On s390x the stack pointer is register r15, the register name "sp" is unknown. Make this line platform dependend and use register r15. With this patch the compilation and test succeeds: [root@s35lp76 testing]# ./selftests/sigaltstack/sas TAP version 13 ok 1 Initial sigaltstack state was SS_DISABLE # [RUN] signal USR1 ok 2 sigaltstack is disabled in sighandler # [RUN] switched to user ctx # [RUN] signal USR2 # [OK] Stack preserved ok 3 sigaltstack is still SS_AUTODISARM after signal Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 1..3 [root@s35lp76 testing]# Signed-off-by: Thomas Richter Signed-off-by: Shuah Khan --- tools/testing/selftests/sigaltstack/sas.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c index 7d406c3973ba..97bb150837df 100644 --- a/tools/testing/selftests/sigaltstack/sas.c +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -39,7 +39,11 @@ void my_usr1(int sig, siginfo_t *si, void *u) stack_t stk; struct stk_data *p; +#if __s390x__ + register unsigned long sp asm("%15"); +#else register unsigned long sp asm("sp"); +#endif if (sp < (unsigned long)sstack || sp >= (unsigned long)sstack + SIGSTKSZ) { -- cgit From 172a8ca880f1c8b01bc4d1e0239bcb293ef65e0b Mon Sep 17 00:00:00 2001 From: Fathi Boudra Date: Thu, 29 Jun 2017 12:39:53 +0300 Subject: selftests: breakpoints: re-order TEST_GEN_PROGS targets breakpoint_test can fail on arm64 with older/unpatched glibc: breakpoint_test_arm64.c: In function 'run_test': breakpoint_test_arm64.c:170:25: error: 'TRAP_HWBKPT' undeclared (first use in this function) due to glibc missing several of the TRAP_* constants in the userspace definitions. Specifically TRAP_BRANCH and TRAP_HWBKPT. See https://sourceware.org/bugzilla/show_bug.cgi?id=21286 It prevents to build step_after_suspend_test afterward, since make won't continue. We still want to be able to build and run the test, independently of breakpoint_test_arm64 build failure. Re-order TEST_GEN_PROGS to be able to build step_after_suspend_test first. Signed-off-by: Fathi Boudra Signed-off-by: Shuah Khan --- tools/testing/selftests/breakpoints/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile index 6b214b7b10fb..247b0a1899d7 100644 --- a/tools/testing/selftests/breakpoints/Makefile +++ b/tools/testing/selftests/breakpoints/Makefile @@ -2,14 +2,14 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +TEST_GEN_PROGS := step_after_suspend_test + ifeq ($(ARCH),x86) -TEST_GEN_PROGS := breakpoint_test +TEST_GEN_PROGS += breakpoint_test endif ifneq (,$(filter $(ARCH),aarch64 arm64)) -TEST_GEN_PROGS := breakpoint_test_arm64 +TEST_GEN_PROGS += breakpoint_test_arm64 endif -TEST_GEN_PROGS += step_after_suspend_test - include ../lib.mk -- cgit From 67b2e30eb7b346db60afe91b0927738fb604d0a4 Mon Sep 17 00:00:00 2001 From: Daniel Díaz Date: Thu, 17 Aug 2017 10:55:30 -0500 Subject: selftests: intel_pstate: build only on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These tests are only for x86, so don't try to build or run them on other architectures. Signed-off-by: Daniel Díaz Signed-off-by: Shuah Khan --- tools/testing/selftests/intel_pstate/Makefile | 2 ++ tools/testing/selftests/intel_pstate/run.sh | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile index 849a90ffe8dd..a97e24edde39 100644 --- a/tools/testing/selftests/intel_pstate/Makefile +++ b/tools/testing/selftests/intel_pstate/Makefile @@ -1,7 +1,9 @@ CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE LDLIBS := $(LDLIBS) -lm +ifeq (,$(filter $(ARCH),x86)) TEST_GEN_FILES := msr aperf +endif TEST_PROGS := run.sh diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh index 7868c106b8b1..1b4b8302dfc2 100755 --- a/tools/testing/selftests/intel_pstate/run.sh +++ b/tools/testing/selftests/intel_pstate/run.sh @@ -29,6 +29,11 @@ EVALUATE_ONLY=0 +if ! uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ | grep -q x86; then + echo "$0 # Skipped: Test can only run on x86 architectures." + exit 0 +fi + max_cpus=$(($(nproc)-1)) # compile programs -- cgit From 6f0003363a13be699ba945cfa4074193e04fbea5 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:18 +0200 Subject: selftests/intel_pstate: No need to compile test progs in the run script Both test programs are being compiled by make, so no need to compile both programs in the runner script. This resolves an error when installing all selftests via make install and run them in a different environemnt. Running tests in intel_pstate ======================================== ./run.sh: line 35: gcc: command not found Problem compiling aperf.c. Signed-off-by: Thomas Meyer Signed-off-by: Shuah Khan --- tools/testing/selftests/intel_pstate/run.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh index 1b4b8302dfc2..d3ab48f91cd6 100755 --- a/tools/testing/selftests/intel_pstate/run.sh +++ b/tools/testing/selftests/intel_pstate/run.sh @@ -36,12 +36,6 @@ fi max_cpus=$(($(nproc)-1)) -# compile programs -gcc aperf.c -Wall -D_GNU_SOURCE -o aperf -lm -[ $? -ne 0 ] && echo "Problem compiling aperf.c." && exit 1 -gcc -o msr msr.c -lm -[ $? -ne 0 ] && echo "Problem compiling msr.c." && exit 1 - function run_test () { file_ext=$1 -- cgit From 56a268cd4a410081d477913d51e044039d3a8834 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 13:19:23 +0200 Subject: selftests/bpf: Make bpf_util work on uniprocessor systems The current implementation fails to work on uniprocessor systems. Fix the parser to also handle the uniprocessor case. Signed-off-by: Thomas Meyer Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Shuah Khan --- tools/testing/selftests/bpf/bpf_util.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 20ecbaa0d85d..6c53a8906eff 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -12,6 +12,7 @@ static inline unsigned int bpf_num_possible_cpus(void) unsigned int start, end, possible_cpus = 0; char buff[128]; FILE *fp; + int n; fp = fopen(fcpu, "r"); if (!fp) { @@ -20,17 +21,17 @@ static inline unsigned int bpf_num_possible_cpus(void) } while (fgets(buff, sizeof(buff), fp)) { - if (sscanf(buff, "%u-%u", &start, &end) == 2) { - possible_cpus = start == 0 ? end + 1 : 0; - break; + n = sscanf(buff, "%u-%u", &start, &end); + if (n == 0) { + printf("Failed to retrieve # possible CPUs!\n"); + exit(1); + } else if (n == 1) { + end = start; } + possible_cpus = start == 0 ? end + 1 : 0; + break; } - fclose(fp); - if (!possible_cpus) { - printf("Failed to retrieve # possible CPUs!\n"); - exit(1); - } return possible_cpus; } -- cgit From 84c06566cfb84e5f6aed9582d4570df8406700c9 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:17 +0200 Subject: selftests/ftrace: multiple_kprobes: Also check for support The multiple_kprobes test case fails to check for KPROBE_EVENT support. Add the check to prevent a false test result. Signed-off-by: Thomas Meyer Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc index 2a1cb9908746..a4fd4c851a5b 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc @@ -1,6 +1,8 @@ #!/bin/sh # description: Register/unregister many kprobe events +[ -f kprobe_events ] || exit_unsupported # this is configurable + # ftrace fentry skip size depends on the machine architecture. # Currently HAVE_KPROBES_ON_FTRACE defined on x86 and powerpc64le case `uname -m` in -- cgit From 63ecc3d9436f8012e49dc846d6cb0a85a3433517 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 13 Sep 2017 19:30:51 -0600 Subject: udpv6: Fix the checksum computation when HW checksum does not apply While trying an ESP transport mode encryption for UDPv6 packets of datagram size 1436 with MTU 1500, checksum error was observed in the secondary fragment. This error occurs due to the UDP payload checksum being missed out when computing the full checksum for these packets in udp6_hwcsum_outgoing(). Fixes: d39d938c8228 ("ipv6: Introduce udpv6_send_skb()") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/ipv6/udp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2ecfb137297..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; -- cgit From 265698d7e6132a2d41471135534f4f36ad15b09c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Sep 2017 22:46:36 +0200 Subject: nl80211: fix null-ptr dereference on invalid mesh configuration If TX rates are specified during mesh join, the channel must also be specified. Check the channel pointer to avoid a null pointer dereference if it isn't. Reported-by: Jouni Malinen Fixes: 8564e38206de ("cfg80211: add checks for beacon rate, extend to mesh") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fbd5593e88cb..690874293cfc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9987,6 +9987,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!setup.chandef.chan) + return -EINVAL; + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, &setup.beacon_rate); if (err) -- cgit From 76cc0d3282d4b933fa144fa41fbc5318e0fdca24 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 12:00:07 +0800 Subject: ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header Now in ip6gre_header before packing the ipv6 header, it skb_push t->hlen which only includes encap_hlen + tun_hlen. It means greh and inner header would be over written by ipv6 stuff and ipv6h might have no chance to set up. Jianlin found this issue when using remote any on ip6_gre, the packets he captured on gre dev are truncated: 22:50:26.210866 Out ethertype IPv6 (0x86dd), length 120: truncated-ip6 -\ 8128 bytes missing!(flowlabel 0x92f40, hlim 0, next-header Options (0) \ payload length: 8192) ::1:2000:0 > ::1:0:86dd: HBH [trunc] ip-proto-128 \ 8184 It should also skb_push ipv6hdr so that ipv6h points to the right position to set ipv6 stuff up. This patch is to skb_push hlen + sizeof(*ipv6h) and also fix some indents in ip6gre_header. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..20f66f4c9460 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -940,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. -- cgit From 7b4dc3c0da0d66e7b20a826c537d41bb73e4df54 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 18 Aug 2017 17:49:58 +0800 Subject: drm/i915/gvt: Fix incorrect PCI BARs reporting Looking at our virtual PCI device, we can see surprising Region 4 and Region 5. 00:10.0 VGA compatible controller: Intel Corporation Sky Lake Integrated Graphics (rev 06) (prog-if 00 [VGA controller]) .... Region 0: Memory at 140000000 (64-bit, non-prefetchable) [size=16M] Region 2: Memory at 180000000 (64-bit, prefetchable) [size=1G] Region 4: Memory at (32-bit, non-prefetchable) Region 5: Memory at (32-bit, non-prefetchable) Expansion ROM at febd6000 [disabled] [size=2K] The fact is that we only implemented BAR0 and BAR2. Surprising Region 4 and Region 5 are shown because we report their size as 0xffffffff. They should report size 0 instead. BTW, the physical GPU has a PIO BAR. GVTg hasn't implemented PIO access, so we ignored this BAR for vGPU device. v2: fix BAR size value calculation. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1458032 Signed-off-by: Changbin Du Cc: stable@vger.kernel.org Signed-off-by: Zhenyu Wang (cherry picked from commit f1751362d6357a90bc6e53176cec715ff2dbed74) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gvt/cfg_space.c | 113 +++++++++++++++-------------------- 1 file changed, 48 insertions(+), 65 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cfg_space.c b/drivers/gpu/drm/i915/gvt/cfg_space.c index 40af17ec6312..ff3154fe6588 100644 --- a/drivers/gpu/drm/i915/gvt/cfg_space.c +++ b/drivers/gpu/drm/i915/gvt/cfg_space.c @@ -197,78 +197,65 @@ static int emulate_pci_command_write(struct intel_vgpu *vgpu, static int emulate_pci_bar_write(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) { - unsigned int bar_index = - (rounddown(offset, 8) % PCI_BASE_ADDRESS_0) / 8; u32 new = *(u32 *)(p_data); bool lo = IS_ALIGNED(offset, 8); u64 size; int ret = 0; bool mmio_enabled = vgpu_cfg_space(vgpu)[PCI_COMMAND] & PCI_COMMAND_MEMORY; + struct intel_vgpu_pci_bar *bars = vgpu->cfg_space.bar; - if (WARN_ON(bar_index >= INTEL_GVT_PCI_BAR_MAX)) - return -EINVAL; - + /* + * Power-up software can determine how much address + * space the device requires by writing a value of + * all 1's to the register and then reading the value + * back. The device will return 0's in all don't-care + * address bits. + */ if (new == 0xffffffff) { - /* - * Power-up software can determine how much address - * space the device requires by writing a value of - * all 1's to the register and then reading the value - * back. The device will return 0's in all don't-care - * address bits. - */ - size = vgpu->cfg_space.bar[bar_index].size; - if (lo) { - new = rounddown(new, size); - } else { - u32 val = vgpu_cfg_space(vgpu)[rounddown(offset, 8)]; - /* for 32bit mode bar it returns all-0 in upper 32 - * bit, for 64bit mode bar it will calculate the - * size with lower 32bit and return the corresponding - * value + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + size = ~(bars[INTEL_GVT_PCI_BAR_GTTMMIO].size -1); + intel_vgpu_write_pci_bar(vgpu, offset, + size >> (lo ? 0 : 32), lo); + /* + * Untrap the BAR, since guest hasn't configured a + * valid GPA */ - if (val & PCI_BASE_ADDRESS_MEM_TYPE_64) - new &= (~(size-1)) >> 32; - else - new = 0; - } - /* - * Unmapp & untrap the BAR, since guest hasn't configured a - * valid GPA - */ - switch (bar_index) { - case INTEL_GVT_PCI_BAR_GTTMMIO: ret = trap_gttmmio(vgpu, false); break; - case INTEL_GVT_PCI_BAR_APERTURE: + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + size = ~(bars[INTEL_GVT_PCI_BAR_APERTURE].size -1); + intel_vgpu_write_pci_bar(vgpu, offset, + size >> (lo ? 0 : 32), lo); ret = map_aperture(vgpu, false); break; + default: + /* Unimplemented BARs */ + intel_vgpu_write_pci_bar(vgpu, offset, 0x0, false); } - intel_vgpu_write_pci_bar(vgpu, offset, new, lo); } else { - /* - * Unmapp & untrap the old BAR first, since guest has - * re-configured the BAR - */ - switch (bar_index) { - case INTEL_GVT_PCI_BAR_GTTMMIO: - ret = trap_gttmmio(vgpu, false); + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + /* + * Untrap the old BAR first, since guest has + * re-configured the BAR + */ + trap_gttmmio(vgpu, false); + intel_vgpu_write_pci_bar(vgpu, offset, new, lo); + ret = trap_gttmmio(vgpu, mmio_enabled); break; - case INTEL_GVT_PCI_BAR_APERTURE: - ret = map_aperture(vgpu, false); + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + map_aperture(vgpu, false); + intel_vgpu_write_pci_bar(vgpu, offset, new, lo); + ret = map_aperture(vgpu, mmio_enabled); break; - } - intel_vgpu_write_pci_bar(vgpu, offset, new, lo); - /* Track the new BAR */ - if (mmio_enabled) { - switch (bar_index) { - case INTEL_GVT_PCI_BAR_GTTMMIO: - ret = trap_gttmmio(vgpu, true); - break; - case INTEL_GVT_PCI_BAR_APERTURE: - ret = map_aperture(vgpu, true); - break; - } + default: + intel_vgpu_write_pci_bar(vgpu, offset, new, lo); } } return ret; @@ -299,10 +286,7 @@ int intel_vgpu_emulate_cfg_write(struct intel_vgpu *vgpu, unsigned int offset, } switch (rounddown(offset, 4)) { - case PCI_BASE_ADDRESS_0: - case PCI_BASE_ADDRESS_1: - case PCI_BASE_ADDRESS_2: - case PCI_BASE_ADDRESS_3: + case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5: if (WARN_ON(!IS_ALIGNED(offset, 4))) return -EINVAL; return emulate_pci_bar_write(vgpu, offset, p_data, bytes); @@ -344,7 +328,6 @@ void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu, struct intel_gvt *gvt = vgpu->gvt; const struct intel_gvt_device_info *info = &gvt->device_info; u16 *gmch_ctl; - int i; memcpy(vgpu_cfg_space(vgpu), gvt->firmware.cfg_space, info->cfg_space_size); @@ -371,13 +354,13 @@ void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu, */ memset(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_1, 0, 4); memset(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_3, 0, 4); + memset(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_4, 0, 8); memset(vgpu_cfg_space(vgpu) + INTEL_GVT_PCI_OPREGION, 0, 4); - for (i = 0; i < INTEL_GVT_MAX_BAR_NUM; i++) { - vgpu->cfg_space.bar[i].size = pci_resource_len( - gvt->dev_priv->drm.pdev, i * 2); - vgpu->cfg_space.bar[i].tracked = false; - } + vgpu->cfg_space.bar[INTEL_GVT_PCI_BAR_GTTMMIO].size = + pci_resource_len(gvt->dev_priv->drm.pdev, 0); + vgpu->cfg_space.bar[INTEL_GVT_PCI_BAR_APERTURE].size = + pci_resource_len(gvt->dev_priv->drm.pdev, 2); } /** -- cgit From 814feed316e9d15b0ae869ca729370e7630b8d13 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Sep 2017 10:56:42 +0200 Subject: drm/i915: Fix an error handling in 'intel_framebuffer_init()' We should go through the error handling path to decrease the 'framebuffer_references' as done everywhere else in this function. Fixes: 2e2adb05736c ("drm/i915: Add render decompression support") Signed-off-by: Christophe JAILLET Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170910085642.13673-1-christophe.jaillet@wanadoo.fr (cherry picked from commit 37875d6b3af702425ce292de81b77faf94766616) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_display.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index f17275519484..00cd17c76fdc 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14030,7 +14030,7 @@ static int intel_framebuffer_init(struct intel_framebuffer *intel_fb, if (mode_cmd->handles[i] != mode_cmd->handles[0]) { DRM_DEBUG_KMS("bad plane %d handle\n", i); - return -EINVAL; + goto err; } stride_alignment = intel_fb_stride_alignment(fb, i); -- cgit From ac73661c6222faef1a97ec44f424234f165e4710 Mon Sep 17 00:00:00 2001 From: "Lee, Shawn C" Date: Tue, 12 Sep 2017 11:36:30 +0800 Subject: drm/i915/bxt: set min brightness from VBT Min brightness value from vbt was missing for BXT platform. This setting have to refer backlight ic spec to restrict min backlight output. Without this restriction, driver would allow to configure lower brightness value and violate backlight ic requirement. Fixes: 0fb890c01349 ("drm/i915/bxt: BLC implementation") Cc: Jani Nikula Cc: Cooper Chiou Cc: Gary C Wang Signed-off-by: Shawn Lee Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1505187390-7039-1-git-send-email-shawn.c.lee@intel.com (cherry picked from commit c3881128cb672cf484a52fbb36b5daa9044d9168) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_panel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index a17b1de7d7e0..d4dd248ac9a8 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -1699,6 +1699,8 @@ bxt_setup_backlight(struct intel_connector *connector, enum pipe unused) if (!panel->backlight.max) return -ENODEV; + panel->backlight.min = get_backlight_min_vbt(connector); + val = bxt_get_backlight(connector); val = intel_panel_compute_brightness(connector, val); panel->backlight.level = clamp(val, panel->backlight.min, -- cgit From abeae421b03d800d33894df7fbca6d00c70c358e Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Tue, 5 Sep 2017 15:14:31 +0530 Subject: Revert "drm/i915/bxt: Disable device ready before shutdown command" This reverts commit bbdf0b2ff32a ("drm/i915/bxt: Disable device ready before shutdown command"). Disable device ready before shutdown command was added previously to avoid a split screen issue seen on dual link DSI panels. As of now, dual link is not supported and will need some rework in the upstream code. For single link DSI panels, the change is not required. This will cause failure in sending SHUTDOWN packet during disable. Hence reverting the change. Will handle the change as part of dual link enabling in upstream. Fixes: bbdf0b2ff32a ("drm/i915/bxt: Disable device ready before shutdown command") Cc: # v4.12+ Signed-off-by: Uma Shankar Signed-off-by: Vidya Srinivas Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1504604671-17237-1-git-send-email-vidya.srinivas@intel.com (cherry picked from commit 33c8d8870c67faf3161898a56af98ac3c1c71450) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_dsi.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dsi.c b/drivers/gpu/drm/i915/intel_dsi.c index f0c11aec5ea5..7442891762be 100644 --- a/drivers/gpu/drm/i915/intel_dsi.c +++ b/drivers/gpu/drm/i915/intel_dsi.c @@ -892,8 +892,6 @@ static void intel_dsi_disable(struct intel_encoder *encoder, struct intel_crtc_state *old_crtc_state, struct drm_connector_state *old_conn_state) { - struct drm_device *dev = encoder->base.dev; - struct drm_i915_private *dev_priv = dev->dev_private; struct intel_dsi *intel_dsi = enc_to_intel_dsi(&encoder->base); enum port port; @@ -902,15 +900,6 @@ static void intel_dsi_disable(struct intel_encoder *encoder, intel_dsi_vbt_exec_sequence(intel_dsi, MIPI_SEQ_BACKLIGHT_OFF); intel_panel_disable_backlight(old_conn_state); - /* - * Disable Device ready before the port shutdown in order - * to avoid split screen - */ - if (IS_BROXTON(dev_priv)) { - for_each_dsi_port(port, intel_dsi->ports) - I915_WRITE(MIPI_DEVICE_READY(port), 0); - } - /* * According to the spec we should send SHUTDOWN before * MIPI_SEQ_DISPLAY_OFF only for v3+ VBTs, but field testing -- cgit From 8c7a758873577f0d1bb3f879584338f73e6c6bc3 Mon Sep 17 00:00:00 2001 From: "Lee, Shawn C" Date: Wed, 13 Sep 2017 13:19:20 +0800 Subject: drm/i915/cnp: set min brightness from VBT Min brightness value from vbt was missing for CNP platform. This setting have to refer backlight ic spec to restrict min backlight output. Without this restriction, driver would allow to configure lower brightness value and violate backlight ic requirement. Fixes: 4c9f7086ac6d ("drm/i915/cnp: Backlight support for CNP.") Cc: Jani Nikula Signed-off-by: Shawn Lee Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1505279961-16140-1-git-send-email-shawn.c.lee@intel.com (cherry picked from commit f44e354f857f207cd361269c5e38e1f96e0b616c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_panel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index d4dd248ac9a8..3b1c5d783ee7 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -1737,6 +1737,8 @@ cnp_setup_backlight(struct intel_connector *connector, enum pipe unused) if (!panel->backlight.max) return -ENODEV; + panel->backlight.min = get_backlight_min_vbt(connector); + val = bxt_get_backlight(connector); val = intel_panel_compute_brightness(connector, val); panel->backlight.level = clamp(val, panel->backlight.min, -- cgit From 99df13b6ea811a63eeacb278d05a5b914ce28073 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Sep 2017 17:42:13 +0100 Subject: drm/i915: Remove unused 'in_vbl' from i915_get_crtc_scanoutpos() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1bf6ad622b9b ("drm/vblank: drop the mode argument from drm_calc_vbltimestamp_from_scanoutpos") removed the use of in_vbl, but did not remove the local variable. Do so now. Fixes: 1bf6ad622b9b ("drm/vblank: drop the mode argument from drm_calc_vbltimestamp_from_scanoutpos") Signed-off-by: Chris Wilson Cc: Ville Syrjälä Cc: Daniel Vetter Cc: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170914164213.18461-1-chris@chris-wilson.co.uk Reviewed-by: Ville Syrjälä (cherry picked from commit e01e71fc49d4c95090a04f898a3fe788c652a04b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_irq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index e21ce9c18b6e..b63893eeca73 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -839,7 +839,6 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, pipe); int position; int vbl_start, vbl_end, hsync_start, htotal, vtotal; - bool in_vbl = true; unsigned long irqflags; if (WARN_ON(!mode->crtc_clock)) { @@ -922,8 +921,6 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); - in_vbl = position >= vbl_start && position < vbl_end; - /* * While in vblank, position will be negative * counting up towards 0 at vbl_end. And outside -- cgit From f2654a4781318dc7ab8d6cde66f1fa39eab980a9 Mon Sep 17 00:00:00 2001 From: Fahad Kunnathadi Date: Fri, 15 Sep 2017 12:01:58 +0530 Subject: net: phy: Fix mask value write on gmii2rgmii converter speed register To clear Speed Selection in MDIO control register(0x10), ie, clear bits 6 and 13 to zero while keeping other bits same. Before AND operation,The Mask value has to be perform with bitwise NOT operation (ie, ~ operator) This patch clears current speed selection before writing the new speed settings to gmii2rgmii converter Fixes: f411a6160bd4 ("net: phy: Add gmiitorgmii converter support") Signed-off-by: Fahad Kunnathadi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/xilinx_gmii2rgmii.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c index d15dd3938ba8..2e5150b0b8d5 100644 --- a/drivers/net/phy/xilinx_gmii2rgmii.c +++ b/drivers/net/phy/xilinx_gmii2rgmii.c @@ -44,7 +44,7 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev) priv->phy_drv->read_status(phydev); val = mdiobus_read(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG); - val &= XILINX_GMII2RGMII_SPEED_MASK; + val &= ~XILINX_GMII2RGMII_SPEED_MASK; if (phydev->speed == SPEED_1000) val |= BMCR_SPEED1000; -- cgit From 8c22dab03ad072e45060c299c70d02a4f6fc4aab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 15:58:33 +0800 Subject: ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline If ipv6 has been disabled from cmdline since kernel started, it makes no sense to allow users to create any ip6 tunnel. Otherwise, it could some potential problem. Jianlin found a kernel crash caused by this in ip6_gre when he set ipv6.disable=1 in grub: [ 209.588865] Unable to handle kernel paging request for data at address 0x00000080 [ 209.588872] Faulting instruction address: 0xc000000000a3aa6c [ 209.588879] Oops: Kernel access of bad area, sig: 11 [#1] [ 209.589062] NIP [c000000000a3aa6c] fib_rules_lookup+0x4c/0x260 [ 209.589071] LR [c000000000b9ad90] fib6_rule_lookup+0x50/0xb0 [ 209.589076] Call Trace: [ 209.589097] fib6_rule_lookup+0x50/0xb0 [ 209.589106] rt6_lookup+0xc4/0x110 [ 209.589116] ip6gre_tnl_link_config+0x214/0x2f0 [ip6_gre] [ 209.589125] ip6gre_newlink+0x138/0x3a0 [ip6_gre] [ 209.589134] rtnl_newlink+0x798/0xb80 [ 209.589142] rtnetlink_rcv_msg+0xec/0x390 [ 209.589151] netlink_rcv_skb+0x138/0x150 [ 209.589159] rtnetlink_rcv+0x48/0x70 [ 209.589169] netlink_unicast+0x538/0x640 [ 209.589175] netlink_sendmsg+0x40c/0x480 [ 209.589184] ___sys_sendmsg+0x384/0x4e0 [ 209.589194] SyS_sendmsg+0xd4/0x140 [ 209.589201] SyS_socketcall+0x3e0/0x4f0 [ 209.589209] system_call+0x38/0xe0 This patch is to return -EOPNOTSUPP in ip6_tunnel_init if ipv6 has been disabled from cmdline. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..f2f21c24915f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2259,6 +2259,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; -- cgit From 3ff4cbec87da48b0ec1f7b6196607b034de0c680 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 16 Sep 2017 14:02:21 +0200 Subject: net/sched: cls_matchall: fix crash when used with classful qdisc this script, edited from Linux Advanced Routing and Traffic Control guide tc q a dev en0 root handle 1: htb default a tc c a dev en0 parent 1: classid 1:1 htb rate 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:a htb rate 5mbit ceil 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:b htb rate 1mbit ceil 6mbit burst 15k tc f a dev en0 parent 1:0 prio 1 $clsname $clsargs classid 1:b ping $address -c1 tc -s c s dev en0 classifies traffic to 1:b or 1:a, depending on whether the packet matches or not the pattern $clsargs of filter $clsname. However, when $clsname is 'matchall', a systematic crash can be observed in htb_classify(). HTB and classful qdiscs don't assign initial value to struct tcf_result, but then they expect it to contain valid values after filters have been run. Thus, current 'matchall' ignores the TCA_MATCHALL_CLASSID attribute, configured by user, and makes HTB (and classful qdiscs) dereference random pointers. By assigning head->res to *res in mall_classify(), before the actions are invoked, we fix this crash and enable TCA_MATCHALL_CLASSID functionality, that had no effect on 'matchall' classifier since its first introduction. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1460213 Reported-by: Jiri Benc Fixes: b87f7936a932 ("net/sched: introduce Match-all classifier") Signed-off-by: Davide Caratti Acked-by: Yotam Gigi Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 21cc45caf842..eeac606c95ab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } -- cgit From 51513748ddfe7a5d2812158a1e0570499b0c511c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 16 Sep 2017 13:10:06 -0700 Subject: Documentation: networking: fix ASCII art in switchdev.txt Fix ASCII art in Documentation/networking/switchdev.txt: Change non-ASCII "spaces" to ASCII spaces. Change 2 erroneous '+' characters in ASCII art to '-' (at the '*' characters below): line 32: +--+----+----+----+-*--+----+---+ +-----+-----+ line 41: +--------------+---*------------+ Signed-off-by: Randy Dunlap Acked-by: Pavel Machek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 68 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 5e40e1f68873..82236a17b5e6 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -13,42 +13,42 @@ an example setup using a data-center-class switch ASIC chip. Other setups with SR-IOV or soft switches, such as OVS, are possible. -                             User-space tools - -       user space                   | -      +-------------------------------------------------------------------+ -       kernel                       | Netlink -                                    | -                     +--------------+-------------------------------+ -                     |         Network stack                        | -                     |           (Linux)                            | -                     |                                              | -                     +----------------------------------------------+ + User-space tools + + user space | + +-------------------------------------------------------------------+ + kernel | Netlink + | + +--------------+-------------------------------+ + | Network stack | + | (Linux) | + | | + +----------------------------------------------+ sw1p2 sw1p4 sw1p6 -                      sw1p1  + sw1p3 +  sw1p5 +         eth1 -                        +    |    +    |    +    |            + -                        |    |    |    |    |    |            | -                     +--+----+----+----+-+--+----+---+  +-----+-----+ -                     |         Switch driver         |  |    mgmt   | -                     |        (this document)        |  |   driver  | -                     |                               |  |           | -                     +--------------+----------------+  +-----------+ -                                    | -       kernel                       | HW bus (eg PCI) -      +-------------------------------------------------------------------+ -       hardware                     | -                     +--------------+---+------------+ -                     |         Switch device (sw1)   | -                     |  +----+                       +--------+ -                     |  |    v offloaded data path   | mgmt port -                     |  |    |                       | -                     +--|----|----+----+----+----+---+ -                        |    |    |    |    |    | -                        +    +    +    +    +    + -                       p1   p2   p3   p4   p5   p6 - -                             front-panel ports + sw1p1 + sw1p3 + sw1p5 + eth1 + + | + | + | + + | | | | | | | + +--+----+----+----+----+----+---+ +-----+-----+ + | Switch driver | | mgmt | + | (this document) | | driver | + | | | | + +--------------+----------------+ +-----------+ + | + kernel | HW bus (eg PCI) + +-------------------------------------------------------------------+ + hardware | + +--------------+----------------+ + | Switch device (sw1) | + | +----+ +--------+ + | | v offloaded data path | mgmt port + | | | | + +--|----|----+----+----+----+---+ + | | | | | | + + + + + + + + p1 p2 p3 p4 p5 p6 + + front-panel ports Fig 1. -- cgit From 6ce14f6416c84bd9c81777edf899b57ac5000c87 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Sep 2017 01:49:02 +0200 Subject: ACPI / watchdog: properly initialize resources We copy a local resource structure into a list, but only initialize some of its members, as pointed out by gcc-4.4: drivers/acpi/acpi_watchdog.c: In function 'acpi_watchdog_init': drivers/acpi/acpi_watchdog.c:105: error: 'res.child' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.sibling' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.parent' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.desc' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.name' may be used uninitialized in this function Newer compilers can presumably optimize the uninitialized access away entirely and don't warn at all, but rely on the kzalloc() to zero the structure first. This adds an explicit initialization to force consistent behavior. Fixes: 058dfc767008 (ACPI / watchdog: Add support for WDAT hardware watchdog) Signed-off-by: Arnd Bergmann Acked-by: Guenter Roeck Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_watchdog.c b/drivers/acpi/acpi_watchdog.c index bf22c29d2517..11b113f8e367 100644 --- a/drivers/acpi/acpi_watchdog.c +++ b/drivers/acpi/acpi_watchdog.c @@ -66,7 +66,7 @@ void __init acpi_watchdog_init(void) for (i = 0; i < wdat->entries; i++) { const struct acpi_generic_address *gas; struct resource_entry *rentry; - struct resource res; + struct resource res = {}; bool found; gas = &entries[i].register_region; -- cgit From 1e3c5ec66119783440ed211ae527674651affa9b Mon Sep 17 00:00:00 2001 From: Sathya Perla Date: Mon, 18 Sep 2017 17:05:37 +0530 Subject: bnxt_en: check for ingress qdisc in flower offload Check for ingress-only qdisc for flower offload, as other qdiscs are not supported for flower offload. Suggested-by: Jiri Pirko Signed-off-by: Sathya Perla Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index ccd699fb2d70..7dd3d131043a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -750,6 +750,10 @@ int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid, { int rc = 0; + if (!is_classid_clsact_ingress(cls_flower->common.classid) || + cls_flower->common.chain_index) + return -EOPNOTSUPP; + switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: rc = bnxt_tc_add_flow(bp, src_fid, cls_flower); -- cgit From 582db7e0c4c2fc5bb4f932f268035883385e3692 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 18 Sep 2017 15:03:46 +0200 Subject: bpf: devmap: pass on return value of bpf_map_precharge_memlock If bpf_map_precharge_memlock in dev_map_alloc, -ENOMEM is returned regardless of the actual error produced by bpf_map_precharge_memlock. Fix it by passing on the error returned by bpf_map_precharge_memlock. Also return -EINVAL instead of -ENOMEM if the page count overflow check fails. This makes dev_map_alloc match the behavior of other bpf maps' alloc functions wrt. return values. Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 959c9a07f318..e093d9a2c4dd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err = -EINVAL; u64 cost; - int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; + /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), __alignof__(unsigned long)); @@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void dev_map_free(struct bpf_map *map) -- cgit From 5e75fe3927559505682b0053b5745e3f42d66e8c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 18 Sep 2017 17:19:10 -0700 Subject: tools/testing/nvdimm: disable labels for nfit_test.1 Improve coverage of NVDIMM-N test scenarios by providing a test bus incapable of label operations. Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index d20791c3f499..bef419d4266d 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1527,9 +1527,6 @@ static void nfit_test1_setup(struct nfit_test *t) set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en); - set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en); - set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en); - set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en); } static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, -- cgit From 4c7124413aa759b8ea0b90cd39177e525396e662 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 Sep 2017 11:05:16 -0700 Subject: tcp: remove two unused functions remove tcp_may_send_now and tcp_snd_test that are no longer used Fixes: 840a3cbe8969 ("tcp: remove forward retransmit feature") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..3bc910a9bfc6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -544,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); -bool tcp_may_send_now(struct sock *sk); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..517d737059d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1806,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions -- cgit From 33a56086712561b8b9cdc881e0317f4c36861f72 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 18 Sep 2017 14:48:58 -0700 Subject: libnvdimm, namespace: fix btt claim class crash Maurice reports: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: holder_class_store+0x253/0x2b0 [libnvdimm] ...while trying to reconfigure an NVDIMM-N namespace into 'sector' / 'btt' mode. The crash points to this line: (gdb) li *(holder_class_store+0x253) 0x7773 is in holder_class_store (drivers/nvdimm/namespace_devs.c:1420). 1415 for (i = 0; i < nd_region->ndr_mappings; i++) { 1416 struct nd_mapping *nd_mapping = &nd_region->mapping[i]; 1417 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); 1418 struct nd_namespace_index *nsindex; 1419 1420 nsindex = to_namespace_index(ndd, ndd->ns_current); ...where we are failing because ndd is NULL due to NVDIMM-N dimms not supporting labels. Long story short, default to the BTTv1 format in the label-less / NVDIMM-N case. Fixes: 14e494542636 ("libnvdimm, btt: BTT updates for UEFI 2.7 format") Cc: Cc: Vishal Verma Reported-by: Maurice A. Saldivar Tested-by: Maurice A. Saldivar Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 1427a386a033..3e4d1e7998da 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1417,6 +1417,15 @@ static int btt_claim_class(struct device *dev) struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_index *nsindex; + /* + * If any of the DIMMs do not support labels the only + * possible BTT format is v1. + */ + if (!ndd) { + loop_bitmask = 0; + break; + } + nsindex = to_namespace_index(ndd, ndd->ns_current); if (nsindex == NULL) loop_bitmask |= 1; -- cgit From 54640d238760a1a54dfebe039b49682522100186 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 18 Sep 2017 22:51:14 -0500 Subject: fcntl: Don't set si_code to SI_SIGIO when sig == SIGPOLL When fixing things to avoid ambiguous cases I had a thinko and included SIGPOLL/SIGIO in with all of the other signals that have signal specific si_codes. Which is completely wrong. Fix that. Reported-by: Vince Weaver Signed-off-by: "Eric W. Biederman" --- fs/fcntl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 0491da3b28c3..448a1119f0be 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -749,7 +749,7 @@ static void send_sigio_to_task(struct task_struct *p, * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. */ - if (sig_specific_sicodes(signum)) + if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* -- cgit From 129c6cda2de2a8ac44fab096152469999b727faf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Sep 2017 13:03:43 -0700 Subject: 8139too: revisit napi_complete_done() usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems we have to be more careful in napi_complete_done() use. This patch is not a revert, as it seems we can avoid bug that Ville reported by moving the napi_complete_done() test in the spinlock section. Many thanks to Ville for detective work and all tests. Fixes: 617f01211baf ("8139too: use napi_complete_done()") Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139too.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index ca22f2898664..d24b47b8e0b2 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -2135,11 +2135,12 @@ static int rtl8139_poll(struct napi_struct *napi, int budget) if (likely(RTL_R16(IntrStatus) & RxAckBits)) work_done += rtl8139_rx(dev, tp, budget); - if (work_done < budget && napi_complete_done(napi, work_done)) { + if (work_done < budget) { unsigned long flags; spin_lock_irqsave(&tp->lock, flags); - RTL_W16_F(IntrMask, rtl8139_intr_mask); + if (napi_complete_done(napi, work_done)) + RTL_W16_F(IntrMask, rtl8139_intr_mask); spin_unlock_irqrestore(&tp->lock, flags); } spin_unlock(&tp->rx_lock); -- cgit From 8ecb1a29e11e24c458ee4ee59447d0ddf8274589 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 18 Sep 2017 16:31:30 -0700 Subject: net: systemport: Fix 64-bit statistics dependency There are several problems with commit 10377ba7673d ("net: systemport: Support 64bit statistics", first one got fixed in 7095c973453e ("net: systemport: Fix 64-bit stats deadlock"). The second problem is that this specific code updates the stats64.tx_{packets,bytes} from ndo_get_stats64() and that is what we are returning to ethtool -S. If we are not running a tool that involves calling ndo_get_stats64(), then we won't get updated ethtool stats. The solution to this is to update the stats from both call sites, factoring that into a specific function, While at it, don't just check the sizeof() but also the type of the statistics in order to use the 64-bit stats seqlock. Fixes: 10377ba7673d ("net: systemport: Support 64bit statistics") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 52 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index c3c53f6cd9e6..83eec9a8c275 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -432,6 +432,27 @@ static void bcm_sysport_update_mib_counters(struct bcm_sysport_priv *priv) netif_dbg(priv, hw, priv->netdev, "updated MIB counters\n"); } +static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, + u64 *tx_bytes, u64 *tx_packets) +{ + struct bcm_sysport_tx_ring *ring; + u64 bytes = 0, packets = 0; + unsigned int start; + unsigned int q; + + for (q = 0; q < priv->netdev->num_tx_queues; q++) { + ring = &priv->tx_rings[q]; + do { + start = u64_stats_fetch_begin_irq(&priv->syncp); + bytes = ring->bytes; + packets = ring->packets; + } while (u64_stats_fetch_retry_irq(&priv->syncp, start)); + + *tx_bytes += bytes; + *tx_packets += packets; + } +} + static void bcm_sysport_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { @@ -439,11 +460,16 @@ static void bcm_sysport_get_stats(struct net_device *dev, struct bcm_sysport_stats64 *stats64 = &priv->stats64; struct u64_stats_sync *syncp = &priv->syncp; struct bcm_sysport_tx_ring *ring; + u64 tx_bytes = 0, tx_packets = 0; unsigned int start; int i, j; - if (netif_running(dev)) + if (netif_running(dev)) { bcm_sysport_update_mib_counters(priv); + bcm_sysport_update_tx_stats(priv, &tx_bytes, &tx_packets); + stats64->tx_bytes = tx_bytes; + stats64->tx_packets = tx_packets; + } for (i = 0, j = 0; i < BCM_SYSPORT_STATS_LEN; i++) { const struct bcm_sysport_stats *s; @@ -461,12 +487,13 @@ static void bcm_sysport_get_stats(struct net_device *dev, continue; p += s->stat_offset; - if (s->stat_sizeof == sizeof(u64)) + if (s->stat_sizeof == sizeof(u64) && + s->type == BCM_SYSPORT_STAT_NETDEV64) { do { start = u64_stats_fetch_begin_irq(syncp); data[i] = *(u64 *)p; } while (u64_stats_fetch_retry_irq(syncp, start)); - else + } else data[i] = *(u32 *)p; j++; } @@ -1716,27 +1743,12 @@ static void bcm_sysport_get_stats64(struct net_device *dev, { struct bcm_sysport_priv *priv = netdev_priv(dev); struct bcm_sysport_stats64 *stats64 = &priv->stats64; - struct bcm_sysport_tx_ring *ring; - u64 tx_packets = 0, tx_bytes = 0; unsigned int start; - unsigned int q; netdev_stats_to_stats64(stats, &dev->stats); - for (q = 0; q < dev->num_tx_queues; q++) { - ring = &priv->tx_rings[q]; - do { - start = u64_stats_fetch_begin_irq(&priv->syncp); - tx_bytes = ring->bytes; - tx_packets = ring->packets; - } while (u64_stats_fetch_retry_irq(&priv->syncp, start)); - - stats->tx_bytes += tx_bytes; - stats->tx_packets += tx_packets; - } - - stats64->tx_bytes = stats->tx_bytes; - stats64->tx_packets = stats->tx_packets; + bcm_sysport_update_tx_stats(priv, &stats->tx_bytes, + &stats->tx_packets); do { start = u64_stats_fetch_begin_irq(&priv->syncp); -- cgit From c8b850241559d6bad7e640075e282a83a4ad7ead Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 18 Sep 2017 12:25:22 +0200 Subject: s390/scm_blk: consistently use blk_status_t as error type Fix these warnings found by sparse: drivers/s390/block/scm_blk.c:257:24: warning: incorrect type in assignment (different base types) drivers/s390/block/scm_blk.c:257:24: expected int [signed] drivers/s390/block/scm_blk.c:257:24: got restricted blk_status_t [usertype] error drivers/s390/block/scm_blk.c:420:33: warning: incorrect type in argument 2 (different base types) drivers/s390/block/scm_blk.c:420:33: expected restricted blk_status_t [usertype] error drivers/s390/block/scm_blk.c:420:33: got int [signed] Signed-off-by: Sebastian Ott Reported-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 2e7fd966c515..eb51893c74a4 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -249,7 +249,7 @@ static void scm_request_requeue(struct scm_request *scmrq) static void scm_request_finish(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; - int *error; + blk_status_t *error; int i; for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) { @@ -415,7 +415,7 @@ void scm_blk_irq(struct scm_device *scmdev, void *data, blk_status_t error) static void scm_blk_request_done(struct request *req) { - int *error = blk_mq_rq_to_pdu(req); + blk_status_t *error = blk_mq_rq_to_pdu(req); blk_mq_end_request(req, *error); } @@ -450,7 +450,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) atomic_set(&bdev->queued_reqs, 0); bdev->tag_set.ops = &scm_mq_ops; - bdev->tag_set.cmd_size = sizeof(int); + bdev->tag_set.cmd_size = sizeof(blk_status_t); bdev->tag_set.nr_hw_queues = nr_requests; bdev->tag_set.queue_depth = nr_requests_per_io * nr_requests; bdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; -- cgit From 55fb7347579017bdb09550b4e8019447340b7004 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 14 Sep 2017 13:55:22 +0200 Subject: s390/cio: recover from bad paths In some situations we don't receive notification from firmware that a previously unusable channelpath is usable again. Schedule recovery for devices that return from path verification without using all potentially usable paths. The recovery thread will periodically trigger a path verification on the affected devices. Signed-off-by: Sebastian Ott Suggested-by: Peter Oberparleiter Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 12 +++++++++--- drivers/s390/cio/device.h | 1 + drivers/s390/cio/device_fsm.c | 12 ++++++++++++ drivers/s390/cio/io_sch.h | 2 ++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 489b583f263d..e5c32f4b5287 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1225,10 +1225,16 @@ static int device_is_disconnected(struct ccw_device *cdev) static int recovery_check(struct device *dev, void *data) { struct ccw_device *cdev = to_ccwdev(dev); + struct subchannel *sch; int *redo = data; spin_lock_irq(cdev->ccwlock); switch (cdev->private->state) { + case DEV_STATE_ONLINE: + sch = to_subchannel(cdev->dev.parent); + if ((sch->schib.pmcw.pam & sch->opm) == sch->vpm) + break; + /* fall through */ case DEV_STATE_DISCONNECTED: CIO_MSG_EVENT(3, "recovery: trigger 0.%x.%04x\n", cdev->private->dev_id.ssid, @@ -1260,7 +1266,7 @@ static void recovery_work_func(struct work_struct *unused) } spin_unlock_irq(&recovery_lock); } else - CIO_MSG_EVENT(4, "recovery: end\n"); + CIO_MSG_EVENT(3, "recovery: end\n"); } static DECLARE_WORK(recovery_work, recovery_work_func); @@ -1274,11 +1280,11 @@ static void recovery_func(unsigned long data) schedule_work(&recovery_work); } -static void ccw_device_schedule_recovery(void) +void ccw_device_schedule_recovery(void) { unsigned long flags; - CIO_MSG_EVENT(4, "recovery: schedule\n"); + CIO_MSG_EVENT(3, "recovery: schedule\n"); spin_lock_irqsave(&recovery_lock, flags); if (!timer_pending(&recovery_timer) || (recovery_phase != 0)) { recovery_phase = 0; diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h index ec497af99dd8..69cb70f080a5 100644 --- a/drivers/s390/cio/device.h +++ b/drivers/s390/cio/device.h @@ -134,6 +134,7 @@ void ccw_device_set_disconnected(struct ccw_device *cdev); void ccw_device_set_notoper(struct ccw_device *cdev); void ccw_device_set_timeout(struct ccw_device *, int); +void ccw_device_schedule_recovery(void); /* Channel measurement facility related */ void retry_set_schib(struct ccw_device *cdev); diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 12016e32e519..f98ea674c3d8 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -476,6 +476,17 @@ static void create_fake_irb(struct irb *irb, int type) } } +static void ccw_device_handle_broken_paths(struct ccw_device *cdev) +{ + struct subchannel *sch = to_subchannel(cdev->dev.parent); + u8 broken_paths = (sch->schib.pmcw.pam & sch->opm) ^ sch->vpm; + + if (broken_paths && (cdev->private->path_broken_mask != broken_paths)) + ccw_device_schedule_recovery(); + + cdev->private->path_broken_mask = broken_paths; +} + void ccw_device_verify_done(struct ccw_device *cdev, int err) { struct subchannel *sch; @@ -508,6 +519,7 @@ callback: memset(&cdev->private->irb, 0, sizeof(struct irb)); } ccw_device_report_path_events(cdev); + ccw_device_handle_broken_paths(cdev); break; case -ETIME: case -EUSERS: diff --git a/drivers/s390/cio/io_sch.h b/drivers/s390/cio/io_sch.h index 220f49145b2f..9a1b56b2df3e 100644 --- a/drivers/s390/cio/io_sch.h +++ b/drivers/s390/cio/io_sch.h @@ -131,6 +131,8 @@ struct ccw_device_private { not operable */ u8 path_gone_mask; /* mask of paths, that became unavailable */ u8 path_new_mask; /* mask of paths, that became available */ + u8 path_broken_mask; /* mask of paths, which were found to be + unusable */ struct { unsigned int fast:1; /* post with "channel end" */ unsigned int repall:1; /* report every interrupt status */ -- cgit From 91c575b335766effa6103eba42a82aea560c365f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 18 Sep 2017 16:10:35 +0200 Subject: s390/mm: make pmdp_invalidate() do invalidation only Commit 227be799c39a ("s390/mm: uninline pmdp_xxx functions from pgtable.h") inadvertently changed the behavior of pmdp_invalidate(), so that it now clears the pmd instead of just marking it as invalid. Fix this by restoring the original behavior. A possible impact of the misbehaving pmdp_invalidate() would be the MADV_DONTNEED races (see commits ced10803 and 58ceeb6b), although we should not have any negative impact on the related dirty/young flags, since those flags are not set by the hardware on s390. Fixes: 227be799c39a ("s390/mm: uninline pmdp_xxx functions from pgtable.h") Cc: # v4.6+ Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index dce708e061ea..20e75a2ca93a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1507,7 +1507,9 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, static inline void pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + + pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT -- cgit From ba385c0594e723d41790ecfb12c610e6f90c7785 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 18 Sep 2017 16:51:51 +0200 Subject: s390/mm: fix write access check in gup_huge_pmd() The check for the _SEGMENT_ENTRY_PROTECT bit in gup_huge_pmd() is the wrong way around. It must not be set for write==1, and not be checked for write==0. Fix this similar to how it was fixed for ptes long time ago in commit 25591b070336 ("[S390] fix get_user_pages_fast"). One impact of this bug would be unnecessarily using the gup slow path for write==0 on r/w mappings. A potentially more severe impact would be that gup_huge_pmd() will succeed for write==1 on r/o mappings. Cc: Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 8ecc25e760fa..98ffe3ee9411 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -56,13 +56,12 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask, result; struct page *head, *page; + unsigned long mask; int refs; - result = write ? 0 : _SEGMENT_ENTRY_PROTECT; - mask = result | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != result) + mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; + if ((pmd_val(pmd) & mask) != 0) return 0; VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); -- cgit From 95e2a3b3ef177730019e3799917193595133b275 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Sat, 16 Sep 2017 22:12:24 +0200 Subject: Revert "KVM: Don't accept obviously wrong gsi values via KVM_IRQFD" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 36ae3c0a36b7456432fedce38ae2f7bd3e01a563. The commit broke compilation on !CONFIG_HAVE_KVM_IRQ_ROUTING. Also, there may be cases with CONFIG_HAVE_KVM_IRQ_ROUTING, where larger gsi values make sense. As the commit was meant as an early indicator to user space that something is wrong, reverting just restores the previous behavior where overly large values are ignored when encountered (without any direct feedback). Reported-by: Abdul Haleem Signed-off-by: Jan H. Schönherr Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- virt/kvm/eventfd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c608ab495282..f2ac53ab8243 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -565,8 +565,6 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; - if (args->gsi >= KVM_MAX_IRQ_ROUTES) - return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) return kvm_irqfd_deassign(kvm, args); -- cgit From a4aaeccc7a91e24cc17f72a7eb2f586f5c3d811d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 10 Sep 2017 13:43:37 -0700 Subject: iommu: Add missing dependencies parisc:allmodconfig, xtensa:allmodconfig, and possibly others generate the following Kconfig warning. warning: (IPMMU_VMSA && ARM_SMMU && ARM_SMMU_V3 && QCOM_IOMMU) selects IOMMU_IO_PGTABLE_LPAE which has unmet direct dependencies (IOMMU_SUPPORT && HAS_DMA && (ARM || ARM64 || COMPILE_TEST && !GENERIC_ATOMIC64)) IOMMU_IO_PGTABLE_LPAE depends on (COMPILE_TEST && !GENERIC_ATOMIC64), so any configuration option selecting it needs to have the same dependencies. Signed-off-by: Guenter Roeck Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 49bd2ab8c507..8530ef78925d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -278,7 +278,7 @@ config EXYNOS_IOMMU_DEBUG config IPMMU_VMSA bool "Renesas VMSA-compatible IPMMU" depends on ARM || IOMMU_DMA - depends on ARCH_RENESAS || COMPILE_TEST + depends on ARCH_RENESAS || (COMPILE_TEST && !GENERIC_ATOMIC64) select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU @@ -373,7 +373,7 @@ config MTK_IOMMU_V1 config QCOM_IOMMU # Note: iommu drivers cannot (yet?) be built as modules bool "Qualcomm IOMMU Support" - depends on ARCH_QCOM || COMPILE_TEST + depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64) select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU -- cgit From 3bd71e18c5803c23014df962bdd74af1b34697fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 22:10:21 +0200 Subject: iommu/vt-d: Fix harmless section mismatch warning Building with gcc-4.6 results in this warning due to dmar_table_print_dmar_entry being inlined as in newer compiler versions: WARNING: vmlinux.o(.text+0x5c8bee): Section mismatch in reference from the function dmar_walk_remapping_entries() to the function .init.text:dmar_table_print_dmar_entry() The function dmar_walk_remapping_entries() references the function __init dmar_table_print_dmar_entry(). This is often because dmar_walk_remapping_entries lacks a __init annotation or the annotation of dmar_table_print_dmar_entry is wrong. This removes the __init annotation to avoid the warning. On compilers that don't show the warning today, this should have no impact since the function gets inlined anyway. Signed-off-by: Arnd Bergmann Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index ca5ebaeafd6a..57c920c1372d 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -497,7 +497,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) #define dmar_parse_one_rhsa dmar_res_noop #endif -static void __init +static void dmar_table_print_dmar_entry(struct acpi_dmar_header *header) { struct acpi_dmar_hardware_unit *drhd; -- cgit From 5baf6bb0fd2388742a0846cc7bcacee6dec78235 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 14 Sep 2017 14:01:00 +0200 Subject: drm/exynos: Fix locking in the suspend/resume paths Commit 48a92916729b ("drm/exynos: use drm_for_each_connector_iter()") replaced unsafe drm_for_each_connector() with drm_for_each_connector_iter() and removed surrounding drm_modeset_lock calls. However, that lock was there not only to protect unsafe drm_for_each_connector(), but it was also required to be held by the dpms code which was called from the loop body. This patch restores those drm_modeset_lock calls to fix broken suspend and resume of Exynos DRM subsystem in v4.13 kernel. Fixes: 48a92916729b ("drm/exynos: use drm_for_each_connector_iter()") CC: stable@vger.kernel.org # v4.13 Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index b1f7299600f0..7f3cfc5dd320 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -174,6 +174,7 @@ static int exynos_drm_suspend(struct device *dev) if (pm_runtime_suspended(dev) || !drm_dev) return 0; + drm_modeset_lock_all(drm_dev); drm_connector_list_iter_begin(drm_dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { int old_dpms = connector->dpms; @@ -185,6 +186,7 @@ static int exynos_drm_suspend(struct device *dev) connector->dpms = old_dpms; } drm_connector_list_iter_end(&conn_iter); + drm_modeset_unlock_all(drm_dev); return 0; } @@ -198,6 +200,7 @@ static int exynos_drm_resume(struct device *dev) if (pm_runtime_suspended(dev) || !drm_dev) return 0; + drm_modeset_lock_all(drm_dev); drm_connector_list_iter_begin(drm_dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->funcs->dpms) { @@ -208,6 +211,7 @@ static int exynos_drm_resume(struct device *dev) } } drm_connector_list_iter_end(&conn_iter); + drm_modeset_unlock_all(drm_dev); return 0; } -- cgit From 6e8edf8a7d8d98e1734ff41e5c69439419319402 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 14 Sep 2017 14:01:01 +0200 Subject: drm/exynos: Fix suspend/resume support Commit 7d902c05b480 ("drm: Nuke drm_atomic_helper_connector_dpms") removed drm_atomic_helper_connector_dpms() helper saying that it was a dead code. It was however indirectly used by Exynos DRM driver for implementing suspend/resume support. To fix this regression (after that patch Exynos DRM suspend/resume functions became no-ops and hardware fails to suspend), this patch rewrites them with drm_atomic_helper_suspend/resume() helpers. Fixes: 7d902c05b480 ("drm: Nuke drm_atomic_helper_connector_dpms") Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 40 ++++++++++--------------------- drivers/gpu/drm/exynos/exynos_drm_drv.h | 1 + drivers/gpu/drm/exynos/exynos_drm_fbdev.c | 20 ++++++++++++++++ drivers/gpu/drm/exynos/exynos_drm_fbdev.h | 10 ++++++++ 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index 7f3cfc5dd320..e651a58c18cf 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -168,25 +168,19 @@ static struct drm_driver exynos_drm_driver = { static int exynos_drm_suspend(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct drm_connector *connector; - struct drm_connector_list_iter conn_iter; + struct exynos_drm_private *private = drm_dev->dev_private; if (pm_runtime_suspended(dev) || !drm_dev) return 0; - drm_modeset_lock_all(drm_dev); - drm_connector_list_iter_begin(drm_dev, &conn_iter); - drm_for_each_connector_iter(connector, &conn_iter) { - int old_dpms = connector->dpms; - - if (connector->funcs->dpms) - connector->funcs->dpms(connector, DRM_MODE_DPMS_OFF); - - /* Set the old mode back to the connector for resume */ - connector->dpms = old_dpms; + drm_kms_helper_poll_disable(drm_dev); + exynos_drm_fbdev_suspend(drm_dev); + private->suspend_state = drm_atomic_helper_suspend(drm_dev); + if (IS_ERR(private->suspend_state)) { + exynos_drm_fbdev_resume(drm_dev); + drm_kms_helper_poll_enable(drm_dev); + return PTR_ERR(private->suspend_state); } - drm_connector_list_iter_end(&conn_iter); - drm_modeset_unlock_all(drm_dev); return 0; } @@ -194,24 +188,14 @@ static int exynos_drm_suspend(struct device *dev) static int exynos_drm_resume(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct drm_connector *connector; - struct drm_connector_list_iter conn_iter; + struct exynos_drm_private *private = drm_dev->dev_private; if (pm_runtime_suspended(dev) || !drm_dev) return 0; - drm_modeset_lock_all(drm_dev); - drm_connector_list_iter_begin(drm_dev, &conn_iter); - drm_for_each_connector_iter(connector, &conn_iter) { - if (connector->funcs->dpms) { - int dpms = connector->dpms; - - connector->dpms = DRM_MODE_DPMS_OFF; - connector->funcs->dpms(connector, dpms); - } - } - drm_connector_list_iter_end(&conn_iter); - drm_modeset_unlock_all(drm_dev); + drm_atomic_helper_resume(drm_dev, private->suspend_state); + exynos_drm_fbdev_resume(drm_dev); + drm_kms_helper_poll_enable(drm_dev); return 0; } diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.h b/drivers/gpu/drm/exynos/exynos_drm_drv.h index cf131c2aa23e..f8bae4cb4823 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.h +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.h @@ -202,6 +202,7 @@ struct drm_exynos_file_private { */ struct exynos_drm_private { struct drm_fb_helper *fb_helper; + struct drm_atomic_state *suspend_state; struct device *dma_dev; void *mapping; diff --git a/drivers/gpu/drm/exynos/exynos_drm_fbdev.c b/drivers/gpu/drm/exynos/exynos_drm_fbdev.c index c3a068409b48..dfb66ecf417b 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fbdev.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fbdev.c @@ -18,6 +18,8 @@ #include #include +#include + #include "exynos_drm_drv.h" #include "exynos_drm_fb.h" #include "exynos_drm_fbdev.h" @@ -285,3 +287,21 @@ void exynos_drm_output_poll_changed(struct drm_device *dev) drm_fb_helper_hotplug_event(fb_helper); } + +void exynos_drm_fbdev_suspend(struct drm_device *dev) +{ + struct exynos_drm_private *private = dev->dev_private; + + console_lock(); + drm_fb_helper_set_suspend(private->fb_helper, 1); + console_unlock(); +} + +void exynos_drm_fbdev_resume(struct drm_device *dev) +{ + struct exynos_drm_private *private = dev->dev_private; + + console_lock(); + drm_fb_helper_set_suspend(private->fb_helper, 0); + console_unlock(); +} diff --git a/drivers/gpu/drm/exynos/exynos_drm_fbdev.h b/drivers/gpu/drm/exynos/exynos_drm_fbdev.h index 330eef87f718..645d1bb7f665 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fbdev.h +++ b/drivers/gpu/drm/exynos/exynos_drm_fbdev.h @@ -21,6 +21,8 @@ int exynos_drm_fbdev_init(struct drm_device *dev); void exynos_drm_fbdev_fini(struct drm_device *dev); void exynos_drm_fbdev_restore_mode(struct drm_device *dev); void exynos_drm_output_poll_changed(struct drm_device *dev); +void exynos_drm_fbdev_suspend(struct drm_device *drm); +void exynos_drm_fbdev_resume(struct drm_device *drm); #else @@ -39,6 +41,14 @@ static inline void exynos_drm_fbdev_restore_mode(struct drm_device *dev) #define exynos_drm_output_poll_changed (NULL) +static inline void exynos_drm_fbdev_suspend(struct drm_device *drm) +{ +} + +static inline void exynos_drm_fbdev_resume(struct drm_device *drm) +{ +} + #endif #endif -- cgit From 9ac30ef6d8ec3533d0feec53e268c4fa32ea700c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:19:55 +0200 Subject: drm: exynos: include linux/irq.h I ran into a build error on x86: drivers/gpu/drm/exynos/exynos5433_drm_decon.c: In function 'decon_conf_irq': drivers/gpu/drm/exynos/exynos5433_drm_decon.c:706:2: error: implicit declaration of function 'irq_set_status_flags'; did you mean 'dquot_state_flag'? [-Werror=implicit-function-declaration] irq_set_status_flags(irq, IRQ_NOAUTOEN); Adding the missing include fixes the error. Fixes: b37d53a0382c ("drm/exynos/decon5433: move TE handling to DECON") Signed-off-by: Arnd Bergmann Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos5433_drm_decon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c index 730b8d9db187..6be5b53c3b27 100644 --- a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include -- cgit From d6500149bc4fddc5a91cd1a0c31b38fa36bff3ee Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Mon, 18 Sep 2017 18:45:01 +0800 Subject: KVM: x86: Fix the NULL pointer parameter in check_cr_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Routine check_cr_write() will trigger emulator_get_cpuid()-> kvm_cpuid() to get maxphyaddr, and NULL is passed as values for ebx/ecx/edx. This is problematic because kvm_cpuid() will dereference these pointers. Fixes: d1cd3ce90044 ("KVM: MMU: check guest CR3 reserved bits based on its physical address width.") Reported-by: Jim Mattson Signed-off-by: Yu Zhang Reviewed-by: David Hildenbrand Reviewed-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/kvm/emulate.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 16bf6655aa85..15f527b44aa7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4102,10 +4102,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) { u64 maxphyaddr; - u32 eax = 0x80000008; + u32 eax, ebx, ecx, edx; - if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, - NULL, false)) + eax = 0x80000008; + ecx = 0; + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, + &edx, false)) maxphyaddr = eax & 0xff; else maxphyaddr = 36; -- cgit From dc91f2eb1a4021eb6705c15e474942f84ab9b211 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:49 +0800 Subject: KVM: VMX: do not change SN bit in vmx_update_pi_irte() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In kvm_vcpu_trigger_posted_interrupt() and pi_pre_block(), KVM assumes that PI notification events should not be suppressed when the target vCPU is not blocked. vmx_update_pi_irte() sets the SN field before changing an interrupt from posting to remapping, but it does not check the vCPU mode. Therefore, the change of SN field may break above the assumption. Besides, I don't see reasons to suppress notification events here, so remove the changes of SN field to avoid race condition. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06c0c6d0541e..7328c8c0ea3b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11911,12 +11911,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", -- cgit From 5753743fa5108b8f98bd61e40dc63f641b26c768 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:50 +0800 Subject: KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)) in kvm_vcpu_trigger_posted_interrupt() intends to detect the violation of invariant that VT-d PI notification event is not suppressed when vcpu is in the guest mode. Because the two checks for the target vcpu mode and the target suppress field cannot be performed atomically, the target vcpu mode may change in between. If that does happen, WARN_ON_ONCE() here may raise false alarms. As the previous patch fixed the real invariant breaker, remove this WARN_ON_ONCE() to avoid false alarms, and document the allowed cases instead. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7328c8c0ea3b..0726ca7a1b02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5077,21 +5077,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. + * + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. + * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; -- cgit From 0555ac4333d7da7cff83d5b06bd3679a3e1ef584 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Mon, 18 Sep 2017 16:35:32 -0600 Subject: xen, arm64: drop dummy lookup_address() This is unused, and conflicts with the definition that we'll add for XPFO. Signed-off-by: Tycho Andersen Reviewed-by: Julien Grall CC: Boris Ostrovsky CC: Juergen Gross CC: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- include/xen/arm/page.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/xen/arm/page.h b/include/xen/arm/page.h index 415dbc6e43fd..6adc2a955340 100644 --- a/include/xen/arm/page.h +++ b/include/xen/arm/page.h @@ -84,16 +84,6 @@ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) BUG(); } -/* TODO: this shouldn't be here but it is because the frontend drivers - * are using it (its rolled in headers) even though we won't hit the code path. - * So for right now just punt with this. - */ -static inline pte_t *lookup_address(unsigned long address, unsigned int *level) -{ - BUG(); - return NULL; -} - extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -- cgit From 986a5f70173626eea9863fee6d6e029f0f2bc361 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:34:34 +0200 Subject: iommu/qcom: Depend on HAS_DMA to fix compile error If NO_DMA=y: warning: (IPMMU_VMSA && ARM_SMMU && ARM_SMMU_V3 && QCOM_IOMMU) selects IOMMU_IO_PGTABLE_LPAE which has unmet direct dependencies (IOMMU_SUPPORT && HAS_DMA && (ARM || ARM64 || COMPILE_TEST && !GENERIC_ATOMIC64)) and drivers/iommu/io-pgtable-arm.o: In function `__arm_lpae_sync_pte': io-pgtable-arm.c:(.text+0x206): undefined reference to `bad_dma_ops' drivers/iommu/io-pgtable-arm.o: In function `__arm_lpae_free_pages': io-pgtable-arm.c:(.text+0x6a6): undefined reference to `bad_dma_ops' drivers/iommu/io-pgtable-arm.o: In function `__arm_lpae_alloc_pages': io-pgtable-arm.c:(.text+0x812): undefined reference to `bad_dma_ops' io-pgtable-arm.c:(.text+0x81c): undefined reference to `bad_dma_ops' io-pgtable-arm.c:(.text+0x862): undefined reference to `bad_dma_ops' drivers/iommu/io-pgtable-arm.o: In function `arm_lpae_run_tests': io-pgtable-arm.c:(.init.text+0x86): undefined reference to `alloc_io_pgtable_ops' io-pgtable-arm.c:(.init.text+0x47c): undefined reference to `free_io_pgtable_ops' drivers/iommu/qcom_iommu.o: In function `qcom_iommu_init_domain': qcom_iommu.c:(.text+0x1ce): undefined reference to `alloc_io_pgtable_ops' drivers/iommu/qcom_iommu.o: In function `qcom_iommu_domain_free': qcom_iommu.c:(.text+0x754): undefined reference to `free_io_pgtable_ops' QCOM_IOMMU selects IOMMU_IO_PGTABLE_LPAE, which bypasses its dependency on HAS_DMA. Make QCOM_IOMMU depend on HAS_DMA to fix this. Fixes: 0ae349a0f33fb040 ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 8530ef78925d..f3a21343e636 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -374,6 +374,7 @@ config QCOM_IOMMU # Note: iommu drivers cannot (yet?) be built as modules bool "Qualcomm IOMMU Support" depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64) + depends on HAS_DMA select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU -- cgit From 8dd33bcb7050dd6f8c1432732f930932c9d3a33e Mon Sep 17 00:00:00 2001 From: Bo Yan Date: Mon, 18 Sep 2017 10:03:35 -0700 Subject: tracing: Erase irqsoff trace with empty write One convenient way to erase trace is "echo > trace". However, this is currently broken if the current tracer is irqsoff tracer. This is because irqsoff tracer use max_buffer as the default trace buffer. Set the max_buffer as the one to be cleared when it's the trace buffer currently in use. Link: http://lkml.kernel.org/r/1505754215-29411-1-git-send-email-byan@nvidia.com Cc: Cc: stable@vger.kernel.org Fixes: 4acd4d00f ("tracing: give easy way to clear trace buffer") Signed-off-by: Bo Yan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5360b7aec57a..a7fb136da891 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { -- cgit From c7b3ae0bd2ca658c7a71c49901d08c590294fac9 Mon Sep 17 00:00:00 2001 From: "Ziqian SUN (Zamir)" Date: Mon, 11 Sep 2017 14:26:35 +0800 Subject: tracing: Ignore mmiotrace from kernel commandline The mmiotrace tracer cannot be enabled with ftrace=mmiotrace in kernel commandline. With this patch, noboot is added to the tracer struct, and when system boot with a tracer that has noboot=true, it will print out a warning message and continue booting. Link: http://lkml.kernel.org/r/1505111195-31942-1-git-send-email-zsun@redhat.com Signed-off-by: Ziqian SUN (Zamir) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 7 +++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_mmiotrace.c | 1 + 3 files changed, 10 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a7fb136da891..d3ca35f38803 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5364,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && t->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + t->name); + goto out; + } + /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) { ret = -EINVAL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb5d54d0d1b3..652c682707cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,8 @@ struct tracer { #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif + /* True if tracer cannot be enabled in kernel param */ + bool noboot; }; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..dca78fc48439 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly = .close = mmio_close, .read = mmio_read, .print_line = mmio_print_line, + .noboot = true, }; __init static int init_mmio_trace(void) -- cgit From aa767cfb75341a6b93742f4d7d1589ac2532c29e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Sep 2017 22:07:50 +0200 Subject: of: provide inline helper for of_find_device_by_node The ipmmu-vmsa driver fails in compile-testing on non-OF platforms: drivers/iommu/ipmmu-vmsa.o: In function `ipmmu_of_xlate': ipmmu-vmsa.c:(.text+0x740): undefined reference to `of_find_device_by_node' It would be reasonable to assume that this interface works but returns failure on non-OF builds, like it does on machines that have been booted in another way, so this adds another inline function helper. Fixes: 7b2d59611fef ("iommu/ipmmu-vmsa: Replace local utlb code with fwspec ids") Signed-off-by: Arnd Bergmann Signed-off-by: Rob Herring --- include/linux/of_platform.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index e0d1946270f3..fb908e598348 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -57,7 +57,14 @@ extern const struct of_device_id of_default_bus_match_table[]; extern struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); +#ifdef CONFIG_OF extern struct platform_device *of_find_device_by_node(struct device_node *np); +#else +static inline struct platform_device *of_find_device_by_node(struct device_node *np) +{ + return NULL; +} +#endif /* Platform devices and busses creation */ extern struct platform_device *of_platform_device_create(struct device_node *np, -- cgit From a6899e900509bb2442239d6198dc8bc64f8437ec Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 14 Sep 2017 23:39:38 +0200 Subject: dt-bindings: fix vendor prefix for Abracon Commit 446810f2dd41 ("of: add vendor prefix for Abracon Corporation") claimed that "abcn" was used as the vendor prefix while in fact "abracon" was used in the subsequent commits. It is also the only prefix used in the tree. Signed-off-by: Alexandre Belloni [robh: fix alphabetical order] Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 1ea1fd4232ab..1afd298eddd7 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -3,8 +3,8 @@ Device tree binding vendor prefix registry. Keep list in alphabetical order. This isn't an exhaustive list, but you should add new prefixes to it before using them to avoid name-space collisions. -abcn Abracon Corporation abilis Abilis Systems +abracon Abracon Corporation actions Actions Semiconductor Co., Ltd. active-semi Active-Semi International Inc ad Avionic Design GmbH -- cgit From 3993491bf27117782bee05debc6a6afa51d61760 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Tue, 19 Sep 2017 12:54:34 +0300 Subject: MAINTAINERS: Remove Yuval Mintz from maintainers list Remove Yuval from maintaining the bnx2x & qed* modules as he is no longer working for the company. Thanks Yuval for your huge contributions and tireless efforts over the many years and various companies. Ariel Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..955f034fd523 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2853,7 +2853,6 @@ S: Supported F: drivers/scsi/bnx2i/ BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER -M: Yuval Mintz M: Ariel Elior M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org @@ -11047,7 +11046,6 @@ S: Supported F: drivers/scsi/qedi/ QLOGIC QL4xxx ETHERNET DRIVER -M: Yuval Mintz M: Ariel Elior M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org -- cgit From 29a0cfbf91ba997591535a4f7246835ce8328141 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 18 Sep 2017 12:21:37 +0200 Subject: libceph: don't allow bidirectional swap of pg-upmap-items This reverts most of commit f53b7665c8ce ("libceph: upmap semantic changes"). We need to prevent duplicates in the final result. For example, we can currently take [1,2,3] and apply [(1,2)] and get [2,2,3] or [1,2,3] and apply [(3,2)] and get [1,2,2] The rest of the system is not prepared to handle duplicates in the result set like this. The reverted piece was intended to allow [1,2,3] and [(1,2),(2,1)] to get [2,1,3] to reorder primaries. First, this bidirectional swap is hard to implement in a way that also prevents dups. For example, [1,2,3] and [(1,4),(2,3),(3,4)] would give [4,3,4] but would we just drop the last step we'd have [4,3,3] which is also invalid, etc. Simpler to just not handle bidirectional swaps. In practice, they are not needed: if you just want to choose a different primary then use primary_affinity, or pg_upmap (not pg_upmap_items). Cc: stable@vger.kernel.org # 4.13 Link: http://tracker.ceph.com/issues/21410 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f358d0bfa76b..79d14d70b7ea 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2445,19 +2445,34 @@ static void apply_upmap(struct ceph_osdmap *osdmap, pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - for (i = 0; i < raw->size; i++) { - for (j = 0; j < pg->pg_upmap_items.len; j++) { - int from = pg->pg_upmap_items.from_to[j][0]; - int to = pg->pg_upmap_items.from_to[j][1]; - - if (from == raw->osds[i]) { - if (!(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) - raw->osds[i] = to; + /* + * Note: this approach does not allow a bidirectional swap, + * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. + */ + for (i = 0; i < pg->pg_upmap_items.len; i++) { + int from = pg->pg_upmap_items.from_to[i][0]; + int to = pg->pg_upmap_items.from_to[i][1]; + int pos = -1; + bool exists = false; + + /* make sure replacement doesn't already appear */ + for (j = 0; j < raw->size; j++) { + int osd = raw->osds[j]; + + if (osd == to) { + exists = true; break; } + /* ignore mapping if target is marked out */ + if (osd == from && pos < 0 && + !(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) { + pos = j; + } } + if (!exists && pos >= 0) + raw->osds[pos] = to; } } } -- cgit From 3fad4cdac235c5b13227d0c09854c689ae62c70b Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Sat, 16 Sep 2017 01:59:41 +0800 Subject: irqchip/gic-v3: Iterate over possible CPUs by for_each_possible_cpu() get_cpu_number() doesn't use existing helper to iterate over possible CPUs, It will cause an error in case of discontinuous @cpu_possible_mask such as 0b11110001, which can result from a core having failed to come up on a SMP machine. Fixed by using existing helper for_each_possible_cpu(). Signed-off-by: zijun_hu Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 519149ec9053..b5df99c6f680 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1042,7 +1042,7 @@ static int get_cpu_number(struct device_node *dn) { const __be32 *cell; u64 hwid; - int i; + int cpu; cell = of_get_property(dn, "reg", NULL); if (!cell) @@ -1056,9 +1056,9 @@ static int get_cpu_number(struct device_node *dn) if (hwid & ~MPIDR_HWID_BITMASK) return -1; - for (i = 0; i < num_possible_cpus(); i++) - if (cpu_logical_map(i) == hwid) - return i; + for_each_possible_cpu(cpu) + if (cpu_logical_map(cpu) == hwid) + return cpu; return -1; } -- cgit From 6c09ffd027255d97c536aa4ac51a90973adfedc6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 22:08:23 +0200 Subject: irqchip/gic-v4: Fix building with ancient gcc gcc-4.5 and earlier don't like named initializers for anonymous union members: drivers/irqchip/irq-gic-v4.c: In function 'its_map_vlpi': drivers/irqchip/irq-gic-v4.c:176:3: error: unknown field 'map' specified in initializer drivers/irqchip/irq-gic-v4.c:176:3: error: missing braces around initializer drivers/irqchip/irq-gic-v4.c:176:3: error: (near initialization for 'info.') drivers/irqchip/irq-gic-v4.c: In function 'its_get_vlpi': drivers/irqchip/irq-gic-v4.c:192:3: error: unknown field 'map' specified in initializer drivers/irqchip/irq-gic-v4.c:192:3: error: missing braces around initializer drivers/irqchip/irq-gic-v4.c:192:3: error: (near initialization for 'info.') drivers/irqchip/irq-gic-v4.c: In function 'its_prop_update_vlpi': drivers/irqchip/irq-gic-v4.c:208:3: error: unknown field 'config' specified in initializer drivers/irqchip/irq-gic-v4.c:208:3: error: missing braces around initializer drivers/irqchip/irq-gic-v4.c:208:3: error: (near initialization for 'info.') drivers/irqchip/irq-gic-v4.c:208:3: error: initialization makes pointer from integer without a cast This is fairly easy to work around, by using extra curly braces. Signed-off-by: Arnd Bergmann Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v4.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 2370e6d9e603..cd0bcc3b7e33 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -173,7 +173,9 @@ int its_map_vlpi(int irq, struct its_vlpi_map *map) { struct its_cmd_info info = { .cmd_type = MAP_VLPI, - .map = map, + { + .map = map, + }, }; /* @@ -189,7 +191,9 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map) { struct its_cmd_info info = { .cmd_type = GET_VLPI, - .map = map, + { + .map = map, + }, }; return irq_set_vcpu_affinity(irq, &info); @@ -205,7 +209,9 @@ int its_prop_update_vlpi(int irq, u8 config, bool inv) { struct its_cmd_info info = { .cmd_type = inv ? PROP_UPDATE_AND_INV_VLPI : PROP_UPDATE_VLPI, - .config = config, + { + .config = config, + }, }; return irq_set_vcpu_affinity(irq, &info); -- cgit From 90019f8fcd71bf71653690329e32f41489e96122 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 5 Sep 2017 11:28:46 -0700 Subject: irqchip.mips-gic: Fix shared interrupt mask writes The write_gic_smask() & write_gic_rmask() functions take a shared interrupt number as a parameter, but we're incorrectly providing them a bitmask with the shared interrupt's bit set. This effectively means that we mask or unmask the shared interrupt 1< Fixes: 68898c8765f4 ("irqchip: mips-gic: Drop gic_(re)set_mask() functions") Cc: Jason Cooper Cc: Marc Zyngier Cc: Ralf Baechle Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 6e52a88bbd9e..40159ac12ac8 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -169,7 +169,7 @@ static void gic_mask_irq(struct irq_data *d) { unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); - write_gic_rmask(BIT(intr)); + write_gic_rmask(intr); gic_clear_pcpu_masks(intr); } @@ -179,7 +179,7 @@ static void gic_unmask_irq(struct irq_data *d) unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); unsigned int cpu; - write_gic_smask(BIT(intr)); + write_gic_smask(intr); gic_clear_pcpu_masks(intr); cpu = cpumask_first_and(affinity, cpu_online_mask); @@ -767,7 +767,7 @@ static int __init gic_of_init(struct device_node *node, for (i = 0; i < gic_shared_intrs; i++) { change_gic_pol(i, GIC_POL_ACTIVE_HIGH); change_gic_trig(i, GIC_TRIG_LEVEL); - write_gic_rmask(BIT(i)); + write_gic_rmask(i); } for (i = 0; i < gic_vpes; i++) { -- cgit From 717e6f2893eb35ce6728c3cacdc297b78d371b31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Sep 2017 12:10:08 +0800 Subject: ceph: avoid panic in create_session_open_msg() if utsname() returns NULL utsname() can return NULL while process is exiting. Kernel releases file locks during process exits. We send request to mds when releasing file lock. So it's possible that we open mds session while process is exiting. utsname() is called in create_session_open_msg(). Link: http://tracker.ceph.com/issues/21275 Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton [idryomov@gmail.com: drop utsname.h include from mds_client.c] Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 7 ++++--- fs/ceph/mds_client.h | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9dd6b836ac9e..84edfc60d87a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "super.h" @@ -884,8 +883,8 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 void *p; const char* metadata[][2] = { - {"hostname", utsname()->nodename}, - {"kernel_version", utsname()->release}, + {"hostname", mdsc->nodename}, + {"kernel_version", init_utsname()->release}, {"entity_id", opt->name ? : ""}, {"root", fsopt->server_path ? : "/"}, {NULL, NULL} @@ -3539,6 +3538,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; + strncpy(mdsc->nodename, utsname()->nodename, + sizeof(mdsc->nodename) - 1); return 0; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index db57ae98ed34..636d6b2ec49c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -368,6 +369,8 @@ struct ceph_mds_client { struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; + + char nodename[__NEW_UTS_LEN + 1]; }; extern const char *ceph_mds_op_name(int op); -- cgit From 19a8d6b7604df85402deecae01d7861cb1d40c89 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 19 Sep 2017 15:50:42 +0100 Subject: MIPS: PCI: Move map_irq() hooks out of initdata 04c81c7293df ("MIPS: PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") moved the PCI IRQ fixup to the new host bridge map/swizzle_irq() hooks mechanism. Those hooks can also be called after boot, when all the __init/__initdata/__initconst sections have been freed. Therefore, functions called by them (and the data they refer to) must not be marked as __init/__initdata/__initconst lest compilation trigger section mismatch warnings. Fix all the board files map_irq() hooks by simply removing the respective __init/__initdata/__initconst section markers and by adding another persistent hook IRQ map for the txx9 board files. Fixes: 04c81c7293df ("MIPS: PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Arnd Bergmann Cc: Ralf Baechle Cc: Steve French --- arch/mips/ath79/pci.c | 12 ++++++------ arch/mips/pci/fixup-capcella.c | 4 ++-- arch/mips/pci/fixup-cobalt.c | 8 ++++---- arch/mips/pci/fixup-emma2rh.c | 4 ++-- arch/mips/pci/fixup-fuloong2e.c | 2 +- arch/mips/pci/fixup-ip32.c | 4 ++-- arch/mips/pci/fixup-jmr3927.c | 2 +- arch/mips/pci/fixup-lantiq.c | 2 +- arch/mips/pci/fixup-lemote2f.c | 4 ++-- arch/mips/pci/fixup-loongson3.c | 2 +- arch/mips/pci/fixup-malta.c | 4 ++-- arch/mips/pci/fixup-mpc30x.c | 6 +++--- arch/mips/pci/fixup-pmcmsp.c | 8 ++++---- arch/mips/pci/fixup-rbtx4927.c | 2 +- arch/mips/pci/fixup-rbtx4938.c | 2 +- arch/mips/pci/fixup-sni.c | 12 ++++++------ arch/mips/pci/fixup-tb0219.c | 2 +- arch/mips/pci/fixup-tb0226.c | 2 +- arch/mips/pci/fixup-tb0287.c | 2 +- arch/mips/pci/pci-alchemy.c | 2 +- arch/mips/pci/pci-bcm47xx.c | 2 +- arch/mips/pci/pci-lasat.c | 2 +- arch/mips/pci/pci-mt7620.c | 2 +- arch/mips/pci/pci-octeon.c | 5 ++--- arch/mips/pci/pci-rt2880.c | 2 +- arch/mips/pci/pci-rt3883.c | 2 +- arch/mips/pci/pci-tx4938.c | 2 +- arch/mips/pci/pci-tx4939.c | 4 ++-- arch/mips/pci/pci-xlp.c | 2 +- arch/mips/pci/pci-xlr.c | 2 +- arch/mips/pci/pcie-octeon.c | 3 +-- arch/mips/txx9/generic/pci.c | 8 ++++++-- 32 files changed, 62 insertions(+), 60 deletions(-) diff --git a/arch/mips/ath79/pci.c b/arch/mips/ath79/pci.c index 730c0b03060d..b816cb4a25ff 100644 --- a/arch/mips/ath79/pci.c +++ b/arch/mips/ath79/pci.c @@ -22,10 +22,10 @@ #include "pci.h" static int (*ath79_pci_plat_dev_init)(struct pci_dev *dev); -static const struct ath79_pci_irq *ath79_pci_irq_map __initdata; -static unsigned ath79_pci_nr_irqs __initdata; +static const struct ath79_pci_irq *ath79_pci_irq_map; +static unsigned ath79_pci_nr_irqs; -static const struct ath79_pci_irq ar71xx_pci_irq_map[] __initconst = { +static const struct ath79_pci_irq ar71xx_pci_irq_map[] = { { .slot = 17, .pin = 1, @@ -41,7 +41,7 @@ static const struct ath79_pci_irq ar71xx_pci_irq_map[] __initconst = { } }; -static const struct ath79_pci_irq ar724x_pci_irq_map[] __initconst = { +static const struct ath79_pci_irq ar724x_pci_irq_map[] = { { .slot = 0, .pin = 1, @@ -49,7 +49,7 @@ static const struct ath79_pci_irq ar724x_pci_irq_map[] __initconst = { } }; -static const struct ath79_pci_irq qca955x_pci_irq_map[] __initconst = { +static const struct ath79_pci_irq qca955x_pci_irq_map[] = { { .bus = 0, .slot = 0, @@ -64,7 +64,7 @@ static const struct ath79_pci_irq qca955x_pci_irq_map[] __initconst = { }, }; -int __init pcibios_map_irq(const struct pci_dev *dev, uint8_t slot, uint8_t pin) +int pcibios_map_irq(const struct pci_dev *dev, uint8_t slot, uint8_t pin) { int irq = -1; int i; diff --git a/arch/mips/pci/fixup-capcella.c b/arch/mips/pci/fixup-capcella.c index 1c02f5737367..b4c263f16b15 100644 --- a/arch/mips/pci/fixup-capcella.c +++ b/arch/mips/pci/fixup-capcella.c @@ -32,13 +32,13 @@ #define INTC PC104PLUS_INTC_IRQ #define INTD PC104PLUS_INTD_IRQ -static char irq_tab_capcella[][5] __initdata = { +static char irq_tab_capcella[][5] = { [11] = { -1, INT1, INT1, INT1, INT1 }, [12] = { -1, INT2, INT2, INT2, INT2 }, [14] = { -1, INTA, INTB, INTC, INTD } }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_capcella[slot][pin]; } diff --git a/arch/mips/pci/fixup-cobalt.c b/arch/mips/pci/fixup-cobalt.c index b3ab59318d91..44be65c3e6bb 100644 --- a/arch/mips/pci/fixup-cobalt.c +++ b/arch/mips/pci/fixup-cobalt.c @@ -147,7 +147,7 @@ static void qube_raq_via_board_id_fixup(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_0, qube_raq_via_board_id_fixup); -static char irq_tab_qube1[] __initdata = { +static char irq_tab_qube1[] = { [COBALT_PCICONF_CPU] = 0, [COBALT_PCICONF_ETH0] = QUBE1_ETH0_IRQ, [COBALT_PCICONF_RAQSCSI] = SCSI_IRQ, @@ -156,7 +156,7 @@ static char irq_tab_qube1[] __initdata = { [COBALT_PCICONF_ETH1] = 0 }; -static char irq_tab_cobalt[] __initdata = { +static char irq_tab_cobalt[] = { [COBALT_PCICONF_CPU] = 0, [COBALT_PCICONF_ETH0] = ETH0_IRQ, [COBALT_PCICONF_RAQSCSI] = SCSI_IRQ, @@ -165,7 +165,7 @@ static char irq_tab_cobalt[] __initdata = { [COBALT_PCICONF_ETH1] = ETH1_IRQ }; -static char irq_tab_raq2[] __initdata = { +static char irq_tab_raq2[] = { [COBALT_PCICONF_CPU] = 0, [COBALT_PCICONF_ETH0] = ETH0_IRQ, [COBALT_PCICONF_RAQSCSI] = RAQ2_SCSI_IRQ, @@ -174,7 +174,7 @@ static char irq_tab_raq2[] __initdata = { [COBALT_PCICONF_ETH1] = ETH1_IRQ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (cobalt_board_id <= COBALT_BRD_ID_QUBE1) return irq_tab_qube1[slot]; diff --git a/arch/mips/pci/fixup-emma2rh.c b/arch/mips/pci/fixup-emma2rh.c index 19caf775c206..c31cb6af1cd0 100644 --- a/arch/mips/pci/fixup-emma2rh.c +++ b/arch/mips/pci/fixup-emma2rh.c @@ -43,7 +43,7 @@ */ #define MAX_SLOT_NUM 10 -static unsigned char irq_map[][5] __initdata = { +static unsigned char irq_map[][5] = { [3] = {0, MARKEINS_PCI_IRQ_INTB, MARKEINS_PCI_IRQ_INTC, MARKEINS_PCI_IRQ_INTD, 0,}, [4] = {0, MARKEINS_PCI_IRQ_INTA, 0, 0, 0,}, @@ -85,7 +85,7 @@ static void emma2rh_pci_host_fixup(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_EMMA2RH, emma2rh_pci_host_fixup); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_map[slot][pin]; } diff --git a/arch/mips/pci/fixup-fuloong2e.c b/arch/mips/pci/fixup-fuloong2e.c index 50da773faede..b47c2771dc99 100644 --- a/arch/mips/pci/fixup-fuloong2e.c +++ b/arch/mips/pci/fixup-fuloong2e.c @@ -19,7 +19,7 @@ /* South bridge slot number is set by the pci probe process */ static u8 sb_slot = 5; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = 0; diff --git a/arch/mips/pci/fixup-ip32.c b/arch/mips/pci/fixup-ip32.c index 133685e215ee..c6ec18a07e63 100644 --- a/arch/mips/pci/fixup-ip32.c +++ b/arch/mips/pci/fixup-ip32.c @@ -21,7 +21,7 @@ #define INTB MACEPCI_SHARED0_IRQ #define INTC MACEPCI_SHARED1_IRQ #define INTD MACEPCI_SHARED2_IRQ -static char irq_tab_mace[][5] __initdata = { +static char irq_tab_mace[][5] = { /* Dummy INT#A INT#B INT#C INT#D */ {0, 0, 0, 0, 0}, /* This is placeholder row - never used */ {0, SCSI0, SCSI0, SCSI0, SCSI0}, @@ -39,7 +39,7 @@ static char irq_tab_mace[][5] __initdata = { * irqs. I suppose a device without a pin A will thank us for doing it * right if there exists such a broken piece of crap. */ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_mace[slot][pin]; } diff --git a/arch/mips/pci/fixup-jmr3927.c b/arch/mips/pci/fixup-jmr3927.c index 0f1069527cba..d3102eeea898 100644 --- a/arch/mips/pci/fixup-jmr3927.c +++ b/arch/mips/pci/fixup-jmr3927.c @@ -31,7 +31,7 @@ #include #include -int __init jmr3927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int jmr3927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char irq = pin; diff --git a/arch/mips/pci/fixup-lantiq.c b/arch/mips/pci/fixup-lantiq.c index 2b5427d3f35c..81530a13b349 100644 --- a/arch/mips/pci/fixup-lantiq.c +++ b/arch/mips/pci/fixup-lantiq.c @@ -23,7 +23,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/fixup-lemote2f.c b/arch/mips/pci/fixup-lemote2f.c index 95ab9a1bd010..20cdfdc08938 100644 --- a/arch/mips/pci/fixup-lemote2f.c +++ b/arch/mips/pci/fixup-lemote2f.c @@ -30,7 +30,7 @@ #define PCID 7 /* all the pci device has the PCIA pin, check the datasheet. */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0}, /* 11: Unused */ {0, 0, 0, 0, 0}, /* 12: Unused */ @@ -51,7 +51,7 @@ static char irq_tab[][5] __initdata = { {0, 0, 0, 0, 0}, /* 27: Unused */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; diff --git a/arch/mips/pci/fixup-loongson3.c b/arch/mips/pci/fixup-loongson3.c index 2b6d5e196f99..8a741c2c6685 100644 --- a/arch/mips/pci/fixup-loongson3.c +++ b/arch/mips/pci/fixup-loongson3.c @@ -32,7 +32,7 @@ static void print_fixup_info(const struct pci_dev *pdev) pdev->vendor, pdev->device, pdev->irq); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { print_fixup_info(dev); return dev->irq; diff --git a/arch/mips/pci/fixup-malta.c b/arch/mips/pci/fixup-malta.c index 40e920c653cc..3ec85331795e 100644 --- a/arch/mips/pci/fixup-malta.c +++ b/arch/mips/pci/fixup-malta.c @@ -12,7 +12,7 @@ static char pci_irq[5] = { }; -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* 0: GT64120 PCI bridge */ {0, 0, 0, 0, 0 }, /* 1: Unused */ @@ -38,7 +38,7 @@ static char irq_tab[][5] __initdata = { {0, PCID, PCIA, PCIB, PCIC } /* 21: PCI Slot 4 */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; virq = irq_tab[slot][pin]; diff --git a/arch/mips/pci/fixup-mpc30x.c b/arch/mips/pci/fixup-mpc30x.c index 8e4f8288eca2..66eaf456bc89 100644 --- a/arch/mips/pci/fixup-mpc30x.c +++ b/arch/mips/pci/fixup-mpc30x.c @@ -22,19 +22,19 @@ #include -static const int internal_func_irqs[] __initconst = { +static const int internal_func_irqs[] = { VRC4173_CASCADE_IRQ, VRC4173_AC97_IRQ, VRC4173_USB_IRQ, }; -static const int irq_tab_mpc30x[] __initconst = { +static const int irq_tab_mpc30x[] = { [12] = VRC4173_PCMCIA1_IRQ, [13] = VRC4173_PCMCIA2_IRQ, [29] = MQ200_IRQ, }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (slot == 30) return internal_func_irqs[PCI_FUNC(dev->devfn)]; diff --git a/arch/mips/pci/fixup-pmcmsp.c b/arch/mips/pci/fixup-pmcmsp.c index fab405c21c2f..4ad2ef02087b 100644 --- a/arch/mips/pci/fixup-pmcmsp.c +++ b/arch/mips/pci/fixup-pmcmsp.c @@ -47,7 +47,7 @@ #if defined(CONFIG_PMC_MSP7120_GW) /* Garibaldi Board IRQ wiring to PCI slots */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* (AD[0]): Unused */ {0, 0, 0, 0, 0 }, /* (AD[1]): Unused */ @@ -86,7 +86,7 @@ static char irq_tab[][5] __initdata = { #elif defined(CONFIG_PMC_MSP7120_EVAL) /* MSP7120 Eval Board IRQ wiring to PCI slots */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* (AD[0]): Unused */ {0, 0, 0, 0, 0 }, /* (AD[1]): Unused */ @@ -125,7 +125,7 @@ static char irq_tab[][5] __initdata = { #else /* Unknown board -- don't assign any IRQs */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* (AD[0]): Unused */ {0, 0, 0, 0, 0 }, /* (AD[1]): Unused */ @@ -202,7 +202,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) * RETURNS: IRQ number * ****************************************************************************/ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { #if !defined(CONFIG_PMC_MSP7120_GW) && !defined(CONFIG_PMC_MSP7120_EVAL) printk(KERN_WARNING "PCI: unknown board, no PCI IRQs assigned.\n"); diff --git a/arch/mips/pci/fixup-rbtx4927.c b/arch/mips/pci/fixup-rbtx4927.c index 321db265829c..d6aaed1d6be9 100644 --- a/arch/mips/pci/fixup-rbtx4927.c +++ b/arch/mips/pci/fixup-rbtx4927.c @@ -36,7 +36,7 @@ #include #include -int __init rbtx4927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int rbtx4927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char irq = pin; diff --git a/arch/mips/pci/fixup-rbtx4938.c b/arch/mips/pci/fixup-rbtx4938.c index a80579af609b..ff22a22db73e 100644 --- a/arch/mips/pci/fixup-rbtx4938.c +++ b/arch/mips/pci/fixup-rbtx4938.c @@ -13,7 +13,7 @@ #include #include -int __init rbtx4938_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int rbtx4938_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = tx4938_pcic1_map_irq(dev, slot); diff --git a/arch/mips/pci/fixup-sni.c b/arch/mips/pci/fixup-sni.c index f67ebeeb4200..adb9a58641e8 100644 --- a/arch/mips/pci/fixup-sni.c +++ b/arch/mips/pci/fixup-sni.c @@ -40,7 +40,7 @@ * seem to be a documentation error. At least on my RM200C the Cirrus * Logic CL-GD5434 VGA is device 3. */ -static char irq_tab_rm200[8][5] __initdata = { +static char irq_tab_rm200[8][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* EISA bridge */ { SCSI, SCSI, SCSI, SCSI, SCSI }, /* SCSI */ @@ -57,7 +57,7 @@ static char irq_tab_rm200[8][5] __initdata = { * * The VGA card is optional for RM300 systems. */ -static char irq_tab_rm300d[8][5] __initdata = { +static char irq_tab_rm300d[8][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* EISA bridge */ { SCSI, SCSI, SCSI, SCSI, SCSI }, /* SCSI */ @@ -69,7 +69,7 @@ static char irq_tab_rm300d[8][5] __initdata = { { 0, INTD, INTA, INTB, INTC }, /* Slot 4 */ }; -static char irq_tab_rm300e[5][5] __initdata = { +static char irq_tab_rm300e[5][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* HOST bridge */ { SCSI, SCSI, SCSI, SCSI, SCSI }, /* SCSI */ @@ -96,7 +96,7 @@ static char irq_tab_rm300e[5][5] __initdata = { #define INTC PCIT_IRQ_INTC #define INTD PCIT_IRQ_INTD -static char irq_tab_pcit[13][5] __initdata = { +static char irq_tab_pcit[13][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* HOST bridge */ { SCSI0, SCSI0, SCSI0, SCSI0, SCSI0 }, /* SCSI */ @@ -113,7 +113,7 @@ static char irq_tab_pcit[13][5] __initdata = { { 0, INTA, INTB, INTC, INTD }, /* Slot 5 */ }; -static char irq_tab_pcit_cplus[13][5] __initdata = { +static char irq_tab_pcit_cplus[13][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* HOST bridge */ { 0, INTB, INTC, INTD, INTA }, /* PCI Slot 9 */ @@ -130,7 +130,7 @@ static inline int is_rm300_revd(void) return (csmsr & 0xa0) == 0x20; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (sni_brd_type) { case SNI_BRD_PCI_TOWER_CPLUS: diff --git a/arch/mips/pci/fixup-tb0219.c b/arch/mips/pci/fixup-tb0219.c index d0b0083fbd27..cc581535f257 100644 --- a/arch/mips/pci/fixup-tb0219.c +++ b/arch/mips/pci/fixup-tb0219.c @@ -23,7 +23,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0226.c b/arch/mips/pci/fixup-tb0226.c index 4196ccf3ea3d..b827b5cad5fd 100644 --- a/arch/mips/pci/fixup-tb0226.c +++ b/arch/mips/pci/fixup-tb0226.c @@ -23,7 +23,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0287.c b/arch/mips/pci/fixup-tb0287.c index 8c5039ed75d7..98f26285f2e3 100644 --- a/arch/mips/pci/fixup-tb0287.c +++ b/arch/mips/pci/fixup-tb0287.c @@ -22,7 +22,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char bus; int irq = -1; diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c index e99ca7702d8a..f15ec98de2de 100644 --- a/arch/mips/pci/pci-alchemy.c +++ b/arch/mips/pci/pci-alchemy.c @@ -522,7 +522,7 @@ static int __init alchemy_pci_init(void) arch_initcall(alchemy_pci_init); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct alchemy_pci_context *ctx = dev->sysdata; if (ctx && ctx->board_map_irq) diff --git a/arch/mips/pci/pci-bcm47xx.c b/arch/mips/pci/pci-bcm47xx.c index 76f16eaed0ad..230d7dd273e2 100644 --- a/arch/mips/pci/pci-bcm47xx.c +++ b/arch/mips/pci/pci-bcm47xx.c @@ -28,7 +28,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return 0; } diff --git a/arch/mips/pci/pci-lasat.c b/arch/mips/pci/pci-lasat.c index 40d2797d2bc4..47f4ee6bbb3b 100644 --- a/arch/mips/pci/pci-lasat.c +++ b/arch/mips/pci/pci-lasat.c @@ -61,7 +61,7 @@ arch_initcall(lasat_pci_setup); #define LASAT_IRQ_PCIC (LASAT_IRQ_BASE + 7) #define LASAT_IRQ_PCID (LASAT_IRQ_BASE + 8) -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (slot) { case 1: diff --git a/arch/mips/pci/pci-mt7620.c b/arch/mips/pci/pci-mt7620.c index 4e633c1e7ff3..90fba9bf98da 100644 --- a/arch/mips/pci/pci-mt7620.c +++ b/arch/mips/pci/pci-mt7620.c @@ -361,7 +361,7 @@ static int mt7620_pci_probe(struct platform_device *pdev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; u32 val; diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c index 9ee01936862e..3e92a06fa772 100644 --- a/arch/mips/pci/pci-octeon.c +++ b/arch/mips/pci/pci-octeon.c @@ -59,8 +59,7 @@ union octeon_pci_address { } s; }; -int __initconst (*octeon_pcibios_map_irq)(const struct pci_dev *dev, - u8 slot, u8 pin); +int (*octeon_pcibios_map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; /** @@ -74,7 +73,7 @@ enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; * as it goes through each bridge. * Returns Interrupt number for the device */ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (octeon_pcibios_map_irq) return octeon_pcibios_map_irq(dev, slot, pin); diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c index d6360fe73d05..711cdccdf65b 100644 --- a/arch/mips/pci/pci-rt2880.c +++ b/arch/mips/pci/pci-rt2880.c @@ -181,7 +181,7 @@ static inline void rt2880_pci_write_u32(unsigned long reg, u32 val) spin_unlock_irqrestore(&rt2880_pci_lock, flags); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; int irq = -1; diff --git a/arch/mips/pci/pci-rt3883.c b/arch/mips/pci/pci-rt3883.c index 04f8ea953297..958899ffe99c 100644 --- a/arch/mips/pci/pci-rt3883.c +++ b/arch/mips/pci/pci-rt3883.c @@ -564,7 +564,7 @@ err_put_intc_node: return err; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/pci-tx4938.c b/arch/mips/pci/pci-tx4938.c index 000c0e1f9ef8..a6418460e3c4 100644 --- a/arch/mips/pci/pci-tx4938.c +++ b/arch/mips/pci/pci-tx4938.c @@ -112,7 +112,7 @@ int __init tx4938_pciclk66_setup(void) return pciclk; } -int __init tx4938_pcic1_map_irq(const struct pci_dev *dev, u8 slot) +int tx4938_pcic1_map_irq(const struct pci_dev *dev, u8 slot) { if (get_tx4927_pcicptr(dev->bus->sysdata) == tx4938_pcic1ptr) { switch (slot) { diff --git a/arch/mips/pci/pci-tx4939.c b/arch/mips/pci/pci-tx4939.c index 9d6acc00f348..09a65f7dbe7c 100644 --- a/arch/mips/pci/pci-tx4939.c +++ b/arch/mips/pci/pci-tx4939.c @@ -48,7 +48,7 @@ void __init tx4939_report_pci1clk(void) ((pciclk + 50000) / 100000) % 10); } -int __init tx4939_pcic1_map_irq(const struct pci_dev *dev, u8 slot) +int tx4939_pcic1_map_irq(const struct pci_dev *dev, u8 slot) { if (get_tx4927_pcicptr(dev->bus->sysdata) == tx4939_pcic1ptr) { switch (slot) { @@ -68,7 +68,7 @@ int __init tx4939_pcic1_map_irq(const struct pci_dev *dev, u8 slot) return -1; } -int __init tx4939_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int tx4939_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = tx4939_pcic1_map_irq(dev, slot); diff --git a/arch/mips/pci/pci-xlp.c b/arch/mips/pci/pci-xlp.c index 7babf01600cb..9eff9137f78e 100644 --- a/arch/mips/pci/pci-xlp.c +++ b/arch/mips/pci/pci-xlp.c @@ -205,7 +205,7 @@ int xlp_socdev_to_node(const struct pci_dev *lnkdev) return PCI_SLOT(lnkdev->devfn) / 8; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct pci_dev *lnkdev; int lnkfunc, node; diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c index 26d2dabef281..2a1c81a129ba 100644 --- a/arch/mips/pci/pci-xlr.c +++ b/arch/mips/pci/pci-xlr.c @@ -315,7 +315,7 @@ static void xls_pcie_ack_b(struct irq_data *d) } } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return get_irq_vector(dev); } diff --git a/arch/mips/pci/pcie-octeon.c b/arch/mips/pci/pcie-octeon.c index ad3584dbc9d7..fd2887415bc8 100644 --- a/arch/mips/pci/pcie-octeon.c +++ b/arch/mips/pci/pcie-octeon.c @@ -1464,8 +1464,7 @@ static int cvmx_pcie_rc_initialize(int pcie_port) * as it goes through each bridge. * Returns Interrupt number for the device */ -int __init octeon_pcie_pcibios_map_irq(const struct pci_dev *dev, - u8 slot, u8 pin) +int octeon_pcie_pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { /* * The EBH5600 board with the PCI to PCIe bridge mistakenly diff --git a/arch/mips/txx9/generic/pci.c b/arch/mips/txx9/generic/pci.c index 0bd2a1e1ff9a..fb998726bd5d 100644 --- a/arch/mips/txx9/generic/pci.c +++ b/arch/mips/txx9/generic/pci.c @@ -386,9 +386,10 @@ int pcibios_plat_dev_init(struct pci_dev *dev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int (*txx9_pci_map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { - return txx9_board_vec->pci_map_irq(dev, slot, pin); + return txx9_pci_map_irq(dev, slot, pin); } char * (*txx9_board_pcibios_setup)(char *str) __initdata; @@ -424,5 +425,8 @@ char *__init txx9_pcibios_setup(char *str) txx9_pci_err_action = TXX9_PCI_ERR_IGNORE; return NULL; } + + txx9_pci_map_irq = txx9_board_vec->pci_map_irq; + return str; } -- cgit From fbcab13d2e2511a858590846ac2e2d7cbd830591 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:28 -0400 Subject: selftests: silence test output by default Some of the networking tests are very noisy and make it impossible to see if we actually passed the tests as they run. Default to suppressing the output from any tests run in order to make it easier to track what failed. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 693616651da5..4665463779f5 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -24,7 +24,7 @@ define RUN_TESTS echo "selftests: Warning: file $$BASENAME_TEST is not executable, correct this.";\ echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; \ else \ - cd `dirname $$TEST` > /dev/null; (./$$BASENAME_TEST && echo "ok 1..$$test_num selftests: $$BASENAME_TEST [PASS]") || echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; cd - > /dev/null;\ + cd `dirname $$TEST` > /dev/null; (./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo "ok 1..$$test_num selftests: $$BASENAME_TEST [PASS]") || echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; cd - > /dev/null;\ fi; \ done; endef @@ -55,7 +55,7 @@ endif define EMIT_TESTS @for TEST in $(TEST_GEN_PROGS) $(TEST_PROGS); do \ BASENAME_TEST=`basename $$TEST`; \ - echo "(./$$BASENAME_TEST && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ + echo "(./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ done; endef -- cgit From 422d8dc6fd3afecd66bd6acfcd73a2d53e338ff3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:26 -0400 Subject: selftest: add a reuseaddr test This is to test for a regression introduced by b9470c27607b ("inet: kill smallest_size and smallest_port") which introduced a problem with reuseaddr and bind conflicts. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/reuseaddr_conflict.c | 114 +++++++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/reuseaddr_conflict.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 9801253e4802..c612d6e38c62 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -6,3 +6,4 @@ reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa reuseport_dualstack +reuseaddr_conflict diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index de1f5772b878..3df542c84610 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy +TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict include ../lib.mk diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c new file mode 100644 index 000000000000..7c5b12664b03 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -0,0 +1,114 @@ +/* + * Test for the regression introduced by + * + * b9470c27607b ("inet: kill smallest_size and smallest_port") + * + * If we open an ipv4 socket on a port with reuseaddr we shouldn't reset the tb + * when we open the ipv6 conterpart, which is what was happening previously. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 9999 + +int open_port(int ipv6, int any) +{ + int fd = -1; + int reuseaddr = 1; + int v6only = 1; + int addrlen; + int ret = -1; + struct sockaddr *addr; + int family = ipv6 ? AF_INET6 : AF_INET; + + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = htons(PORT), + .sin6_addr = in6addr_any + }; + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = htons(PORT), + .sin_addr.s_addr = any ? htonl(INADDR_ANY) : inet_addr("127.0.0.1"), + }; + + + if (ipv6) { + addr = (struct sockaddr*)&addr6; + addrlen = sizeof(addr6); + } else { + addr = (struct sockaddr*)&addr4; + addrlen = sizeof(addr4); + } + + if ((fd = socket(family, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + goto out; + } + + if (ipv6 && setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (void*)&v6only, + sizeof(v6only)) < 0) { + perror("setsockopt IPV6_V6ONLY"); + goto out; + } + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, + sizeof(reuseaddr)) < 0) { + perror("setsockopt SO_REUSEADDR"); + goto out; + } + + if (bind(fd, addr, addrlen) < 0) { + perror("bind"); + goto out; + } + + if (any) + return fd; + + if (listen(fd, 1) < 0) { + perror("listen"); + goto out; + } + return fd; +out: + close(fd); + return ret; +} + +int main(void) +{ + int listenfd; + int fd1, fd2; + + fprintf(stderr, "Opening 127.0.0.1:%d\n", PORT); + listenfd = open_port(0, 0); + if (listenfd < 0) + error(1, errno, "Couldn't open listen socket"); + fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT); + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket"); + fprintf(stderr, "Opening in6addr_any:%d\n", PORT); + fd1 = open_port(1, 1); + if (fd1 < 0) + error(1, errno, "Couldn't open ipv6 reuseport"); + fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT); + fd2 = open_port(0, 1); + if (fd2 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket"); + close(fd1); + fprintf(stderr, "Opening INADDR_ANY:%d after closing ipv6 socket\n", PORT); + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); + fprintf(stderr, "Success"); + return 0; +} -- cgit From e06d79fbc338935ef27befc84f8b8b2e5f878a10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:27 -0400 Subject: selftests: actually run the various net selftests These self tests are just self contained binaries, they are not run by any of the scripts in the directory. This means they need to be marked with TEST_GEN_PROGS to actually be run, not TEST_GEN_FILES. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/net/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3df542c84610..d86bca991f45 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -5,9 +5,9 @@ CFLAGS += -I../../../../usr/include/ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh TEST_GEN_FILES = socket -TEST_GEN_FILES += psock_fanout psock_tpacket -TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict +TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy +TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict include ../lib.mk -- cgit From 06e8852ceecccdeff6841a6c5cd78a947a75d5bc Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:19 +0200 Subject: selftests/net: msg_zerocopy enable build with older kernel headers Explicitly define SO_EE_ORIGIN_ZEROCOPY. This makes the test program build with older kernel headers, e.g. from Debian 9. Signed-off-by: Thomas Meyer Signed-off-by: Shuah Khan --- tools/testing/selftests/net/msg_zerocopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 40232af5b023..3ab6ec403905 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -55,7 +55,7 @@ #include #ifndef SO_EE_ORIGIN_ZEROCOPY -#define SO_EE_ORIGIN_ZEROCOPY SO_EE_ORIGIN_UPAGE +#define SO_EE_ORIGIN_ZEROCOPY 5 #endif #ifndef SO_ZEROCOPY -- cgit From 9e987b70ada27554c5d176421de1d167218c49b5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 15 Sep 2017 17:35:27 -0700 Subject: ACPI / bus: Make ACPI_HANDLE() work for non-GPL code again Due to commit db3e50f3234b (device property: Get rid of struct fwnode_handle type field), ACPI_HANDLE() inadvertently became a GPL-only call. The call path that led to that was: ACPI_HANDLE() ACPI_COMPANION() to_acpi_device_node() is_acpi_device_node() acpi_device_fwnode_ops DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); ...and the new DECLARE_ACPI_FWNODE_OPS() includes EXPORT_SYMBOL_GPL, whereas previously it was a static struct. In order to avoid changing any of that, let's instead provide ever so slightly better encapsulation of those struct fwnode_operations instances. Those do not really need to be directly used in inline function calls in header files. Simply moving two small functions (is_acpi_device_node and is_acpi_data_node) out of acpi_bus.h, and into a .c file, does that. That leaves the internals of struct fwnode_operations as GPL-only (which I think was the intent all along), but un-breaks any driver code out there that relies on the ACPI subsystem's being (historically) an EXPORT_SYMBOL-usable system. By that, I mean, ACPI_HANDLE() and other basic ACPI calls were non-GPL-protected. Also, while I'm there, remove a tiny bit of redundancy that was missed in the earlier commit, by having is_acpi_node() use the other two routines, instead of checking fwnode directly. Fixes: db3e50f3234b (device property: Get rid of struct fwnode_handle type field) Signed-off-by: John Hubbard Acked-by: Sakari Ailus Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 13 +++++++++++++ include/acpi/acpi_bus.h | 18 ++++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index c1c216163de3..1e3c2517a1ac 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1293,3 +1293,16 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; + +bool is_acpi_device_node(const struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && + fwnode->ops == &acpi_device_fwnode_ops; +} +EXPORT_SYMBOL(is_acpi_device_node); + +bool is_acpi_data_node(const struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; +} +EXPORT_SYMBOL(is_acpi_data_node); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index dedf9d789166..fa1505292f6c 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -399,17 +399,12 @@ extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; +bool is_acpi_device_node(const struct fwnode_handle *fwnode); +bool is_acpi_data_node(const struct fwnode_handle *fwnode); + static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && - (fwnode->ops == &acpi_device_fwnode_ops - || fwnode->ops == &acpi_data_fwnode_ops); -} - -static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) -{ - return !IS_ERR_OR_NULL(fwnode) && - fwnode->ops == &acpi_device_fwnode_ops; + return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ @@ -422,11 +417,6 @@ static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) NULL; \ }) -static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) -{ - return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; -} - #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ -- cgit From 6073512cc8e2c48bed5c6625c02c5e4ae50cec34 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 18 Sep 2017 14:59:20 +0200 Subject: net: phy: Kconfig: Fix PHY infrastructure menu in menuconfig Since the integration of PHYLINK, the configuration option which used to be under the PHY infrastructure menu in menuconfig ended up one level up (the network device driver section) By placing PHYLINK option right after PHYLIB entry, it broke the way Kconfig used to build the menu. See kconfig-language.txt, section "Menu structure", 2nd method. This is fixed by placing the PHYLINK option just before PHYLIB. Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: Jerome Brunet Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index a9d16a3af514..cd931cf9dcc2 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -160,15 +160,6 @@ config MDIO_XGENE endif -menuconfig PHYLIB - tristate "PHY Device support and infrastructure" - depends on NETDEVICES - select MDIO_DEVICE - help - Ethernet controllers are usually attached to PHY - devices. This option provides infrastructure for - managing PHY devices. - config PHYLINK tristate depends on NETDEVICES @@ -179,6 +170,15 @@ config PHYLINK configuration links, PHYs, and Serdes links with MAC level autonegotiation modes. +menuconfig PHYLIB + tristate "PHY Device support and infrastructure" + depends on NETDEVICES + select MDIO_DEVICE + help + Ethernet controllers are usually attached to PHY + devices. This option provides infrastructure for + managing PHY devices. + if PHYLIB config SWPHY -- cgit From b247c211ee367cfe975dd54e381094083adfd8d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 19 Sep 2017 02:43:13 +0200 Subject: PM: docs: Drop an excess character from devices.rst Drop an excess "`" from Documentation/driver-api/pm/devices.rst. Fixes: 2728b2d2e5be (PM / core / docs: Convert sleep states API document to reST) Signed-off-by: Rafael J. Wysocki --- Documentation/driver-api/pm/devices.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index bedd32388dac..a0dc2879a152 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -675,7 +675,7 @@ sub-domain of the parent domain. Support for power domains is provided through the :c:member:`pm_domain` field of |struct device|. This field is a pointer to an object of type -|struct dev_pm_domain|, defined in :file:`include/linux/pm.h``, providing a set +|struct dev_pm_domain|, defined in :file:`include/linux/pm.h`, providing a set of power management callbacks analogous to the subsystem-level and device driver callbacks that are executed for the given device during all power transitions, instead of the respective subsystem-level callbacks. Specifically, if a -- cgit From 157c460e10cb6eca29ccbd0f023db159d0c55ec7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 19 Sep 2017 02:22:39 +0200 Subject: PM: core: Fix device_pm_check_callbacks() The device_pm_check_callbacks() function doesn't check legacy ->suspend and ->resume callback pointers under the device's bus type, class and driver, so in some cases it may set the no_pm_callbacks flag for the device incorrectly and then the callbacks may be skipped during system suspend/resume, which shouldn't happen. Fixes: aa8e54b55947 (PM / sleep: Go direct_complete if driver has no callbacks) Signed-off-by: Rafael J. Wysocki Cc: 4.5+ # 4.5+ --- drivers/base/power/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index ea1732ed7a9d..770b1539a083 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1860,10 +1860,13 @@ void device_pm_check_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_pm_callbacks = - (!dev->bus || pm_ops_is_empty(dev->bus->pm)) && - (!dev->class || pm_ops_is_empty(dev->class->pm)) && + (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && + !dev->bus->suspend && !dev->bus->resume)) && + (!dev->class || (pm_ops_is_empty(dev->class->pm) && + !dev->class->suspend && !dev->class->resume)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && - (!dev->driver || pm_ops_is_empty(dev->driver->pm)); + (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && + !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irq(&dev->power.lock); } -- cgit From ff76898c0a14a43f71bfe63dd1903087e96ce238 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 19 Sep 2017 08:23:22 -0700 Subject: cpufreq: dt-platdev: Add some missing platforms to the blacklist Commit edeec420de24 (cpufreq: dt-platdev: Automatically create cpufreq device with OPP v2) missed adding few platforms to the blacklist which create the cpufreq-dt device from their own drivers, after some dependencies are sorted out. And for those platforms, both the platform specific driver and the cpufreq-dt-platdev driver try to create the cpufreq-dt device now. Fix that by including those platforms in the blacklist. This doesn't include the TI platforms, for which there is a separate patch. Fixes: edeec420de24 (cpufreq: dt-cpufreq: platdev Automatically create device with OPP v2) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index a020da7940d6..430edadca527 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -106,6 +106,18 @@ static const struct of_device_id whitelist[] __initconst = { * platforms using "operating-points-v2" property. */ static const struct of_device_id blacklist[] __initconst = { + { .compatible = "calxeda,highbank", }, + { .compatible = "calxeda,ecx-2000", }, + + { .compatible = "marvell,armadaxp", }, + + { .compatible = "nvidia,tegra124", }, + + { .compatible = "st,stih407", }, + { .compatible = "st,stih410", }, + + { .compatible = "sigma,tango4", }, + { } }; -- cgit From ed40fad9a568efe83f5e80897ded71117b6911b3 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 31 Aug 2017 22:24:36 +0200 Subject: ARM: cpuidle: Avoid memleak if init fail In case there are no DT idle states defined or cpuidle_register_driver() fails, the copy of the idle driver is leaked: unreferenced object 0xede0dc00 (size 1024): comm "swapper/0", pid 1, jiffies 4294937431 (age 744.510s) hex dump (first 32 bytes): 94 9e 0b c1 00 00 00 00 00 00 00 00 00 00 00 00 ................ 57 46 49 00 00 00 00 00 00 00 00 00 00 00 00 00 WFI............. backtrace: [] arm_idle_init+0x44/0x1ac [] do_one_initcall+0x3c/0x16c [] kernel_init_freeable+0x110/0x1d0 [] kernel_init+0x8/0x114 [] ret_from_fork+0x14/0x3c So fix this by freeing the unregistered copy in error case. Signed-off-by: Stefan Wahren Fixes: d50a7d8acd78 (ARM: cpuidle: Support asymmetric idle definition) Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-arm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c index 7080c384ad5d..52a75053ee03 100644 --- a/drivers/cpuidle/cpuidle-arm.c +++ b/drivers/cpuidle/cpuidle-arm.c @@ -104,13 +104,13 @@ static int __init arm_idle_init(void) ret = dt_init_idle_driver(drv, arm_idle_state_match, 1); if (ret <= 0) { ret = ret ? : -ENODEV; - goto out_fail; + goto init_fail; } ret = cpuidle_register_driver(drv); if (ret) { pr_err("Failed to register cpuidle driver\n"); - goto out_fail; + goto init_fail; } /* @@ -149,6 +149,8 @@ static int __init arm_idle_init(void) } return 0; +init_fail: + kfree(drv); out_fail: while (--cpu >= 0) { dev = per_cpu(cpuidle_devices, cpu); -- cgit From 0c0bceb796878a5baea1f47033215fac0774e498 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 8 Sep 2017 12:24:41 +0300 Subject: ACPI: properties: Return _DSD hierarchical extension (data) sub-nodes correctly The recently merged patch "ACPI: Prepare for constifying acpi_get_next_subnode() fwnode argument" was part of a patchset constifying the fwnode arguments across the fwnode property API. The purpose of the patch was to allow returning non-const fwnodes from a data structure the root of which is const. Unfortunately the patch introduced the functionality, in particular when starting parsed from an ACPI device node, the hierarchical data extension nodes would not be enumerated. Restore the old behaviour while still retaining constness properties of the patch. Fixes: 01c1da289791 "ACPI: Prepare for constifying acpi_get_next_subnode() fwnode argument" Signed-off-by: Sakari Ailus Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index c1c216163de3..265b74f440b6 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -908,11 +908,12 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { const struct acpi_device *adev = to_acpi_device_node(fwnode); - struct acpi_device *child_adev = NULL; const struct list_head *head; struct list_head *next; if (!child || is_acpi_device_node(child)) { + struct acpi_device *child_adev; + if (adev) head = &adev->children; else @@ -922,8 +923,8 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, goto nondev; if (child) { - child_adev = to_acpi_device_node(child); - next = child_adev->node.next; + adev = to_acpi_device_node(child); + next = adev->node.next; if (next == head) { child = NULL; goto nondev; @@ -941,8 +942,8 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, const struct acpi_data_node *data = to_acpi_data_node(fwnode); struct acpi_data_node *dn; - if (child_adev) - head = &child_adev->data.subnodes; + if (adev) + head = &adev->data.subnodes; else if (data) head = &data->data.subnodes; else -- cgit From 0647169cf9aa441700eb8f23ea49be060626534b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 12:41:37 +0200 Subject: rhashtable: Documentation tweak Clarify that rhashtable_walk_{stop,start} will not reset the iterator to the beginning of the hash table. Confusion between rhashtable_walk_enter and rhashtable_walk_start has already lead to a bug. Signed-off-by: Andreas Gruenbacher Signed-off-by: David S. Miller --- lib/rhashtable.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 707ca5d677c6..ddd7dde87c3c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -735,9 +735,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * rhashtable_walk_start - Start a hash table walk * @iter: Hash table iterator * - * Start a hash table walk. Note that we take the RCU lock in all - * cases including when we return an error. So you must always call - * rhashtable_walk_stop to clean up. + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * @@ -846,7 +846,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * - * Finish a hash table walk. + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) -- cgit From 75df6e688ccd517e339a7c422ef7ad73045b18a2 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 17 Sep 2017 03:23:48 -0700 Subject: tracing: Fix trace_pipe behavior for instance traces When reading data from trace_pipe, tracing_wait_pipe() performs a check to see if tracing has been turned off after some data was read. Currently, this check always looks at global trace state, but it should be checking the trace instance where trace_pipe is located at. Because of this bug, cat instances/i1/trace_pipe in the following script will immediately exit instead of waiting for data: cd /sys/kernel/debug/tracing echo 0 > tracing_on mkdir -p instances/i1 echo 1 > instances/i1/tracing_on echo 1 > instances/i1/events/sched/sched_process_exec/enable cat instances/i1/trace_pipe Link: http://lkml.kernel.org/r/20170917102348.1615-1-tahsin@google.com Cc: stable@vger.kernel.org Fixes: 10246fa35d4f ("tracing: give easy way to clear trace buffer") Signed-off-by: Tahsin Erdogan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3ca35f38803..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5680,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); -- cgit From 930651a75bf1ba6893a8b8475270664ebdb6cf4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 09:15:59 -0700 Subject: bpf: do not disable/enable BH in bpf_map_free_id() syzkaller reported following splat [1] Since hard irq are disabled by the caller, bpf_map_free_id() should not try to enable/disable BH. Another solution would be to change htab_map_delete_elem() to defer the free_htab_elem() call after raw_spin_unlock_irqrestore(&b->lock, flags), but this might be not enough to cover other code paths. [1] WARNING: CPU: 1 PID: 8052 at kernel/softirq.c:161 __local_bh_enable_ip +0x1e/0x160 kernel/softirq.c:161 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 8052 Comm: syz-executor1 Not tainted 4.13.0-next-20170915+ #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:__local_bh_enable_ip+0x1e/0x160 kernel/softirq.c:161 RSP: 0018:ffff8801cdcd7748 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000201 RCX: 0000000000000000 RDX: 1ffffffff0b5933c RSI: 0000000000000201 RDI: ffffffff85ac99e0 RBP: ffff8801cdcd7758 R08: ffffffff85b87158 R09: 1ffff10039b9aec6 R10: ffff8801c99f24c0 R11: 0000000000000002 R12: ffffffff817b0b47 R13: dffffc0000000000 R14: ffff8801cdcd77e8 R15: 0000000000000001 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline] _raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:207 spin_unlock_bh include/linux/spinlock.h:361 [inline] bpf_map_free_id kernel/bpf/syscall.c:197 [inline] __bpf_map_put+0x267/0x320 kernel/bpf/syscall.c:227 bpf_map_put+0x1a/0x20 kernel/bpf/syscall.c:235 bpf_map_fd_put_ptr+0x15/0x20 kernel/bpf/map_in_map.c:96 free_htab_elem+0xc3/0x1b0 kernel/bpf/hashtab.c:658 htab_map_delete_elem+0x74d/0x970 kernel/bpf/hashtab.c:1063 map_delete_elem kernel/bpf/syscall.c:633 [inline] SYSC_bpf kernel/bpf/syscall.c:1479 [inline] SyS_bpf+0x2188/0x46a0 kernel/bpf/syscall.c:1451 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: f3f1c054c288 ("bpf: Introduce bpf_map ID") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d43..25d074920a00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map) static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { + unsigned long flags; + if (do_idr_lock) - spin_lock_bh(&map_idr_lock); + spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); if (do_idr_lock) - spin_unlock_bh(&map_idr_lock); + spin_unlock_irqrestore(&map_idr_lock, flags); else __release(&map_idr_lock); } -- cgit From 039cc1c1eb971ffe1973fddb8769a80fd4950a6f Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Tue, 19 Sep 2017 15:06:13 -0500 Subject: cpufreq: ti-cpufreq: Support additional am43xx platforms Rather than letting the ti-cpufreq driver match against 'ti,am4372' machine compatible during probe let's match against 'ti,am43' so that we can support both 'ti,am4372' and 'ti,am438x' platforms which both match to this compatible. Signed-off-by: Dave Gerlach Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/ti-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index b29cd3398463..4bf47de6101f 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -190,7 +190,7 @@ static int ti_cpufreq_setup_syscon_register(struct ti_cpufreq_data *opp_data) static const struct of_device_id ti_cpufreq_of_match[] = { { .compatible = "ti,am33xx", .data = &am3x_soc_data, }, - { .compatible = "ti,am4372", .data = &am4x_soc_data, }, + { .compatible = "ti,am43", .data = &am4x_soc_data, }, { .compatible = "ti,dra7", .data = &dra7_soc_data }, {}, }; -- cgit From 2a4776e14f7da3d48fffb4edbb82355742f23478 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:10 +0100 Subject: net: hns3: Fixes initialization of phy address from firmware Default phy address of every port is 0. Therefore, phy address for each port need to be fetched from firmware and device initialized with fetched non-default phy address. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index bb45365fb817..db4e07dac29a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1066,6 +1066,7 @@ static int hclge_configure(struct hclge_dev *hdev) for (i = 0; i < ETH_ALEN; i++) hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; hdev->hw.mac.media_type = cfg.media_type; + hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; hdev->tm_info.num_pg = 1; hdev->tm_info.num_tc = cfg.tc_num; -- cgit From c5b1b97522ef32d2170c9aa1a0c1eec179acbb3a Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:11 +0100 Subject: net: hns3: Fixes the command used to unmap ring from vector This patch fixes the IMP command being used to unmap the vector from the corresponding ring. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index db4e07dac29a..e324bc6e9f4f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2779,7 +2779,7 @@ static int hclge_unmap_ring_from_vector( } i = 0; hclge_cmd_setup_basic_desc(&desc, - HCLGE_OPC_ADD_RING_TO_VECTOR, + HCLGE_OPC_DEL_RING_TO_VECTOR, false); req->int_vector_id = vector_id; } -- cgit From 0305b443a3ba5b84aa474786026df04a70460135 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:12 +0100 Subject: net: hns3: Fixes ring-to-vector map-and-unmap command This patch fixes the vector-to-ring map and unmap command and adds INT_GL(for, Gap Limiting Interrupts) and VF id to it as required by the hardware interface. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Mingguang Qu Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 8 ++++++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 91ae0135ee50..c2b613b40509 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -238,7 +238,7 @@ struct hclge_tqp_map { u8 rsv[18]; }; -#define HCLGE_VECTOR_ELEMENTS_PER_CMD 11 +#define HCLGE_VECTOR_ELEMENTS_PER_CMD 10 enum hclge_int_type { HCLGE_INT_TX, @@ -252,8 +252,12 @@ struct hclge_ctrl_vector_chain { #define HCLGE_INT_TYPE_S 0 #define HCLGE_INT_TYPE_M 0x3 #define HCLGE_TQP_ID_S 2 -#define HCLGE_TQP_ID_M (0x3fff << HCLGE_TQP_ID_S) +#define HCLGE_TQP_ID_M (0x7ff << HCLGE_TQP_ID_S) +#define HCLGE_INT_GL_IDX_S 13 +#define HCLGE_INT_GL_IDX_M (0x3 << HCLGE_INT_GL_IDX_S) __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD]; + u8 vfid; + u8 rsv; }; #define HCLGE_TC_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e324bc6e9f4f..eafd9c678162 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2680,7 +2680,11 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; @@ -2764,8 +2768,12 @@ static int hclge_unmap_ring_from_vector( hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; -- cgit From 139e8792537b327252a9676591a78e6408d50a85 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:13 +0100 Subject: net: hns3: Fixes the initialization of MAC address in hardware This patch fixes the initialization of MAC address, fetched from HNS3 firmware i.e. when it is not randomly generated, to the HNS3 hardware. Fixes: ca60906d2795 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 1c3e29447891..4d68d6ea5143 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -2705,10 +2705,11 @@ static void hns3_init_mac_addr(struct net_device *netdev) eth_hw_addr_random(netdev); dev_warn(priv->dev, "using random MAC address %pM\n", netdev->dev_addr); - /* Also copy this new MAC address into hdev */ - if (h->ae_algo->ops->set_mac_addr) - h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); } + + if (h->ae_algo->ops->set_mac_addr) + h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); + } static void hns3_nic_set_priv_ops(struct net_device *netdev) -- cgit From fbbb1536b220eb4f4f95cbceae6579489a8adad5 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 19 Sep 2017 17:17:14 +0100 Subject: net: hns3: Fixes the ether address copy with appropriate API This patch replaces the ethernet address copy instance with more appropriate ether_addr_copy() function. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eafd9c678162..8e172afd4876 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1063,8 +1063,7 @@ static int hclge_configure(struct hclge_dev *hdev) hdev->base_tqp_pid = 0; hdev->rss_size_max = 1; hdev->rx_buf_len = cfg.rx_buf_len; - for (i = 0; i < ETH_ALEN; i++) - hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; + ether_addr_copy(hdev->hw.mac.mac_addr, cfg.mac_addr); hdev->hw.mac.media_type = cfg.media_type; hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; -- cgit From 5e43aef8491ae3b5feb79cd15260faf39303ef33 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:15 +0100 Subject: net: hns3: Fixes the default VLAN-id of PF When there is no vlan id in the packets, hardware will treat the vlan id as 0 and look for the mac_vlan table. This patch set the default vlan id of PF as 0. Without this config, it will fail when look for mac_vlan table, and hardware will drop packets. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Mingguang Qu Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 8e172afd4876..74008ef23169 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3673,6 +3673,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) { #define HCLGE_VLAN_TYPE_VF_TABLE 0 #define HCLGE_VLAN_TYPE_PORT_TABLE 1 + struct hnae3_handle *handle; int ret; ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_VF_TABLE, @@ -3682,8 +3683,11 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_PORT_TABLE, true); + if (ret) + return ret; - return ret; + handle = &hdev->vport[0].nic; + return hclge_set_port_vlan_filter(handle, htons(ETH_P_8021Q), 0, false); } static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) -- cgit From 90f7b11a5a0081feb7041fcc795c9a131a62a725 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:16 +0100 Subject: net: hns3: Fixes the premature exit of loop when matching clients When register/unregister ae_dev, ae_dev should match all client in the client_list. Enet and roce can co-exists together so we should continue checking for enet and roce presence together. So break should not be there. Above caused problems in loading and unloading of modules. Fixes: 38eddd126772 ("net: hns3: Add support of the HNAE3 framework") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 43 ++++++----------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 59efbd605416..5bcb2238acb2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -37,20 +37,15 @@ static bool hnae3_client_match(enum hnae3_client_type client_type, } static int hnae3_match_n_instantiate(struct hnae3_client *client, - struct hnae3_ae_dev *ae_dev, - bool is_reg, bool *matched) + struct hnae3_ae_dev *ae_dev, bool is_reg) { int ret; - *matched = false; - /* check if this client matches the type of ae_dev */ if (!(hnae3_client_match(client->type, ae_dev->dev_type) && hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))) { return 0; } - /* there is a match of client and dev */ - *matched = true; /* now, (un-)instantiate client by calling lower layer */ if (is_reg) { @@ -69,7 +64,6 @@ int hnae3_register_client(struct hnae3_client *client) { struct hnae3_client *client_tmp; struct hnae3_ae_dev *ae_dev; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -86,7 +80,7 @@ int hnae3_register_client(struct hnae3_client *client) /* if the client could not be initialized on current port, for * any error reasons, move on to next available port */ - ret = hnae3_match_n_instantiate(client, ae_dev, true, &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed for port\n"); @@ -102,12 +96,11 @@ EXPORT_SYMBOL(hnae3_register_client); void hnae3_unregister_client(struct hnae3_client *client) { struct hnae3_ae_dev *ae_dev; - bool matched; mutex_lock(&hnae3_common_lock); /* un-initialize the client on every matched port */ list_for_each_entry(ae_dev, &hnae3_ae_dev_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, &matched); + hnae3_match_n_instantiate(client, ae_dev, false); } list_del(&client->node); @@ -124,7 +117,6 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -151,13 +143,10 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) * initialize the figure out client instance */ list_for_each_entry(client, &hnae3_client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } } @@ -175,7 +164,6 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; mutex_lock(&hnae3_common_lock); /* Check if there are matched ae_dev */ @@ -187,12 +175,8 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) /* check the client list for the match with this ae_dev type and * un-initialize the figure out client instance */ - list_for_each_entry(client, &hnae3_client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - &matched); - if (matched) - break; - } + list_for_each_entry(client, &hnae3_client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev); hnae_set_bit(ae_dev->flag, HNAE3_DEV_INITED_B, 0); @@ -212,7 +196,6 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev) const struct pci_device_id *id; struct hnae3_ae_algo *ae_algo; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -246,13 +229,10 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev) * initialize the figure out client instance */ list_for_each_entry(client, &hnae3_client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } out_err: @@ -270,7 +250,6 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev) const struct pci_device_id *id; struct hnae3_ae_algo *ae_algo; struct hnae3_client *client; - bool matched; mutex_lock(&hnae3_common_lock); /* Check if there are matched ae_algo */ @@ -279,12 +258,8 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev) if (!id) continue; - list_for_each_entry(client, &hnae3_client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - &matched); - if (matched) - break; - } + list_for_each_entry(client, &hnae3_client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev); hnae_set_bit(ae_dev->flag, HNAE3_DEV_INITED_B, 0); -- cgit From b5b7db8d680464b1d631fd016f5e093419f0bfd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 10:05:57 -0700 Subject: tcp: fastopen: fix on syn-data transmit failure Our recent change exposed a bug in TCP Fastopen Client that syzkaller found right away [1] When we prepare skb with SYN+DATA, we attempt to transmit it, and we update socket state as if the transmit was a success. In socket RTX queue we have two skbs, one with the SYN alone, and a second one containing the DATA. When (malicious) ACK comes in, we now complain that second one had no skb_mstamp. The proper fix is to make sure that if the transmit failed, we do not pretend we sent the DATA skb, and make it our send_head. When 3WHS completes, we can now send the DATA right away, without having to wait for a timeout. [1] WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117() WARN_ON_ONCE(last_ackt == 0); Modules linked in: CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff8800b35cb1d8 ffffffff81cad00d 0000000000000000 ffffffff828a4347 ffff88009f86c080 ffffffff8316eb20 0000000000000d7f ffff8800b35cb220 ffffffff812c33c2 ffff8800baad2440 00000009d46575c0 Call Trace: [] __dump_stack [] dump_stack+0xc1/0x124 [] warn_slowpath_common+0xe2/0x150 [] warn_slowpath_null+0x2e/0x40 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n [] tcp_ack+0x151d/0x3930 [] tcp_rcv_state_process+0x1c69/0x4fd0 [] tcp_v4_do_rcv+0x54f/0x7c0 [] sk_backlog_rcv [] __release_sock+0x12b/0x3a0 [] release_sock+0x5e/0x1c0 [] inet_wait_for_connect [] __inet_stream_connect+0x545/0xc50 [] tcp_sendmsg_fastopen [] tcp_sendmsg+0x2298/0x35a0 [] inet_sendmsg+0xe5/0x520 [] sock_sendmsg_nosec [] sock_sendmsg+0xcf/0x110 Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 517d737059d1..0bc9e46a5369 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3389,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3441,6 +3445,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ -- cgit From f55956065ec94e3e9371463d693a1029c4cc3007 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 19 Sep 2017 19:35:18 +0200 Subject: net: emac: Fix napi poll list corruption This patch is pretty much a carbon copy of commit 3079c652141f ("caif: Fix napi poll list corruption") with "caif" replaced by "emac". The commit d75b1ade567f ("net: less interrupt masking in NAPI") breaks emac. It is now required that if the entire budget is consumed when poll returns, the napi poll_list must remain empty. However, like some other drivers emac tries to do a last-ditch check and if there is more work it will call napi_reschedule and then immediately process some of this new work. Should the entire budget be consumed while processing such new work then we will violate the new caller contract. This patch fixes this by not touching any work when we reschedule in emac. Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/mal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index 2c74baa2398a..fff09dcf9e34 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -402,7 +402,7 @@ static int mal_poll(struct napi_struct *napi, int budget) unsigned long flags; MAL_DBG2(mal, "poll(%d)" NL, budget); - again: + /* Process TX skbs */ list_for_each(l, &mal->poll_list) { struct mal_commac *mc = @@ -451,7 +451,6 @@ static int mal_poll(struct napi_struct *napi, int budget) spin_lock_irqsave(&mal->lock, flags); mal_disable_eob_irq(mal); spin_unlock_irqrestore(&mal->lock, flags); - goto again; } mc->ops->poll_tx(mc->dev); } -- cgit From 7c30013133964aaa2f45c17d6e9782ac6cfd7f5f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Sep 2017 00:44:21 +0200 Subject: bpf: fix ri->map_owner pointer on bpf_prog_realloc Commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") passed the pointer to the prog itself to be loaded into r4 prior on bpf_redirect_map() helper call, so that we can store the owner into ri->map_owner out of the helper. Issue with that is that the actual address of the prog is still subject to change when subsequent rewrites occur that require slow path in bpf_prog_realloc() to alloc more memory, e.g. from patching inlining helper functions or constant blinding. Thus, we really need to take prog->aux as the address we're holding, which also works with prog clones as they share the same aux object. Instead of then fetching aux->prog during runtime, which could potentially incur cache misses due to false sharing, we are going to just use aux for comparison on the map owner. This will also keep the patchlet of the same size, and later check in xdp_map_invalid() only accesses read-only aux pointer from the prog, it's also in the same cacheline already from prior access when calling bpf_func. Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 7 ++++++- net/core/filter.c | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b2451ef2d..b914fbe1383e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent + * rewrites would still change the prog pointer. The only + * stable address we can use is aux, which also works with + * prog clones during blinding. + */ + u64 addr = (unsigned long)prog->aux; struct bpf_insn r4_ld[] = { BPF_LD_IMM64(BPF_REG_4, addr), *insn, diff --git a/net/core/filter.c b/net/core/filter.c index 24dd33dd9f04..82edad58d066 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,7 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - const struct bpf_prog *map_owner; + unsigned long map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2500,11 +2500,17 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, + unsigned long aux) +{ + return (unsigned long)xdp_prog->aux != aux; +} + static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2512,9 +2518,9 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2574,7 +2580,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2583,10 +2589,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; if (map) { - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2632,7 +2638,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; return XDP_REDIRECT; } @@ -2646,7 +2652,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { }; BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - const struct bpf_prog *, map_owner) + unsigned long, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); -- cgit From 6819a14ecbe2e089e5c5bb74edecafdde2028a00 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 4 Sep 2017 15:52:55 +0100 Subject: net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") incorrectly assumes that no RTM_NEWADDR are sent for addresses in tentative state, as this does happen for the standard IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in addconf_dad_stop(). So as a result of this change, no RTM_DELADDR is sent after DAD failure for a link-local when strict DAD (accept_dad=2) is configured, or on the next admin down in other cases. The absence of this notification breaks backwards compatibility and causes problems after DAD failure if this notification was being relied on. The solution is to allow RTM_DELADDR to still be sent after DAD failure. Fixes: f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") Signed-off-by: Mike Manning Cc: Mahesh Bandewar Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78787ec..d7dbcc8eda10 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4940,9 +4940,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); -- cgit From 35e015e1f5773417952fe91ce8790baf9b4237a2 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 12 Sep 2017 17:46:37 +0200 Subject: ipv6: fix net.ipv6.conf.all interface DAD handlers Currently, writing into net.ipv6.conf.all.{accept_dad,use_optimistic,optimistic_dad} has no effect. Fix handling of these flags by: - using the maximum of global and per-interface values for the accept_dad flag. That is, if at least one of the two values is non-zero, enable DAD on the interface. If at least one value is set to 2, enable DAD and disable IPv6 operation on the interface if MAC-based link-local address was found - using the logical OR of global and per-interface values for the optimistic_dad flag. If at least one of them is set to one, optimistic duplicate address detection (RFC 4429) is enabled on the interface - using the logical OR of global and per-interface values for the use_optimistic flag. If at least one of them is set to one, optimistic addresses won't be marked as deprecated during source address selection on the interface. While at it, as we're modifying the prototype for ipv6_use_optimistic_addr(), drop inline, and let the compiler decide. Fixes: 7fd2561e4ebd ("net: ipv6: Add a sysctl to make optimistic addresses useful candidates") Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++---- net/ipv6/addrconf.c | 27 ++++++++++++++++++++------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b3345d0fe0a6..77f4de59dc9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1680,6 +1680,9 @@ accept_dad - INTEGER 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate link-local address has been found. + DAD operation and mode on a given interface will be selected according + to the maximum value of conf/{all,interface}/accept_dad. + force_tllao - BOOLEAN Enable sending the target link-layer address option even when responding to a unicast neighbor solicitation. @@ -1727,16 +1730,23 @@ suppress_frag_ndisc - INTEGER optimistic_dad - BOOLEAN Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + Optimistic Duplicate Address Detection for the interface will be enabled + if at least one of conf/{all,interface}/optimistic_dad is set to 1, + it will be disabled otherwise. use_optimistic - BOOLEAN If enabled, do not classify optimistic addresses as deprecated during source address selection. Preferred addresses will still be chosen before optimistic addresses, subject to other ranking in the source address selection algorithm. - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + This will be enabled if at least one of + conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. stable_secret - IPv6 address This IPv6 address will be used as a secret to generate IPv6 diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d7dbcc8eda10..96861c702c06 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3051,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3810,6 +3820,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3841,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3897,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; -- cgit From 23586b66d84ba3184b8820277f3fc42761640f87 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 18 Sep 2017 18:18:45 -0500 Subject: Fix SMB3.1.1 guest authentication to Samba Samba rejects SMB3.1.1 dialect (vers=3.1.1) negotiate requests from the kernel client due to the two byte pad at the end of the negotiate contexts. CC: Stable Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5c16591a128e..b0eaebe627e9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -439,7 +439,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req) build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ } #else -- cgit From 04fc52fb222d35e1f7a0d5d85b19a676ea1e10e8 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Tue, 5 Sep 2017 14:23:02 +0200 Subject: drm/exynos/hdmi: Fix unsafe list iteration Function hdmi_mode_fixup() used bare list_for_each entry, which was unsafe and caused memory corruption detected by kasan. It now uses drm_for_each_connector_iter macro, which is now recommended by the documentation and safe. Signed-off-by: Maciej Purski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_hdmi.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_hdmi.c b/drivers/gpu/drm/exynos/exynos_hdmi.c index 214fa5e51963..0109ff40b1db 100644 --- a/drivers/gpu/drm/exynos/exynos_hdmi.c +++ b/drivers/gpu/drm/exynos/exynos_hdmi.c @@ -944,22 +944,27 @@ static bool hdmi_mode_fixup(struct drm_encoder *encoder, struct drm_device *dev = encoder->dev; struct drm_connector *connector; struct drm_display_mode *m; + struct drm_connector_list_iter conn_iter; int mode_ok; drm_mode_set_crtcinfo(adjusted_mode, 0); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &conn_iter); + drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder == encoder) break; } + if (connector) + drm_connector_get(connector); + drm_connector_list_iter_end(&conn_iter); - if (connector->encoder != encoder) + if (!connector) return true; mode_ok = hdmi_mode_valid(connector, adjusted_mode); if (mode_ok == MODE_OK) - return true; + goto cleanup; /* * Find the most suitable mode and copy it to adjusted_mode. @@ -979,6 +984,9 @@ static bool hdmi_mode_fixup(struct drm_encoder *encoder, } } +cleanup: + drm_connector_put(connector); + return true; } -- cgit From 8632ec8cdcaead67c019f1ba36760d1bdd0c5d23 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 23 Aug 2017 15:37:53 +1000 Subject: powerpc/configs: Update for CONFIG_SND changes Commit eb3b705aaed9 ("ALSA: Make CONFIG_SND_OSSEMUL user-selectable") means we need to set CONFIG_SND_OSSEMUL in our configs, otherwise we lose some of the SND symbols. And commit 0181307abc1d ("ALSA: seq: Reorganize kconfig and build") reorganised things, which causes the churn. Signed-off-by: Michael Ellerman --- arch/powerpc/configs/g5_defconfig | 5 +++-- arch/powerpc/configs/gamecube_defconfig | 5 +++-- arch/powerpc/configs/pasemi_defconfig | 3 ++- arch/powerpc/configs/pmac32_defconfig | 7 ++++--- arch/powerpc/configs/ppc64_defconfig | 7 ++++--- arch/powerpc/configs/ppc64e_defconfig | 7 ++++--- arch/powerpc/configs/ppc6xx_defconfig | 7 ++++--- arch/powerpc/configs/wii_defconfig | 5 +++-- 8 files changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index e084fa548d73..063817fee61c 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -138,10 +138,11 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_POWERMAC=m CONFIG_SND_AOA=m CONFIG_SND_AOA_FABRIC_LAYOUT=m diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig index 79bbc8238b32..805b0f87653c 100644 --- a/arch/powerpc/configs/gamecube_defconfig +++ b/arch/powerpc/configs/gamecube_defconfig @@ -64,11 +64,12 @@ CONFIG_LOGO=y # CONFIG_LOGO_LINUX_CLUT224 is not set CONFIG_SOUND=y CONFIG_SND=y -CONFIG_SND_SEQUENCER=y +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=y CONFIG_SND_PCM_OSS=y -CONFIG_SND_SEQUENCER_OSS=y # CONFIG_SND_VERBOSE_PROCFS is not set +CONFIG_SND_SEQUENCER=y +CONFIG_SND_SEQUENCER_OSS=y # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_GENERIC=y diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index 8cf4a46bef86..6daa56f8895c 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -115,9 +115,10 @@ CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y CONFIG_SOUND=y CONFIG_SND=y -CONFIG_SND_SEQUENCER=y +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=y CONFIG_SND_PCM_OSS=y +CONFIG_SND_SEQUENCER=y CONFIG_SND_SEQUENCER_OSS=y CONFIG_SND_USB_AUDIO=y CONFIG_SND_USB_USX2Y=y diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 8e798b1fbc99..1aab9a62a681 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -227,11 +227,12 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_DUMMY=m CONFIG_SND_POWERMAC=m CONFIG_SND_AOA=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 791db775a09c..6ddca80c52c3 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -222,11 +222,12 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_POWERMAC=m CONFIG_SND_AOA=m CONFIG_SND_AOA_FABRIC_LAYOUT=m diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig index d0fe0f8f77c2..41d85cb3c9a2 100644 --- a/arch/powerpc/configs/ppc64e_defconfig +++ b/arch/powerpc/configs/ppc64e_defconfig @@ -141,11 +141,12 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_HID_DRAGONRISE=y CONFIG_HID_GYRATION=y CONFIG_HID_TWINHAN=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ae6eba482d75..da0e8d535eb8 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -789,17 +789,18 @@ CONFIG_LOGO=y # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y CONFIG_SND_DYNAMIC_MINORS=y # CONFIG_SND_SUPPORT_OLD_API is not set CONFIG_SND_VERBOSE_PRINTK=y CONFIG_SND_DEBUG=y CONFIG_SND_DEBUG_VERBOSE=y CONFIG_SND_PCM_XRUN_DEBUG=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_DUMMY=m CONFIG_SND_VIRMIDI=m CONFIG_SND_MTPAV=m diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index aef41b17a8bc..9c7400a19e9d 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -79,11 +79,12 @@ CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_SOUND=y CONFIG_SND=y -CONFIG_SND_SEQUENCER=y +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=y CONFIG_SND_PCM_OSS=y -CONFIG_SND_SEQUENCER_OSS=y # CONFIG_SND_VERBOSE_PROCFS is not set +CONFIG_SND_SEQUENCER=y +CONFIG_SND_SEQUENCER_OSS=y CONFIG_HID_APPLE=m CONFIG_HID_WACOM=m CONFIG_MMC=y -- cgit From 4917fcb58cc73f6b81455e3c5f960144809ddf1a Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 19 Sep 2017 11:47:06 +0530 Subject: powerpc/sysrq: Fix oops whem ppmu is not registered Kernel crashes if power pmu is not registered and user tries to dump regs with 'echo p > /proc/sysrq-trigger'. Sample log: Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc0000000000d52f0 NIP [c0000000000d52f0] perf_event_print_debug+0x10/0x230 LR [c00000000058a938] sysrq_handle_showregs+0x38/0x50 Call Trace: printk+0x38/0x4c (unreliable) __handle_sysrq+0xe4/0x270 write_sysrq_trigger+0x64/0x80 proc_reg_write+0x80/0xd0 __vfs_write+0x40/0x200 vfs_write+0xc8/0x240 SyS_write+0x60/0x110 system_call+0x58/0x6c Fixes: 5f6d0380c640 ("powerpc/perf: Define perf_event_print_debug() to print PMU register values") Signed-off-by: Ravi Bangoria Reviewed-by: Kamalesh Babulal Signed-off-by: Michael Ellerman --- arch/powerpc/perf/core-book3s.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 2e3eb7431571..9e3da168d54c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -793,6 +793,11 @@ void perf_event_print_debug(void) u32 pmcs[MAX_HWEVENTS]; int i; + if (!ppmu) { + pr_info("Performance monitor hardware not registered.\n"); + return; + } + if (!ppmu->n_counter) return; -- cgit From c1fa0768a8713b135848f78fd43ffc208d8ded70 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 13 Sep 2017 22:13:48 -0400 Subject: powerpc/tm: Flush TM only if CPU has TM feature Commit cd63f3c ("powerpc/tm: Fix saving of TM SPRs in core dump") added code to access TM SPRs in flush_tmregs_to_thread(). However flush_tmregs_to_thread() does not check if TM feature is available on CPU before trying to access TM SPRs in order to copy live state to thread structures. flush_tmregs_to_thread() is indeed guarded by CONFIG_PPC_TRANSACTIONAL_MEM but it might be the case that kernel was compiled with CONFIG_PPC_TRANSACTIONAL_MEM enabled and ran on a CPU without TM feature available, thus rendering the execution of TM instructions that are treated by the CPU as illegal instructions. The fix is just to add proper checking in flush_tmregs_to_thread() if CPU has the TM feature before accessing any TM-specific resource, returning immediately if TM is no available on the CPU. Adding that checking in flush_tmregs_to_thread() instead of in places where it is called, like in vsr_get() and vsr_set(), is better because avoids the same problem cropping up elsewhere. Cc: stable@vger.kernel.org # v4.13+ Fixes: cd63f3c ("powerpc/tm: Fix saving of TM SPRs in core dump") Signed-off-by: Gustavo Romero Reviewed-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 07cd22e35405..f52ad5bb7109 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -131,7 +131,7 @@ static void flush_tmregs_to_thread(struct task_struct *tsk) * in the appropriate thread structures from live. */ - if (tsk != current) + if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) return; if (MSR_TM_SUSPENDED(mfmsr())) { -- cgit From ad47ff3e33503e0969db2d4f9a40942aa6414598 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 19 Sep 2017 20:45:52 +1000 Subject: powerpc/sstep: Fix issues with set_cr0() set_cr0() broke when we changed analyse_instr() to not modify the register state. Instead of looking at regs->gpr[x] which has not been updated yet, we need to look at op->val. Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index fb9f58b868e7..9d72e5900320 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -944,9 +944,9 @@ NOKPROBE_SYMBOL(emulate_dcbz); : "r" (addr), "i" (-EFAULT), "0" (err)) static nokprobe_inline void set_cr0(const struct pt_regs *regs, - struct instruction_op *op, int rd) + struct instruction_op *op) { - long val = regs->gpr[rd]; + long val = op->val; op->type |= SETCC; op->ccval = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); @@ -1326,7 +1326,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 13: /* addic. */ imm = (short) instr; add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0); - set_cr0(regs, op, rd); + set_cr0(regs, op); return 1; case 14: /* addi */ @@ -1397,13 +1397,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 28: /* andi. */ op->val = regs->gpr[rd] & (unsigned short) instr; - set_cr0(regs, op, ra); + set_cr0(regs, op); goto logical_done_nocc; case 29: /* andis. */ imm = (unsigned short) instr; op->val = regs->gpr[rd] & (imm << 16); - set_cr0(regs, op, ra); + set_cr0(regs, op); goto logical_done_nocc; #ifdef __powerpc64__ @@ -2526,7 +2526,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, logical_done: if (instr & 1) - set_cr0(regs, op, ra); + set_cr0(regs, op); logical_done_nocc: op->reg = ra; op->type |= SETREG; @@ -2534,7 +2534,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, arith_done: if (instr & 1) - set_cr0(regs, op, rd); + set_cr0(regs, op); compute_done: op->reg = rd; op->type |= SETREG; -- cgit From 5bcaa4cc41923871777c3d13906267e812775094 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 19 Sep 2017 20:45:53 +1000 Subject: powerpc/sstep: Fix issues with mcrf mcrf broke when we changed analyse_instr() to not modify the register state. The instruction writes to the CR, so we need to store the result in op->ccval, not op->val. Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9d72e5900320..c4cda1afb49d 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1513,10 +1513,10 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->type = COMPUTE + SETCC; imm = 0xf0000000UL; val = regs->gpr[rd]; - op->val = regs->ccr; + op->ccval = regs->ccr; for (sh = 0; sh < 8; ++sh) { if (instr & (0x80000 >> sh)) - op->val = (op->val & ~imm) | + op->ccval = (op->ccval & ~imm) | (val & imm); imm >>= 4; } -- cgit From 1575fe06f6b1d156ed31fb22c8631d49d79751d8 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 20 Sep 2017 09:32:19 +1000 Subject: powerpc/sstep: mullw should calculate a 64 bit signed result mullw should do a 32 bit signed multiply and create a 64 bit signed result. It currently truncates the result to 32 bits. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index c4cda1afb49d..5e8418c28bd8 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1651,8 +1651,9 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, goto arith_done; case 235: /* mullw */ - op->val = (unsigned int) regs->gpr[ra] * - (unsigned int) regs->gpr[rb]; + op->val = (long)(int) regs->gpr[ra] * + (int) regs->gpr[rb]; + goto arith_done; case 266: /* add */ -- cgit From 5d298baa41883fc421acfd932799c0f4177249ae Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 31 Aug 2017 17:17:41 +0530 Subject: powerpc/powernv: Clear LPCR[PECE1] via stop-api only for deep state offline Commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via stop-api only on Hotplug") clears the PECE1 bit of the LPCR via stop-api during CPU-Hotplug to prevent wakeup due to a decrementer on an offlined CPU which is in a deep stop state. In the case where the stop-api support is found to be lacking, the commit 785a12afdb4a ("powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT states when stop-api fails") disables deep states that lose hypervisor context. Thus in this case, the offlined CPU will be put to some shallow idle state. However, we currently unconditionally clear the PECE1 in LPCR via stop-api during CPU-Hotplug even when deep states are disabled due to stop-api failure. Fix this by clearing PECE1 of LPCR via stop-api during CPU-Hotplug *only* when the offlined CPU will be put to a deep state that loses hypervisor context. Fixes: 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via stop-api only on Hotplug") Reported-by: Pavithra Prakash Signed-off-by: Gautham R. Shenoy Reviewed-by: Nicholas Piggin Tested-by: Pavithra Prakash Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 9f59041a172b..443d5ca71995 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -393,7 +393,13 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) u64 pir = get_hard_smp_processor_id(cpu); mtspr(SPRN_LPCR, lpcr_val); - opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + + /* + * Program the LPCR via stop-api only if the deepest stop state + * can lose hypervisor context. + */ + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) + opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); } /* -- cgit From 590d08d3da45e9fed423b08ab38d71886c07abc8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Sep 2017 11:43:47 -0500 Subject: SMB3: Fix endian warning Multi-dialect negotiate patch had a minor endian error. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable # 4.13+ --- fs/cifs/smb2pdu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b0eaebe627e9..b4c58a1db1ae 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -570,10 +570,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* ops set to 3.0 by default for default so update */ ses->server->ops = &smb21_operations; } - } else if (rsp->DialectRevision != ses->server->vals->protocol_id) { + } else if (le16_to_cpu(rsp->DialectRevision) != + ses->server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ cifs_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", - cpu_to_le16(rsp->DialectRevision)); + le16_to_cpu(rsp->DialectRevision)); return -EIO; } -- cgit From c721c38957fb19982416f6be71aae7b30630d83b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Sep 2017 18:40:03 -0500 Subject: SMB3: Warn user if trying to sign connection that authenticated as guest It can be confusing if user ends up authenticated as guest but they requested signing (server will return error validating signed packets) so add log message for this. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable --- fs/cifs/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b4c58a1db1ae..d499ce265c3b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1176,6 +1176,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, while (sess_data->func) sess_data->func(sess_data); + if ((ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) && (ses->sign)) + cifs_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: kfree(sess_data); -- cgit From 6e82e929d98552ed5156919e0f532eeba2da704b Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 20 Sep 2017 13:09:23 +1000 Subject: cifs: show 'soft' in the mount options for hard mounts Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 180b3356ff86..a864e6575309 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -461,6 +461,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nocase"); if (tcon->retry) seq_puts(s, ",hard"); + else + seq_puts(s, ",soft"); if (tcon->use_persistent) seq_puts(s, ",persistenthandles"); else if (tcon->use_resilient) -- cgit From fd0b19ed5389187829b854900511c9195875bb42 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 19 Sep 2017 22:07:18 -0700 Subject: MIPS: Fix perf event init Commit c311c797998c ("cpumask: make "nr_cpumask_bits" unsigned") modified mipspmu_event_init() to cast the struct perf_event cpu field to an unsigned integer before it is compared with nr_cpumask_bits (and *ahem* did so without copying the linux-mips mailing list or any MIPS developers...). This is broken because the cpu field may be -1 for events which follow a process rather than being affine to a particular CPU. When this is the case the cast to an unsigned int results in a value equal to ULONG_MAX, which is always greater than nr_cpumask_bits so we always fail mipspmu_event_init() and return -ENODEV. The check against nr_cpumask_bits seems nonsensical anyway, so this patch simply removes it. The cpu field is going to either be -1 or a valid CPU number. Comparing it with nr_cpumask_bits is effectively checking that it's a valid cpu number, but it seems safe to rely on the core perf events code to ensure that's the case. The end result is that this fixes use of perf on MIPS when not constraining events to a particular CPU, and fixes the "perf list hw" command which fails to list any events without this. Signed-off-by: Paul Burton Fixes: c311c797998c ("cpumask: make "nr_cpumask_bits" unsigned") Cc: Alexey Dobriyan Cc: Andrew Morton Cc: linux-mips@linux-mips.org Cc: stable # v4.12+ Patchwork: https://patchwork.linux-mips.org/patch/17323/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/perf_event_mipsxx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 9e6c74bf66c4..6668f67a61c3 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,8 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if ((unsigned int)event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) + if (event->cpu >= 0 && !cpu_online(event->cpu)) return -ENODEV; if (!atomic_inc_not_zero(&active_events)) { -- cgit From bd6227a150fdb56e7bb734976ef6e53a2c1cb334 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 14 Sep 2017 17:10:28 +0200 Subject: crypto: drbg - fix freeing of resources During the change to use aligned buffers, the deallocation code path was not updated correctly. The current code tries to free the aligned buffer pointer and not the original buffer pointer as it is supposed to. Thus, the code is updated to free the original buffer pointer and set the aligned buffer pointer that is used throughout the code to NULL. Fixes: 3cfc3b9721123 ("crypto: drbg - use aligned buffers") CC: CC: Herbert Xu Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/drbg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/drbg.c b/crypto/drbg.c index 633a88e93ab0..70018397e59a 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1133,10 +1133,10 @@ static inline void drbg_dealloc_state(struct drbg_state *drbg) { if (!drbg) return; - kzfree(drbg->V); - drbg->Vbuf = NULL; - kzfree(drbg->C); - drbg->Cbuf = NULL; + kzfree(drbg->Vbuf); + drbg->V = NULL; + kzfree(drbg->Cbuf); + drbg->C = NULL; kzfree(drbg->scratchpadbuf); drbg->scratchpadbuf = NULL; drbg->reseed_ctr = 0; -- cgit From 569f11c9f788959b640116b5bbd6d8a1f07326da Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:00 -0500 Subject: crypto: x86/blowfish - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R12 instead of RBP. R12 can't be used as the RT0 register because of x86 instruction encoding limitations. So use R12 for CTX and RDI for CTX. This means that CTX is no longer an implicit function argument. Instead it needs to be explicitly copied from RDI. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish-x86_64-asm_64.S | 48 +++++++++++++++++--------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 246c67006ed0..8c1fcb6bad21 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -33,7 +33,7 @@ #define s3 ((16 + 2 + (3 * 256)) * 4) /* register macros */ -#define CTX %rdi +#define CTX %r12 #define RIO %rsi #define RX0 %rax @@ -56,12 +56,12 @@ #define RX2bh %ch #define RX3bh %dh -#define RT0 %rbp +#define RT0 %rdi #define RT1 %rsi #define RT2 %r8 #define RT3 %r9 -#define RT0d %ebp +#define RT0d %edi #define RT1d %esi #define RT2d %r8d #define RT3d %r9d @@ -120,13 +120,14 @@ ENTRY(__blowfish_enc_blk) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: bool, if true: xor output */ - movq %rbp, %r11; + movq %r12, %r11; + movq %rdi, CTX; movq %rsi, %r10; movq %rdx, RIO; @@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk) round_enc(14); add_roundkey_enc(16); - movq %r11, %rbp; + movq %r11, %r12; movq %r10, RIO; test %cl, %cl; @@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk) ENTRY(blowfish_dec_blk) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ - movq %rbp, %r11; + movq %r12, %r11; + movq %rdi, CTX; movq %rsi, %r10; movq %rdx, RIO; @@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk) movq %r10, RIO; write_block(); - movq %r11, %rbp; + movq %r11, %r12; ret; ENDPROC(blowfish_dec_blk) @@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk) ENTRY(__blowfish_enc_blk_4way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: bool, if true: xor output */ - pushq %rbp; + pushq %r12; pushq %rbx; pushq %rcx; - preload_roundkey_enc(0); - + movq %rdi, CTX movq %rsi, %r11; movq %rdx, RIO; + preload_roundkey_enc(0); + read_block4(); round_enc4(0); @@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way) round_enc4(14); add_preloaded_roundkey4(); - popq %rbp; + popq %r12; movq %r11, RIO; - test %bpl, %bpl; + test %r12b, %r12b; jnz .L__enc_xor4; write_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; .L__enc_xor4: xor_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; ENDPROC(__blowfish_enc_blk_4way) ENTRY(blowfish_dec_blk_4way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ - pushq %rbp; + pushq %r12; pushq %rbx; - preload_roundkey_dec(17); - movq %rsi, %r11; + movq %rdi, CTX; + movq %rsi, %r11 movq %rdx, RIO; + preload_roundkey_dec(17); read_block4(); round_dec4(17); @@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way) write_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; ENDPROC(blowfish_dec_blk_4way) -- cgit From b46c9d717645529417ca9045cfdbf59f84922573 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:01 -0500 Subject: crypto: x86/camellia - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R12 instead of RBP. Both are callee-saved registers, so the substitution is straightforward. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia-x86_64-asm_64.S | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 310319c601ed..95ba6956a7f6 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -75,17 +75,17 @@ #define RCD1bh %dh #define RT0 %rsi -#define RT1 %rbp +#define RT1 %r12 #define RT2 %r8 #define RT0d %esi -#define RT1d %ebp +#define RT1d %r12d #define RT2d %r8d #define RT2bl %r8b #define RXOR %r9 -#define RRBP %r10 +#define RR12 %r10 #define RDST %r11 #define RXORd %r9d @@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk) * %rdx: src * %rcx: bool xor */ - movq %rbp, RRBP; + movq %r12, RR12; movq %rcx, RXOR; movq %rsi, RDST; @@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk) enc_outunpack(mov, RT1); - movq RRBP, %rbp; + movq RR12, %r12; ret; .L__enc_xor: enc_outunpack(xor, RT1); - movq RRBP, %rbp; + movq RR12, %r12; ret; ENDPROC(__camellia_enc_blk) @@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk) movl $24, RXORd; cmovel RXORd, RT2d; /* max */ - movq %rbp, RRBP; + movq %r12, RR12; movq %rsi, RDST; movq %rdx, RIO; @@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk) dec_outunpack(); - movq RRBP, %rbp; + movq RR12, %r12; ret; ENDPROC(camellia_dec_blk) @@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way) */ pushq %rbx; - movq %rbp, RRBP; + movq %r12, RR12; movq %rcx, RXOR; movq %rsi, RDST; movq %rdx, RIO; @@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way) enc_outunpack2(mov, RT2); - movq RRBP, %rbp; + movq RR12, %r12; popq %rbx; ret; .L__enc2_xor: enc_outunpack2(xor, RT2); - movq RRBP, %rbp; + movq RR12, %r12; popq %rbx; ret; ENDPROC(__camellia_enc_blk_2way) @@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way) cmovel RXORd, RT2d; /* max */ movq %rbx, RXOR; - movq %rbp, RRBP; + movq %r12, RR12; movq %rsi, RDST; movq %rdx, RIO; @@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way) dec_outunpack2(); - movq RRBP, %rbp; + movq RR12, %r12; movq RXOR, %rbx; ret; ENDPROC(camellia_dec_blk_2way) -- cgit From 4b15606664a2f8d7c4f0092fb0305fe1c7c65b7b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:02 -0500 Subject: crypto: x86/cast5 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R15 instead of RBP. R15 can't be used as the RID1 register because of x86 instruction encoding limitations. So use R15 for CTX and RDI for CTX. This means that CTX is no longer an implicit function argument. Instead it needs to be explicitly copied from RDI. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 47 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4a8806234ea..86107c961bb4 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@ /********************************************************************** 16-way AVX cast5 **********************************************************************/ -#define CTX %rdi +#define CTX %r15 #define RL1 %xmm0 #define RR1 %xmm1 @@ -70,8 +70,8 @@ #define RTMP %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %rdi +#define RID1d %edi #define RID2 %rsi #define RID2d %esi @@ -226,7 +226,7 @@ .align 16 __cast5_enc_blk16: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RL1: blocks 1 and 2 * RR1: blocks 3 and 4 * RL2: blocks 5 and 6 @@ -246,9 +246,11 @@ __cast5_enc_blk16: * RR4: encrypted blocks 15 and 16 */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -283,7 +285,7 @@ __cast5_enc_blk16: .L__skip_enc: popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; @@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16) .align 16 __cast5_dec_blk16: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RL1: encrypted blocks 1 and 2 * RR1: encrypted blocks 3 and 4 * RL2: encrypted blocks 5 and 6 @@ -318,9 +320,11 @@ __cast5_dec_blk16: * RR4: decrypted blocks 15 and 16 */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -356,7 +360,7 @@ __cast5_dec_blk16: vmovdqa .Lbswap_mask, RKM; popq %rbx; - popq %rbp; + popq %r15; outunpack_blocks(RR1, RL1, RTMP, RX, RKM); outunpack_blocks(RR2, RL2, RTMP, RX, RKM); @@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16) ENTRY(cast5_ecb_enc_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + popq %r15; FRAME_END ret; ENDPROC(cast5_ecb_enc_16way) ENTRY(cast5_ecb_dec_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + + movq %rdi, CTX; movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + popq %r15; FRAME_END ret; ENDPROC(cast5_ecb_dec_16way) ENTRY(cast5_cbc_dec_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way) vmovdqu RR4, (6*16)(%r11); vmovdqu RL4, (7*16)(%r11); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast5_cbc_dec_16way) ENTRY(cast5_ctr_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: iv (big endian, 64bit) */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way) vmovdqu RR4, (6*16)(%r11); vmovdqu RL4, (7*16)(%r11); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast5_ctr_16way) -- cgit From c66cc3be2951fad4d7d7f799baf57c8c5cc8d655 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:03 -0500 Subject: crypto: x86/cast6 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R15 instead of RBP. R15 can't be used as the RID1 register because of x86 instruction encoding limitations. So use R15 for CTX and RDI for CTX. This means that CTX is no longer an implicit function argument. Instead it needs to be explicitly copied from RDI. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 50 +++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 952d3156a933..7f30b6f0d72c 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@ /********************************************************************** 8-way AVX cast6 **********************************************************************/ -#define CTX %rdi +#define CTX %r15 #define RA1 %xmm0 #define RB1 %xmm1 @@ -70,8 +70,8 @@ #define RTMP %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %rdi +#define RID1d %edi #define RID2 %rsi #define RID2d %esi @@ -264,15 +264,17 @@ .align 8 __cast6_enc_blk8: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks * output: * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -297,7 +299,7 @@ __cast6_enc_blk8: QBAR(11); popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; @@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8) .align 8 __cast6_dec_blk8: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks * output: * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -343,7 +347,7 @@ __cast6_dec_blk8: QBAR(0); popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); @@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8) ENTRY(cast6_ecb_enc_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_ecb_enc_8way) ENTRY(cast6_ecb_dec_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_ecb_dec_8way) ENTRY(cast6_cbc_dec_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way) store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast6_cbc_dec_8way) @@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way) * %rcx: iv (little endian, 128bit) */ FRAME_BEGIN - pushq %r12; + pushq %r15 + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way) store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast6_ctr_8way) @@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX movq %rsi, %r11; /* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_xts_enc_8way) @@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX movq %rsi, %r11; /* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_xts_dec_8way) -- cgit From 3ed7b4d67c6745300c9b5c6baa55da1161b57f60 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:04 -0500 Subject: crypto: x86/des3_ede - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use RSI instead of RBP for RT1. Since RSI is also used as a the 'dst' function argument, it needs to be saved on the stack until the argument is needed. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/des3_ede-asm_64.S | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index f3e91647ca27..8e49ce117494 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -64,12 +64,12 @@ #define RW2bh %ch #define RT0 %r15 -#define RT1 %rbp +#define RT1 %rsi #define RT2 %r14 #define RT3 %rdx #define RT0d %r15d -#define RT1d %ebp +#define RT1d %esi #define RT2d %r14d #define RT3d %edx @@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk) * %rsi: dst * %rdx: src */ - pushq %rbp; pushq %rbx; pushq %r12; pushq %r13; pushq %r14; pushq %r15; + pushq %rsi; /* dst */ + read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); @@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk) round1(32+15, RL0, RR0, dummy2); final_permutation(RR0, RL0); + + popq %rsi /* dst */ write_block(%rsi, RR0, RL0); popq %r15; @@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk) popq %r13; popq %r12; popq %rbx; - popq %rbp; ret; ENDPROC(des3_ede_x86_64_crypt_blk) @@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) * %rdx: src (3 blocks) */ - pushq %rbp; pushq %rbx; pushq %r12; pushq %r13; pushq %r14; pushq %r15; + pushq %rsi /* dst */ + /* load input */ movl 0 * 4(%rdx), RL0d; movl 1 * 4(%rdx), RR0d; @@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) bswapl RR2d; bswapl RL2d; + popq %rsi /* dst */ movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); @@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) popq %r13; popq %r12; popq %rbx; - popq %rbp; ret; ENDPROC(des3_ede_x86_64_crypt_blk_3way) -- cgit From d7b1722c72aa915283ada27709c6feeb392f6038 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:05 -0500 Subject: crypto: x86/sha1-avx2 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R11 instead of RBP. Since R11 isn't a callee-saved register, it doesn't need to be saved and restored on the stack. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_avx2_x86_64_asm.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1eab79c9ac48..9f712a7dfd79 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -89,7 +89,7 @@ #define REG_RE %rdx #define REG_RTA %r12 #define REG_RTB %rbx -#define REG_T1 %ebp +#define REG_T1 %r11d #define xmm_mov vmovups #define avx2_zeroupper vzeroupper #define RND_F1 1 @@ -637,7 +637,6 @@ _loop3: ENTRY(\name) push %rbx - push %rbp push %r12 push %r13 push %r14 @@ -673,7 +672,6 @@ _loop3: pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx ret -- cgit From 6488bce756861b94810e54f83416d5e74c0f18bf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:06 -0500 Subject: crypto: x86/sha1-ssse3 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Swap the usages of R12 and RBP. Use R12 for the REG_D register, and use RBP to store the pre-aligned stack pointer. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ssse3_asm.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index a4109506a5e8..6204bd53528c 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -37,7 +37,7 @@ #define REG_A %ecx #define REG_B %esi #define REG_C %edi -#define REG_D %ebp +#define REG_D %r12d #define REG_E %edx #define REG_T1 %eax @@ -74,10 +74,10 @@ ENTRY(\name) push %rbx - push %rbp push %r12 + push %rbp + mov %rsp, %rbp - mov %rsp, %r12 sub $64, %rsp # allocate workspace and $~15, %rsp # align stack @@ -99,10 +99,9 @@ xor %rax, %rax rep stosq - mov %r12, %rsp # deallocate workspace - - pop %r12 + mov %rbp, %rsp # deallocate workspace pop %rbp + pop %r12 pop %rbx ret -- cgit From 673ac6fbc74f835e2125df9ee39e8a2a423832e2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:07 -0500 Subject: crypto: x86/sha256-avx - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Swap the usages of R12 and RBP. Use R12 for the TBL register, and use RBP to store the pre-aligned stack pointer. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx-asm.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index e08888a1a5f2..001bbcf93c79 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -103,7 +103,7 @@ SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx -TBL = %rbp +TBL = %r12 a = %eax b = %ebx @@ -350,13 +350,13 @@ a = TMP_ ENTRY(sha256_transform_avx) .align 32 pushq %rbx - pushq %rbp + pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %r12 + pushq %rbp + movq %rsp, %rbp - mov %rsp, %r12 subq $STACK_SIZE, %rsp # allocate stack space and $~15, %rsp # align stack pointer @@ -452,13 +452,12 @@ loop2: done_hash: - mov %r12, %rsp - - popq %r12 + mov %rbp, %rsp + popq %rbp popq %r15 popq %r14 popq %r13 - popq %rbp + popq %r12 popq %rbx ret ENDPROC(sha256_transform_avx) -- cgit From d3dfbfe2e6e7ecd620531d5201314ad14c4ed5b3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:08 -0500 Subject: crypto: x86/sha256-avx2 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. There's no need to use RBP as a temporary register for the TBL value, because it always stores the same value: the address of the K256 table. Instead just reference the address of K256 directly. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx2-asm.S | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 89c8f09787d2..1420db15dcdd 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -98,8 +98,6 @@ d = %r8d e = %edx # clobbers NUM_BLKS y3 = %esi # clobbers INP - -TBL = %rbp SRND = CTX # SRND is same register as CTX a = %eax @@ -531,7 +529,6 @@ STACK_SIZE = _RSP + _RSP_SIZE ENTRY(sha256_transform_rorx) .align 32 pushq %rbx - pushq %rbp pushq %r12 pushq %r13 pushq %r14 @@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx) mov CTX, _CTX(%rsp) loop0: - lea K256(%rip), TBL - ## Load first 16 dwords from two blocks VMOVDQ 0*32(INP),XTMP0 VMOVDQ 1*32(INP),XTMP1 @@ -597,19 +592,19 @@ last_block_enter: .align 16 loop1: - vpaddd 0*32(TBL, SRND), X0, XFER + vpaddd K256+0*32(SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 0*32 - vpaddd 1*32(TBL, SRND), X0, XFER + vpaddd K256+1*32(SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 1*32 - vpaddd 2*32(TBL, SRND), X0, XFER + vpaddd K256+2*32(SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 2*32 - vpaddd 3*32(TBL, SRND), X0, XFER + vpaddd K256+3*32(SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 3*32 @@ -619,10 +614,11 @@ loop1: loop2: ## Do last 16 rounds with no scheduling - vpaddd 0*32(TBL, SRND), X0, XFER + vpaddd K256+0*32(SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 0*32 - vpaddd 1*32(TBL, SRND), X1, XFER + + vpaddd K256+1*32(SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 1*32 add $2*32, SRND @@ -676,9 +672,6 @@ loop3: ja done_hash do_last_block: - #### do last block - lea K256(%rip), TBL - VMOVDQ 0*16(INP),XWORD0 VMOVDQ 1*16(INP),XWORD1 VMOVDQ 2*16(INP),XWORD2 @@ -718,7 +711,6 @@ done_hash: popq %r14 popq %r13 popq %r12 - popq %rbp popq %rbx ret ENDPROC(sha256_transform_rorx) -- cgit From 539012dcbdd1ff028268764385ed1f6d600812a7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:09 -0500 Subject: crypto: x86/sha256-ssse3 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Swap the usages of R12 and RBP. Use R12 for the TBL register, and use RBP to store the pre-aligned stack pointer. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-ssse3-asm.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 39b83c93e7fd..c6c05ed2c16a 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -95,7 +95,7 @@ SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx -TBL = %rbp +TBL = %r12 a = %eax b = %ebx @@ -356,13 +356,13 @@ a = TMP_ ENTRY(sha256_transform_ssse3) .align 32 pushq %rbx - pushq %rbp + pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %r12 + pushq %rbp + mov %rsp, %rbp - mov %rsp, %r12 subq $STACK_SIZE, %rsp and $~15, %rsp @@ -462,13 +462,12 @@ loop2: done_hash: - mov %r12, %rsp - - popq %r12 + mov %rbp, %rsp + popq %rbp popq %r15 popq %r14 popq %r13 - popq %rbp + popq %r12 popq %rbx ret -- cgit From ca04c823763e5b82c237cabe0c17f547ecdc6271 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:10 -0500 Subject: crypto: sha512-avx2 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Mix things up a little bit to get rid of the RBP usage, without hurting performance too much. Use RDI instead of RBP for the TBL pointer. That will clobber CTX, so spill CTX onto the stack and use R12 to read it in the outer loop. R12 is used as a non-persistent temporary variable elsewhere, so it's safe to use. Also remove the unused y4 variable. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-avx2-asm.S | 75 ++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 7f5f6c6ec72e..b16d56005162 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -69,8 +69,9 @@ XFER = YTMP0 BYTE_FLIP_MASK = %ymm9 -# 1st arg -CTX = %rdi +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 # 2nd arg INP = %rsi # 3rd arg @@ -81,7 +82,7 @@ d = %r8 e = %rdx y3 = %rsi -TBL = %rbp +TBL = %rdi # clobbers CTX1 a = %rax b = %rbx @@ -91,26 +92,26 @@ g = %r10 h = %r11 old_h = %r11 -T1 = %r12 +T1 = %r12 # clobbers CTX2 y0 = %r13 y1 = %r14 y2 = %r15 -y4 = %r12 - # Local variables (stack frame) XFER_SIZE = 4*8 SRND_SIZE = 1*8 INP_SIZE = 1*8 INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 6*8 +GPRSAVE_SIZE = 5*8 frame_XFER = 0 frame_SRND = frame_XFER + XFER_SIZE frame_INP = frame_SRND + SRND_SIZE frame_INPEND = frame_INP + INP_SIZE -frame_RSPSAVE = frame_INPEND + INPEND_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_RSPSAVE = frame_CTX + CTX_SIZE frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE frame_size = frame_GPRSAVE + GPRSAVE_SIZE @@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx) mov %rax, frame_RSPSAVE(%rsp) # Save GPRs - mov %rbp, frame_GPRSAVE(%rsp) - mov %rbx, 8*1+frame_GPRSAVE(%rsp) - mov %r12, 8*2+frame_GPRSAVE(%rsp) - mov %r13, 8*3+frame_GPRSAVE(%rsp) - mov %r14, 8*4+frame_GPRSAVE(%rsp) - mov %r15, 8*5+frame_GPRSAVE(%rsp) + mov %rbx, 8*0+frame_GPRSAVE(%rsp) + mov %r12, 8*1+frame_GPRSAVE(%rsp) + mov %r13, 8*2+frame_GPRSAVE(%rsp) + mov %r14, 8*3+frame_GPRSAVE(%rsp) + mov %r15, 8*4+frame_GPRSAVE(%rsp) shl $7, NUM_BLKS # convert to bytes jz done_hash @@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx) mov NUM_BLKS, frame_INPEND(%rsp) ## load initial digest - mov 8*0(CTX),a - mov 8*1(CTX),b - mov 8*2(CTX),c - mov 8*3(CTX),d - mov 8*4(CTX),e - mov 8*5(CTX),f - mov 8*6(CTX),g - mov 8*7(CTX),h + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK @@ -652,14 +655,15 @@ loop2: subq $1, frame_SRND(%rsp) jne loop2 - addm 8*0(CTX),a - addm 8*1(CTX),b - addm 8*2(CTX),c - addm 8*3(CTX),d - addm 8*4(CTX),e - addm 8*5(CTX),f - addm 8*6(CTX),g - addm 8*7(CTX),h + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h mov frame_INP(%rsp), INP add $128, INP @@ -669,12 +673,11 @@ loop2: done_hash: # Restore GPRs - mov frame_GPRSAVE(%rsp) ,%rbp - mov 8*1+frame_GPRSAVE(%rsp) ,%rbx - mov 8*2+frame_GPRSAVE(%rsp) ,%r12 - mov 8*3+frame_GPRSAVE(%rsp) ,%r13 - mov 8*4+frame_GPRSAVE(%rsp) ,%r14 - mov 8*5+frame_GPRSAVE(%rsp) ,%r15 + mov 8*0+frame_GPRSAVE(%rsp), %rbx + mov 8*1+frame_GPRSAVE(%rsp), %r12 + mov 8*2+frame_GPRSAVE(%rsp), %r13 + mov 8*3+frame_GPRSAVE(%rsp), %r14 + mov 8*4+frame_GPRSAVE(%rsp), %r15 # Restore Stack Pointer mov frame_RSPSAVE(%rsp), %rsp -- cgit From 8f182f845d0fa26ecc18d3bdcc3d1077a0ea3a31 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:11 -0500 Subject: crypto: x86/twofish - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R13 instead of RBP. Both are callee-saved registers, so the substitution is straightforward. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index b3f49d286348..73b471da3622 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -76,8 +76,8 @@ #define RT %xmm14 #define RR %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %r13 +#define RID1d %r13d #define RID2 %rsi #define RID2d %esi @@ -259,7 +259,7 @@ __twofish_enc_blk8: vmovdqu w(CTX), RK1; - pushq %rbp; + pushq %r13; pushq %rbx; pushq %rcx; @@ -282,7 +282,7 @@ __twofish_enc_blk8: popq %rcx; popq %rbx; - popq %rbp; + popq %r13; outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); @@ -301,7 +301,7 @@ __twofish_dec_blk8: vmovdqu (w+4*4)(CTX), RK1; - pushq %rbp; + pushq %r13; pushq %rbx; inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); @@ -322,7 +322,7 @@ __twofish_dec_blk8: vmovdqu (w)(CTX), RK1; popq %rbx; - popq %rbp; + popq %r13; outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); -- cgit From afd62fa26343be6445479e75de9f07092a061459 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:51 +0200 Subject: crypto: talitos - fix sha224 Kernel crypto tests report the following error at startup [ 2.752626] alg: hash: Test 4 failed for sha224-talitos [ 2.757907] 00000000: 30 e2 86 e2 e7 8a dd 0d d7 eb 9f d5 83 fe f1 b0 00000010: 2d 5a 6c a5 f9 55 ea fd 0e 72 05 22 This patch fixes it Cc: Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 79791c690858..6596761bc0c8 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1756,9 +1756,9 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, req_ctx->swinit = 0; } else { desc->ptr[1] = zero_entry; - /* Indicate next op is not the first. */ - req_ctx->first = 0; } + /* Indicate next op is not the first. */ + req_ctx->first = 0; /* HMAC key */ if (ctx->keylen) -- cgit From 886a27c0fc8a34633aadb0986dba11d8c150ae2e Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:57 +0200 Subject: crypto: talitos - fix hashing md5sum on some files gives wrong result Exemple: With the md5sum from libkcapi: c15115c05bad51113f81bdaee735dd09 test With the original md5sum: bbdf41d80ba7e8b2b7be3a0772be76cb test This patch fixes this issue Cc: Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 6596761bc0c8..8a48fc4f3642 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1769,7 +1769,7 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, sg_count = edesc->src_nents ?: 1; if (is_sec1 && sg_count > 1) - sg_copy_to_buffer(areq->src, sg_count, edesc->buf, length); + sg_copy_to_buffer(req_ctx->psrc, sg_count, edesc->buf, length); else sg_count = dma_map_sg(dev, req_ctx->psrc, sg_count, DMA_TO_DEVICE); -- cgit From 56136631573baa537a15e0012055ffe8cfec1a33 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 12 Sep 2017 11:03:39 +0200 Subject: crypto: talitos - Don't provide setkey for non hmac hashing algs. Today, md5sum fails with error -ENOKEY because a setkey function is set for non hmac hashing algs, see strace output below: mmap(NULL, 378880, PROT_READ, MAP_SHARED, 6, 0) = 0x77f50000 accept(3, 0, NULL) = 7 vmsplice(5, [{"bin/\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 378880}], 1, SPLICE_F_MORE|SPLICE_F_GIFT) = 262144 splice(4, NULL, 7, NULL, 262144, SPLICE_F_MORE) = -1 ENOKEY (Required key not available) write(2, "Generation of hash for file kcap"..., 50) = 50 munmap(0x77f50000, 378880) = 0 This patch ensures that setkey() function is set only for hmac hashing. Cc: Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 8a48fc4f3642..dff88838dce7 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -3057,7 +3057,8 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev, t_alg->algt.alg.hash.final = ahash_final; t_alg->algt.alg.hash.finup = ahash_finup; t_alg->algt.alg.hash.digest = ahash_digest; - t_alg->algt.alg.hash.setkey = ahash_setkey; + if (!strncmp(alg->cra_name, "hmac", 4)) + t_alg->algt.alg.hash.setkey = ahash_setkey; t_alg->algt.alg.hash.import = ahash_import; t_alg->algt.alg.hash.export = ahash_export; -- cgit From 3e1166b94e661ed51af1fe1fe5f74bd83450b50f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 12:12:16 +0200 Subject: crypto: inside-secure - fix gcc-4.9 warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All older compiler versions up to gcc-4.9 produce these harmless warnings: drivers/crypto/inside-secure/safexcel_cipher.c:389:9: warning: missing braces around initializer [-Wmissing-braces] drivers/crypto/inside-secure/safexcel_cipher.c:389:9: warning: (near initialization for ‘result.completion’) [-Wmissing-braces] drivers/crypto/inside-secure/safexcel_hash.c:422:9: warning: missing braces around initializer [-Wmissing-braces] drivers/crypto/inside-secure/safexcel_hash.c:422:9: warning: (near initialization for ‘result.completion’) [-Wmissing-braces] This changes the syntax to something that works on all versions without warnings. Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver") Signed-off-by: Arnd Bergmann Acked-by: Antoine Tenart Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_cipher.c | 2 +- drivers/crypto/inside-secure/safexcel_hash.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index d2207ac5ba19..5438552bc6d7 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -386,7 +386,7 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm) struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->priv; struct skcipher_request req; - struct safexcel_inv_result result = { 0 }; + struct safexcel_inv_result result = {}; int ring = ctx->base.ring; memset(&req, 0, sizeof(struct skcipher_request)); diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 3f819399cd95..3980f946874f 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -419,7 +419,7 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm) struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->priv; struct ahash_request req; - struct safexcel_inv_result result = { 0 }; + struct safexcel_inv_result result = {}; int ring = ctx->base.ring; memset(&req, 0, sizeof(struct ahash_request)); -- cgit From c056d910f08029662080a01b4ce2110e2c9a27b6 Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Fri, 1 Sep 2017 17:12:59 +0300 Subject: crypto: caam - fix LS1021A support on ARMv7 multiplatform kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When built using multi_v7_defconfig, driver does not work on LS1021A: [...] caam 1700000.crypto: can't identify CAAM ipg clk: -2 caam: probe of 1700000.crypto failed with error -2 [...] It turns out we have to detect at runtime whether driver is running on an i.MX platform or not. Cc: Fixes: 6c3af9559352 ("crypto: caam - add support for LS1021A") Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/Kconfig | 5 +--- drivers/crypto/caam/ctrl.c | 19 ++++++++------- drivers/crypto/caam/regs.h | 59 +++++++++++++++++++++------------------------ 3 files changed, 39 insertions(+), 44 deletions(-) diff --git a/drivers/crypto/caam/Kconfig b/drivers/crypto/caam/Kconfig index e36aeacd7635..1eb852765469 100644 --- a/drivers/crypto/caam/Kconfig +++ b/drivers/crypto/caam/Kconfig @@ -1,6 +1,7 @@ config CRYPTO_DEV_FSL_CAAM tristate "Freescale CAAM-Multicore driver backend" depends on FSL_SOC || ARCH_MXC || ARCH_LAYERSCAPE + select SOC_BUS help Enables the driver module for Freescale's Cryptographic Accelerator and Assurance Module (CAAM), also known as the SEC version 4 (SEC4). @@ -141,10 +142,6 @@ config CRYPTO_DEV_FSL_CAAM_RNG_API To compile this as a module, choose M here: the module will be called caamrng. -config CRYPTO_DEV_FSL_CAAM_IMX - def_bool SOC_IMX6 || SOC_IMX7D - depends on CRYPTO_DEV_FSL_CAAM - config CRYPTO_DEV_FSL_CAAM_DEBUG bool "Enable debug output in CAAM driver" depends on CRYPTO_DEV_FSL_CAAM diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c index dacb53fb690e..027e121c6f70 100644 --- a/drivers/crypto/caam/ctrl.c +++ b/drivers/crypto/caam/ctrl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "compat.h" #include "regs.h" @@ -19,6 +20,8 @@ bool caam_little_end; EXPORT_SYMBOL(caam_little_end); bool caam_dpaa2; EXPORT_SYMBOL(caam_dpaa2); +bool caam_imx; +EXPORT_SYMBOL(caam_imx); #ifdef CONFIG_CAAM_QI #include "qi.h" @@ -28,19 +31,11 @@ EXPORT_SYMBOL(caam_dpaa2); * i.MX targets tend to have clock control subsystems that can * enable/disable clocking to our device. */ -#ifdef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX static inline struct clk *caam_drv_identify_clk(struct device *dev, char *clk_name) { - return devm_clk_get(dev, clk_name); + return caam_imx ? devm_clk_get(dev, clk_name) : NULL; } -#else -static inline struct clk *caam_drv_identify_clk(struct device *dev, - char *clk_name) -{ - return NULL; -} -#endif /* * Descriptor to instantiate RNG State Handle 0 in normal mode and @@ -430,6 +425,10 @@ static int caam_probe(struct platform_device *pdev) { int ret, ring, gen_sk, ent_delay = RTSDCTL_ENT_DLY_MIN; u64 caam_id; + static const struct soc_device_attribute imx_soc[] = { + {.family = "Freescale i.MX"}, + {}, + }; struct device *dev; struct device_node *nprop, *np; struct caam_ctrl __iomem *ctrl; @@ -451,6 +450,8 @@ static int caam_probe(struct platform_device *pdev) dev_set_drvdata(dev, ctrlpriv); nprop = pdev->dev.of_node; + caam_imx = (bool)soc_device_match(imx_soc); + /* Enable clocking */ clk = caam_drv_identify_clk(&pdev->dev, "ipg"); if (IS_ERR(clk)) { diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h index 2b5efff9ec3c..17cfd23a38fa 100644 --- a/drivers/crypto/caam/regs.h +++ b/drivers/crypto/caam/regs.h @@ -67,6 +67,7 @@ */ extern bool caam_little_end; +extern bool caam_imx; #define caam_to_cpu(len) \ static inline u##len caam##len ## _to_cpu(u##len val) \ @@ -154,13 +155,10 @@ static inline u64 rd_reg64(void __iomem *reg) #else /* CONFIG_64BIT */ static inline void wr_reg64(void __iomem *reg, u64 data) { -#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX - if (caam_little_end) { + if (!caam_imx && caam_little_end) { wr_reg32((u32 __iomem *)(reg) + 1, data >> 32); wr_reg32((u32 __iomem *)(reg), data); - } else -#endif - { + } else { wr_reg32((u32 __iomem *)(reg), data >> 32); wr_reg32((u32 __iomem *)(reg) + 1, data); } @@ -168,41 +166,40 @@ static inline void wr_reg64(void __iomem *reg, u64 data) static inline u64 rd_reg64(void __iomem *reg) { -#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX - if (caam_little_end) + if (!caam_imx && caam_little_end) return ((u64)rd_reg32((u32 __iomem *)(reg) + 1) << 32 | (u64)rd_reg32((u32 __iomem *)(reg))); - else -#endif - return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 | - (u64)rd_reg32((u32 __iomem *)(reg) + 1)); + + return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 | + (u64)rd_reg32((u32 __iomem *)(reg) + 1)); } #endif /* CONFIG_64BIT */ +static inline u64 cpu_to_caam_dma64(dma_addr_t value) +{ + if (caam_imx) + return (((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | + (u64)cpu_to_caam32(upper_32_bits(value))); + + return cpu_to_caam64(value); +} + +static inline u64 caam_dma64_to_cpu(u64 value) +{ + if (caam_imx) + return (((u64)caam32_to_cpu(lower_32_bits(value)) << 32) | + (u64)caam32_to_cpu(upper_32_bits(value))); + + return caam64_to_cpu(value); +} + #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT -#ifdef CONFIG_SOC_IMX7D -#define cpu_to_caam_dma(value) \ - (((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | \ - (u64)cpu_to_caam32(upper_32_bits(value))) -#define caam_dma_to_cpu(value) \ - (((u64)caam32_to_cpu(lower_32_bits(value)) << 32) | \ - (u64)caam32_to_cpu(upper_32_bits(value))) -#else -#define cpu_to_caam_dma(value) cpu_to_caam64(value) -#define caam_dma_to_cpu(value) caam64_to_cpu(value) -#endif /* CONFIG_SOC_IMX7D */ +#define cpu_to_caam_dma(value) cpu_to_caam_dma64(value) +#define caam_dma_to_cpu(value) caam_dma64_to_cpu(value) #else #define cpu_to_caam_dma(value) cpu_to_caam32(value) #define caam_dma_to_cpu(value) caam32_to_cpu(value) -#endif /* CONFIG_ARCH_DMA_ADDR_T_64BIT */ - -#ifdef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX -#define cpu_to_caam_dma64(value) \ - (((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | \ - (u64)cpu_to_caam32(upper_32_bits(value))) -#else -#define cpu_to_caam_dma64(value) cpu_to_caam64(value) -#endif +#endif /* CONFIG_ARCH_DMA_ADDR_T_64BIT */ /* * jr_outentry -- cgit From e117765a117da3ece15689cb8a759d16c415b08c Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Wed, 30 Aug 2017 09:17:39 +0200 Subject: crypto: af_alg - update correct dst SGL entry When two adjacent TX SGL are processed and parts of both TX SGLs are pulled into the per-request TX SGL, the wrong per-request TX SGL entries were updated. This fixes a NULL pointer dereference when a cipher implementation walks the TX SGL where some of the SGL entries were NULL. Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory...") Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/af_alg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index ffa9f4ccd9b4..337cf382718e 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -619,14 +619,14 @@ void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst, struct af_alg_ctx *ctx = ask->private; struct af_alg_tsgl *sgl; struct scatterlist *sg; - unsigned int i, j; + unsigned int i, j = 0; while (!list_empty(&ctx->tsgl_list)) { sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl, list); sg = sgl->sg; - for (i = 0, j = 0; i < sgl->cur; i++) { + for (i = 0; i < sgl->cur; i++) { size_t plen = min_t(size_t, used, sg[i].length); struct page *page = sg_page(sg + i); -- cgit From 8afafa6fba7809c0785018b77c95b19e58b35b94 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 15 Sep 2017 15:38:21 +0530 Subject: powerpc/kprobes: Update optprobes to use emulate_update_regs() Optprobes depended on an updated regs->nip from analyse_instr() to identify the location to branch back from the optprobes trampoline. However, since commit 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify *regs"), analyse_instr() doesn't update the registers anymore. Due to this, we end up branching back from the optprobes trampoline to the same branch into the trampoline resulting in a loop. Fix this by calling out to emulate_update_regs() before using the nip. Additionally, explicitly compare the return value from analyse_instr() to 1, rather than just checking for !0 so as to guard against any future changes to analyse_instr() that may result in -1 being returned in more scenarios. Fixes: 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/optprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 6f8273f5e988..91e037ab20a1 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -104,8 +104,10 @@ static unsigned long can_optimize(struct kprobe *p) * and that can be emulated. */ if (!is_conditional_branch(*p->ainsn.insn) && - analyse_instr(&op, ®s, *p->ainsn.insn)) + analyse_instr(&op, ®s, *p->ainsn.insn) == 1) { + emulate_update_regs(®s, &op); nip = regs.nip; + } return nip; } -- cgit From 1b25fda0533462c9cee3a22e8a7bea68fa670af2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 19 Sep 2017 12:52:22 +0200 Subject: s390/topology: alternative topology for topology-less machines If running on machines that do not provide topology information we currently generate a "fake" topology which defines the maximum distance between each cpu: each cpu will be put into an own drawer. Historically this used to be the best option for (virtual) machines in overcommited hypervisors. For some workloads however it is better to generate a different topology where all cpus are siblings within a package (all cpus are core siblings). This shows performance improvements of up to 10%, depending on the workload. In order to keep the current behaviour, but also allow to switch to the different core sibling topology use the existing "topology=" kernel parameter: Specifying "topology=on" on machines without topology information will generate the core siblings (fake) topology information, instead of the default topology information where all cpus have the maximum distance. On machines which provide topology information specifying "topology=on" does not have any effect. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/early.c | 12 -------- arch/s390/kernel/topology.c | 72 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index ca8cd80e8feb..60181caf8e8a 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -404,18 +404,6 @@ static inline void save_vector_registers(void) #endif } -static int __init topology_setup(char *str) -{ - bool enabled; - int rc; - - rc = kstrtobool(str, &enabled); - if (!rc && !enabled) - S390_lowcore.machine_flags &= ~MACHINE_FLAG_TOPOLOGY; - return rc; -} -early_param("topology", topology_setup); - static int __init disable_vector_extension(char *str) { S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index bb47c92476f0..a0ce9c83f589 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -29,12 +29,20 @@ #define PTF_VERTICAL (1UL) #define PTF_CHECK (2UL) +enum { + TOPOLOGY_MODE_HW, + TOPOLOGY_MODE_SINGLE, + TOPOLOGY_MODE_PACKAGE, + TOPOLOGY_MODE_UNINITIALIZED +}; + struct mask_info { struct mask_info *next; unsigned char id; cpumask_t mask; }; +static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; @@ -59,11 +67,26 @@ static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) cpumask_t mask; cpumask_copy(&mask, cpumask_of(cpu)); - if (!MACHINE_HAS_TOPOLOGY) - return mask; - for (; info; info = info->next) { - if (cpumask_test_cpu(cpu, &info->mask)) - return info->mask; + switch (topology_mode) { + case TOPOLOGY_MODE_HW: + while (info) { + if (cpumask_test_cpu(cpu, &info->mask)) { + mask = info->mask; + break; + } + info = info->next; + } + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpumask_of(cpu)); + break; + case TOPOLOGY_MODE_PACKAGE: + cpumask_copy(&mask, cpu_present_mask); + break; + default: + /* fallthrough */ + case TOPOLOGY_MODE_SINGLE: + cpumask_copy(&mask, cpumask_of(cpu)); + break; } return mask; } @@ -74,7 +97,7 @@ static cpumask_t cpu_thread_map(unsigned int cpu) int i; cpumask_copy(&mask, cpumask_of(cpu)); - if (!MACHINE_HAS_TOPOLOGY) + if (topology_mode != TOPOLOGY_MODE_HW) return mask; cpu -= cpu % (smp_cpu_mtid + 1); for (i = 0; i <= smp_cpu_mtid; i++) @@ -223,7 +246,7 @@ int topology_set_cpu_management(int fc) static void update_cpu_masks(void) { struct cpu_topology_s390 *topo; - int cpu; + int cpu, id; for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; @@ -231,12 +254,13 @@ static void update_cpu_masks(void) topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); topo->drawer_mask = cpu_group_map(&drawer_info, cpu); - if (!MACHINE_HAS_TOPOLOGY) { + if (topology_mode != TOPOLOGY_MODE_HW) { + id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu; topo->thread_id = cpu; topo->core_id = cpu; - topo->socket_id = cpu; - topo->book_id = cpu; - topo->drawer_id = cpu; + topo->socket_id = id; + topo->book_id = id; + topo->drawer_id = id; if (cpu_present(cpu)) cpumask_set_cpu(cpu, &cpus_with_topology); } @@ -459,6 +483,12 @@ void __init topology_init_early(void) struct sysinfo_15_1_x *info; set_sched_topology(s390_topology); + if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { + if (MACHINE_HAS_TOPOLOGY) + topology_mode = TOPOLOGY_MODE_HW; + else + topology_mode = TOPOLOGY_MODE_SINGLE; + } if (!MACHINE_HAS_TOPOLOGY) goto out; tl_info = memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE); @@ -474,6 +504,26 @@ out: __arch_update_cpu_topology(); } +static inline int topology_get_mode(int enabled) +{ + if (!enabled) + return TOPOLOGY_MODE_SINGLE; + return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; +} + +static int __init topology_setup(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + return rc; + topology_mode = topology_get_mode(enabled); + return 0; +} +early_param("topology", topology_setup); + static int __init topology_init(void) { if (MACHINE_HAS_TOPOLOGY) -- cgit From 51dce3867c6c63c7500332e5448c2ba76808d6b5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 14 Sep 2017 14:42:32 +0200 Subject: s390/topology: enable / disable topology dynamically Add a new sysctl file /proc/sys/s390/topology which displays if topology is on (1) or off (0) as specified by the "topology=" kernel parameter. This allows to change topology information during runtime and configuring it via /etc/sysctl.conf instead of using the kernel line parameter. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/topology.c | 76 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index a0ce9c83f589..ed0bdd220e1a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -207,10 +209,8 @@ static void topology_update_polarization_simple(void) { int cpu; - mutex_lock(&smp_cpu_state_mutex); for_each_possible_cpu(cpu) smp_cpu_set_polarization(cpu, POLARIZATION_HRZ); - mutex_unlock(&smp_cpu_state_mutex); } static int ptf(unsigned long fc) @@ -278,6 +278,7 @@ static int __arch_update_cpu_topology(void) struct sysinfo_15_1_x *info = tl_info; int rc = 0; + mutex_lock(&smp_cpu_state_mutex); cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; @@ -287,6 +288,7 @@ static int __arch_update_cpu_topology(void) update_cpu_masks(); if (!MACHINE_HAS_TOPOLOGY) topology_update_polarization_simple(); + mutex_unlock(&smp_cpu_state_mutex); return rc; } @@ -313,6 +315,11 @@ void topology_schedule_update(void) schedule_work(&topology_work); } +static void topology_flush_work(void) +{ + flush_work(&topology_work); +} + static void topology_timer_fn(unsigned long ignored) { if (ptf(PTF_CHECK)) @@ -511,6 +518,11 @@ static inline int topology_get_mode(int enabled) return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; } +static inline int topology_is_enabled(void) +{ + return topology_mode != TOPOLOGY_MODE_SINGLE; +} + static int __init topology_setup(char *str) { bool enabled; @@ -524,12 +536,72 @@ static int __init topology_setup(char *str) } early_param("topology", topology_setup); +static int topology_ctl_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int len; + int new_mode; + char buf[2]; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + if (!write) { + strncpy(buf, topology_is_enabled() ? "1\n" : "0\n", + ARRAY_SIZE(buf)); + len = strnlen(buf, ARRAY_SIZE(buf)); + if (len > *lenp) + len = *lenp; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + goto out; + } + len = *lenp; + if (copy_from_user(buf, buffer, len > sizeof(buf) ? sizeof(buf) : len)) + return -EFAULT; + if (buf[0] != '0' && buf[0] != '1') + return -EINVAL; + mutex_lock(&smp_cpu_state_mutex); + new_mode = topology_get_mode(buf[0] == '1'); + if (topology_mode != new_mode) { + topology_mode = new_mode; + topology_schedule_update(); + } + mutex_unlock(&smp_cpu_state_mutex); + topology_flush_work(); +out: + *lenp = len; + *ppos += len; + return 0; +} + +static struct ctl_table topology_ctl_table[] = { + { + .procname = "topology", + .mode = 0644, + .proc_handler = topology_ctl_handler, + }, + { }, +}; + +static struct ctl_table topology_dir_table[] = { + { + .procname = "s390", + .maxlen = 0, + .mode = 0555, + .child = topology_ctl_table, + }, + { }, +}; + static int __init topology_init(void) { if (MACHINE_HAS_TOPOLOGY) set_topology_timer(); else topology_update_polarization_simple(); + register_sysctl_table(topology_dir_table); return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); } device_initcall(topology_init); -- cgit From 9e09007486c883de4aa78a384e1d54c3fe6e42d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 18:12:36 +0900 Subject: kbuild: rpm-pkg: delete firmware_install to fix build error Commit 5620a0d1aacd ("firmware: delete in-kernel firmware") deleted in-kernel firmware support, including "make firmware_install". Since then, "make rpm-pkg" / "make binrpm-pkg" fails to build with the error: make[2]: *** No rule to make target `firmware_install'. Stop. Commit df85b2d767aa ("firmware: Restore support for built-in firmware") restored the build infrastructure for CONFIG_EXTRA_FIRMWARE, but this is out of the scope of "make firmware_install". So, the right thing to do is to kill the use of "make firmware_install". Fixes: 5620a0d1aacd ("firmware: delete in-kernel firmware") Signed-off-by: Masahiro Yamada Acked-by: Greg Kroah-Hartman --- scripts/package/mkspec | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index bb43f153fd8e..bf3e9abb2846 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -88,11 +88,8 @@ echo 'mkdir -p $RPM_BUILD_ROOT/boot/efi $RPM_BUILD_ROOT/lib/modules' echo "%else" echo 'mkdir -p $RPM_BUILD_ROOT/boot $RPM_BUILD_ROOT/lib/modules' echo "%endif" -echo 'mkdir -p $RPM_BUILD_ROOT'"/lib/firmware/$KERNELRELEASE" -echo 'INSTALL_MOD_PATH=$RPM_BUILD_ROOT make %{?_smp_mflags} KBUILD_SRC= mod-fw= modules_install' -echo 'INSTALL_FW_PATH=$RPM_BUILD_ROOT'"/lib/firmware/$KERNELRELEASE" -echo 'make INSTALL_FW_PATH=$INSTALL_FW_PATH' firmware_install +echo 'INSTALL_MOD_PATH=$RPM_BUILD_ROOT make %{?_smp_mflags} KBUILD_SRC= modules_install' echo "%ifarch ia64" echo 'cp $KBUILD_IMAGE $RPM_BUILD_ROOT'"/boot/efi/vmlinuz-$KERNELRELEASE" echo 'ln -s '"efi/vmlinuz-$KERNELRELEASE" '$RPM_BUILD_ROOT'"/boot/" @@ -119,7 +116,7 @@ if ! $PREBUILT; then echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/build" echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/source" echo "mkdir -p "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE" -echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* --exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation --exclude=firmware --exclude .config.old --exclude .missing-syscalls.d\"" +echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* --exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation --exclude .config.old --exclude .missing-syscalls.d\"" echo "tar "'$EXCLUDES'" -cf- . | (cd "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE;tar xvf -)" echo 'cd $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE" echo "ln -sf /usr/src/kernels/$KERNELRELEASE build" @@ -154,7 +151,6 @@ echo '%defattr (-, root, root)' echo "/lib/modules/$KERNELRELEASE" echo "%exclude /lib/modules/$KERNELRELEASE/build" echo "%exclude /lib/modules/$KERNELRELEASE/source" -echo "/lib/firmware/$KERNELRELEASE" echo "/boot/*" echo "" echo "%files headers" -- cgit From cc18abbe449aafc013831a8e0440afc336ae1cba Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 18:22:19 +0900 Subject: kbuild: deb-pkg: remove firmware package support Commit 5620a0d1aacd ("firmware: delete in-kernel firmware") deleted in-kernel firmware support, including the firmware install command. So, the firmware package does not make sense any more. Remove it. Signed-off-by: Masahiro Yamada Reviewed-by: Riku Voipio Acked-by: Greg Kroah-Hartman --- scripts/package/builddeb | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index aad67000e4dd..0bc87473f68f 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -92,12 +92,10 @@ else fi sourcename=$KDEB_SOURCENAME tmpdir="$objtree/debian/tmp" -fwdir="$objtree/debian/fwtmp" kernel_headers_dir="$objtree/debian/hdrtmp" libc_headers_dir="$objtree/debian/headertmp" dbg_dir="$objtree/debian/dbgtmp" packagename=linux-image-$version -fwpackagename=linux-firmware-image-$version kernel_headers_packagename=linux-headers-$version libc_headers_packagename=linux-libc-dev dbg_packagename=$packagename-dbg @@ -126,10 +124,9 @@ esac BUILD_DEBUG="$(grep -s '^CONFIG_DEBUG_INFO=y' $KCONFIG_CONFIG || true)" # Setup the directory structure -rm -rf "$tmpdir" "$fwdir" "$kernel_headers_dir" "$libc_headers_dir" "$dbg_dir" $objtree/debian/files +rm -rf "$tmpdir" "$kernel_headers_dir" "$libc_headers_dir" "$dbg_dir" $objtree/debian/files mkdir -m 755 -p "$tmpdir/DEBIAN" mkdir -p "$tmpdir/lib" "$tmpdir/boot" -mkdir -p "$fwdir/lib/firmware/$version/" mkdir -p "$kernel_headers_dir/lib/modules/$version/" # Build and install the kernel @@ -306,7 +303,6 @@ else cat <> debian/control Package: $packagename -Suggests: $fwpackagename Architecture: any Description: Linux kernel, version $version This package contains the Linux kernel, modules and corresponding other @@ -345,22 +341,6 @@ Description: Linux kernel headers for $KERNELRELEASE on \${kernel:debarch} This is useful for people who need to build external modules EOF -# Do we have firmware? Move it out of the way and build it into a package. -if [ -e "$tmpdir/lib/firmware" ]; then - mv "$tmpdir/lib/firmware"/* "$fwdir/lib/firmware/$version/" - rmdir "$tmpdir/lib/firmware" - - cat <> debian/control - -Package: $fwpackagename -Architecture: all -Description: Linux kernel firmware, version $version - This package contains firmware from the Linux kernel, version $version. -EOF - - create_package "$fwpackagename" "$fwdir" -fi - cat <> debian/control Package: $libc_headers_packagename -- cgit From 25b080bd53f2873785fa049f570ac1b361c11d72 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Sep 2017 22:01:26 +0900 Subject: kbuild: rpm-pkg: fix version number handling The "Release:" field of the spec file is determined based on the .version file. However, the .version file is not copied to the source tar file. So, when we build the kernel from the source package, the UTS_VERSION always indicates #1. This does not match with "rpm -q". The kernel UTS_VERSION and "rpm -q" do not agree for binrpm-pkg, either. Please note the kernel has already been built before the spec file is created. Currently, mkspec invokes mkversion. This script returns an incremented version. So, the "Release:" field of the spec file is greater than the version in the kernel by one. For the source package build (where .version file is missing), we can give KBUILD_BUILD_VERSION=%{release} to the build command. For the binary package build, we can simply read out the .version file because it contains the version number that was used for building the kernel image. We can remove scripts/mkversion because scripts/package/Makefile need not touch the .version file. Signed-off-by: Masahiro Yamada --- scripts/mkversion | 6 ------ scripts/package/Makefile | 5 ----- scripts/package/mkspec | 6 ++---- 3 files changed, 2 insertions(+), 15 deletions(-) delete mode 100644 scripts/mkversion diff --git a/scripts/mkversion b/scripts/mkversion deleted file mode 100644 index c12addc9c7ef..000000000000 --- a/scripts/mkversion +++ /dev/null @@ -1,6 +0,0 @@ -if [ ! -f .version ] -then - echo 1 -else - expr 0`cat .version` + 1 -fi diff --git a/scripts/package/Makefile b/scripts/package/Makefile index 71b4a8af9d4d..73f9f3192b9f 100644 --- a/scripts/package/Makefile +++ b/scripts/package/Makefile @@ -50,8 +50,6 @@ rpm-pkg rpm: FORCE $(MAKE) clean $(CONFIG_SHELL) $(MKSPEC) >$(objtree)/kernel.spec $(call cmd,src_tar,$(KERNELPATH),kernel.spec) - $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version - mv -f $(objtree)/.tmp_version $(objtree)/.version rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz rm $(KERNELPATH).tar.gz kernel.spec @@ -60,9 +58,6 @@ rpm-pkg rpm: FORCE binrpm-pkg: FORCE $(MAKE) KBUILD_SRC= $(CONFIG_SHELL) $(MKSPEC) prebuilt > $(objtree)/binkernel.spec - $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version - mv -f $(objtree)/.tmp_version $(objtree)/.version - rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \ $(UTS_MACHINE) -bb $(objtree)/binkernel.spec rm binkernel.spec diff --git a/scripts/package/mkspec b/scripts/package/mkspec index bf3e9abb2846..f47f17aae135 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -27,9 +27,7 @@ __KERNELRELEASE=`echo $KERNELRELEASE | sed -e "s/-/_/g"` echo "Name: kernel" echo "Summary: The Linux Kernel" echo "Version: $__KERNELRELEASE" -# we need to determine the NEXT version number so that uname and -# rpm -q will agree -echo "Release: `. $srctree/scripts/mkversion`" +echo "Release: $(cat .version 2>/dev/null || echo 1)" echo "License: GPL" echo "Group: System Environment/Kernel" echo "Vendor: The Linux Community" @@ -77,7 +75,7 @@ fi echo "%build" if ! $PREBUILT; then -echo "make clean && make %{?_smp_mflags}" +echo "make clean && make %{?_smp_mflags} KBUILD_BUILD_VERSION=%{release}" echo "" fi -- cgit From 35f3c984548524be5eb4c3f5295cf58d909e8746 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Mon, 18 Sep 2017 17:18:44 -0700 Subject: scripts/dtc: dtx_diff - 2nd update of include dts paths to match build Update dtx_diff include paths in the same manner as: commit b12869a8d519 ("of: remove drivers/of/testcase-data from include search path for CPP"), commit 5ffa2aed389c ("of: remove arch/$(SRCARCH)/boot/dts from include search path for CPP"), and commit 50f9ddaf64e1 ("of: search scripts/dtc/include-prefixes path for both CPP and DTC"). Remove proposed include path kernel/dts/, which was never implemented for the dtb build. For the diff case, each source file is compiled separately. For each of those compiles, provide the location of the source file as an include path, not the location of both source files. Signed-off-by: Frank Rowand Signed-off-by: Rob Herring --- scripts/dtc/dtx_diff | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/scripts/dtc/dtx_diff b/scripts/dtc/dtx_diff index f9a3d8d23c64..8c4fbad2055e 100755 --- a/scripts/dtc/dtx_diff +++ b/scripts/dtc/dtx_diff @@ -86,6 +86,7 @@ eod compile_to_dts() { dtx="$1" + dtc_include="$2" if [ -d "${dtx}" ] ; then @@ -113,7 +114,7 @@ compile_to_dts() { # ----- input is DTS (source) if ( cpp ${cpp_flags} -x assembler-with-cpp ${dtx} \ - | ${DTC} -I dts ) ; then + | ${DTC} ${dtc_include} -I dts ) ; then return fi @@ -320,18 +321,13 @@ fi cpp_flags="\ -nostdinc \ - -I${srctree}/arch/${ARCH}/boot/dts \ -I${srctree}/scripts/dtc/include-prefixes \ - -I${srctree}/drivers/of/testcase-data \ -undef -D__DTS__" -dtc_flags="\ - -i ${srctree}/arch/${ARCH}/boot/dts/ \ - -i ${srctree}/kernel/dts \ - ${dtx_path_1_dtc_include} \ - ${dtx_path_2_dtc_include}" - -DTC="${DTC} ${dtc_flags} -O dts -qq -f ${dtc_sort} -o -" +DTC="\ + ${DTC} \ + -i ${srctree}/scripts/dtc/include-prefixes \ + -O dts -qq -f ${dtc_sort} -o -" # ----- do the diff or decompile @@ -339,11 +335,11 @@ DTC="${DTC} ${dtc_flags} -O dts -qq -f ${dtc_sort} -o -" if (( ${cmd_diff} )) ; then diff ${diff_flags} --label "${dtx_file_1}" --label "${dtx_file_2}" \ - <(compile_to_dts "${dtx_file_1}") \ - <(compile_to_dts "${dtx_file_2}") + <(compile_to_dts "${dtx_file_1}" "${dtx_path_1_dtc_include}") \ + <(compile_to_dts "${dtx_file_2}" "${dtx_path_2_dtc_include}") else - compile_to_dts "${dtx_file_1}" + compile_to_dts "${dtx_file_1}" "${dtx_path_1_dtc_include}" fi -- cgit From 749aaf3372b8b56b8743c3e27700d64c8bd06921 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Wed, 20 Sep 2017 13:56:06 -0500 Subject: PCI: endpoint: Use correct "end of test" interrupt pci_epf_test_raise_irq() reads the interrupt to use for the response from reg->command, but this has been cleared at the beginning of the command handler so the value is always zero at this point. Instead, extract the interrupt index before handling the command and then pass the requested interrupt into pci_epf_test_raise_irq(). This allows us to remove the specific code to extract the interrupt for COMMAND_RAISE_MSI_IRQ since it is now handled in common code. Fixes: 3ecf3232c54c ("PCI: endpoint: Do not reset *command* inadvertently") Signed-off-by: John Keeping Signed-off-by: Bjorn Helgaas Acked-by: Kishon Vijay Abraham I --- drivers/pci/endpoint/functions/pci-epf-test.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 4ddc6e8f9fe7..f9308c2f22e6 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -251,9 +251,8 @@ err: return ret; } -static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test) +static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq) { - u8 irq; u8 msi_count; struct pci_epf *epf = epf_test->epf; struct pci_epc *epc = epf->epc; @@ -262,7 +261,6 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test) reg->status |= STATUS_IRQ_RAISED; msi_count = pci_epc_get_msi(epc); - irq = (reg->command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT; if (irq > msi_count || msi_count <= 0) pci_epc_raise_irq(epc, PCI_EPC_IRQ_LEGACY, 0); else @@ -289,6 +287,8 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->command = 0; reg->status = 0; + irq = (command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT; + if (command & COMMAND_RAISE_LEGACY_IRQ) { reg->status = STATUS_IRQ_RAISED; pci_epc_raise_irq(epc, PCI_EPC_IRQ_LEGACY, 0); @@ -301,7 +301,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->status |= STATUS_WRITE_FAIL; else reg->status |= STATUS_WRITE_SUCCESS; - pci_epf_test_raise_irq(epf_test); + pci_epf_test_raise_irq(epf_test, irq); goto reset_handler; } @@ -311,7 +311,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->status |= STATUS_READ_SUCCESS; else reg->status |= STATUS_READ_FAIL; - pci_epf_test_raise_irq(epf_test); + pci_epf_test_raise_irq(epf_test, irq); goto reset_handler; } @@ -321,13 +321,12 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->status |= STATUS_COPY_SUCCESS; else reg->status |= STATUS_COPY_FAIL; - pci_epf_test_raise_irq(epf_test); + pci_epf_test_raise_irq(epf_test, irq); goto reset_handler; } if (command & COMMAND_RAISE_MSI_IRQ) { msi_count = pci_epc_get_msi(epc); - irq = (command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT; if (irq > msi_count || msi_count <= 0) goto reset_handler; reg->status = STATUS_IRQ_RAISED; -- cgit From 008ba2a13f2d04c947adc536d19debb8fe66f110 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 14 Sep 2017 17:14:41 -0400 Subject: packet: hold bind lock when rebinding to fanout hook Packet socket bind operations must hold the po->bind_lock. This keeps po->running consistent with whether the socket is actually on a ptype list to receive packets. fanout_add unbinds a socket and its packet_rcv/tpacket_rcv call, then binds the fanout object to receive through packet_rcv_fanout. Make it hold the po->bind_lock when testing po->running and rebinding. Else, it can race with other rebind operations, such as that in packet_set_ring from packet_rcv to tpacket_rcv. Concurrent updates can result in a socket being added to a fanout group twice, causing use-after-free KASAN bug reports, among others. Reported independently by both trinity and syzkaller. Verified that the syzkaller reproducer passes after this patch. Fixes: dc99f600698d ("packet: Add fanout support.") Reported-by: nixioaming Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c26172995511..d288f52c53f7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1684,10 +1684,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1749,7 +1745,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); -- cgit From ec9dd352d591f0c90402ec67a317c1ed4fb2e638 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: bpf: one perf event close won't free bpf program attached by another perf event This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. user program with fd1 does not work properly as tracepoint no output any more. The issue happens at step 4. Multiple perf_event_open can be called successfully, but only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog. The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/trace_events.h | 1 + kernel/events/core.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 7f11050746ae..2e0f22298fe9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -272,6 +272,7 @@ struct trace_event_call { int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog *prog; + struct perf_event *bpf_prog_owner; int (*perf_perm)(struct trace_event_call *, struct perf_event *); diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } -- cgit From 1fa089ec6d68849bfe60abd141375dc9b21b0ee8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Sep 2017 16:46:49 -0500 Subject: [SMB3] Update session and share information displayed for debugging SMB2/SMB3 We were not displaying some key fields (session status and capabilities and whether guest authenticated) for SMB2/SMB3 session in /proc/fs/cifs/DebugData. This is needed for real world triage of problems with the (now much more common) SMB3 mounts. Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9727e1dcacd5..cbb9534b89b4 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -160,8 +160,13 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) entry for %s not fully " - "displayed\n\t", i, ses->serverName); + seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t", + i, ses->serverName, ses->ses_count, + ses->capabilities, ses->status); + if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) + seq_printf(m, "Guest\t"); + else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + seq_printf(m, "Anonymous\t"); } else { seq_printf(m, "\n%d) Name: %s Domain: %s Uses: %d OS:" -- cgit From b22666febf6fc67776d49782057fe4dd06f41552 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 20 Sep 2017 18:10:17 -0400 Subject: drm/amdkfd: Fix incorrect destroy_mqd parameter When uninitializing a kernel queue. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 0649dd43e780..a4ca1133482c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq) if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, kq->queue->mqd, - false, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, kq->queue->queue); -- cgit From cb1d9967461cdf3b6aac6317c8d954a14f842571 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 20 Sep 2017 18:10:21 -0400 Subject: drm/amdkfd: Fix kernel-queue wrapping bugs Avoid intermediate negative numbers when doing calculations with a mix of signed and unsigned variables where implicit conversions can lead to unexpected results. When kernel queue buffer wraps around to 0, we need to check that rptr won't be overwritten by the new packet. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index a4ca1133482c..ed71ad40e8f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq, uint32_t wptr, rptr; unsigned int *queue_address; + /* When rptr == wptr, the buffer is empty. + * When rptr == wptr + 1, the buffer is full. + * It is always rptr that advances to the position of wptr, rather than + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. + */ rptr = *kq->rptr_kernel; wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq, pr_debug("wptr: %d\n", wptr); pr_debug("queue_address 0x%p\n", queue_address); - available_size = (rptr - 1 - wptr + queue_size_dwords) % + available_size = (rptr + queue_size_dwords - 1 - wptr) % queue_size_dwords; - if (packet_size_in_dwords >= queue_size_dwords || - packet_size_in_dwords >= available_size) { + if (packet_size_in_dwords > available_size) { /* * make sure calling functions know * acquire_packet_buffer() failed @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq, } if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. + */ + if (packet_size_in_dwords >= rptr) { + *buffer_ptr = NULL; + return -ENOMEM; + } + /* fill nops, roll back and start at position 0 */ while (wptr > 0) { queue_address[wptr] = kq->nop_packet; wptr = (wptr + 1) % queue_size_dwords; -- cgit From c986169fdec992c464c84d0a3abdbfc846ed3df9 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 20 Sep 2017 18:10:22 -0400 Subject: drm/amdkfd: Print event limit messages only once per process To avoid spamming the log. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 ++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 5979158c3f7b..944abfad39c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd, struct kfd_event *ev) { if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { - pr_warn("Signal event wasn't created because limit was reached\n"); + if (!p->signal_event_limit_reached) { + pr_warn("Signal event wasn't created because limit was reached\n"); + p->signal_event_limit_reached = true; + } return -ENOMEM; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index b397ec726400..b87e96cee5fa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -521,6 +521,7 @@ struct kfd_process { struct list_head signal_event_pages; u32 next_nonsignal_event_id; size_t signal_event_count; + bool signal_event_limit_reached; }; /** -- cgit From c2a64bb9fcd31c39feddf30748b4ee8d82e53c6a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 13:19:13 -0400 Subject: net: compat: assert the size of cmsg copied in is as expected The actual length of cmsg fetched in during the second loop (i.e., kcmsg - kcmsg_base) could be different from what we get from the first loop (i.e., kcmlen). The main reason is that the two get_user() calls in the two loops (i.e., get_user(ucmlen, &ucmsg->cmsg_len) and __get_user(ucmlen, &ucmsg->cmsg_len)) could cause ucmlen to have different values even they fetch from the same userspace address, as user can race to change the memory content in &ucmsg->cmsg_len across fetches. Although in the second loop, the sanity check if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) is inplace, it only ensures that the cmsg fetched in during the second loop does not exceed the length of kcmlen, but not necessarily equal to kcmlen. But indicated by the assignment kmsg->msg_controllen = kcmlen, we should enforce that. This patch adds this additional sanity check and ensures that what is recorded in kmsg->msg_controllen is the actual cmsg length. Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/compat.c b/net/compat.c index 6ded6c821d7a..22381719718c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* + * check the length of messages copied in is the same as the + * what we get from the first loop + */ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; -- cgit From 92dd5452c1be873a1193561f4f691763103d22ac Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 19 Sep 2017 18:45:56 +0100 Subject: net: change skb->mac_header when Generic XDP calls adjust_head Since XDP's view of the packet includes the MAC header, moving the start- of-packet with bpf_xdp_adjust_head needs to also update the offset of the MAC header (which is relative to skb->head, not to the skb->data that was changed). Without this, tcpdump sees packets starting from the old MAC header rather than the new one, at least in my tests on the loopback device. Fixes: b5cdae3291f7 ("net: Generic XDP") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..9a2254f9802f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: -- cgit From 5e62d98c4bbc243f3dca18e73a754b629839fc5c Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:07 -0700 Subject: net: fec: only check queue 0 if RXF_0/TXF_0 interrupt is set Before queue 0 was always checked if any queue caused an interrupt. It is better to just mark queue 0 if queue 0 has caused an interrupt. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 56f56d6ada9c..464055fb33d5 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1559,14 +1559,14 @@ fec_enet_collect_events(struct fec_enet_private *fep, uint int_events) if (int_events == 0) return false; - if (int_events & FEC_ENET_RXF) + if (int_events & FEC_ENET_RXF_0) fep->work_rx |= (1 << 2); if (int_events & FEC_ENET_RXF_1) fep->work_rx |= (1 << 0); if (int_events & FEC_ENET_RXF_2) fep->work_rx |= (1 << 1); - if (int_events & FEC_ENET_TXF) + if (int_events & FEC_ENET_TXF_0) fep->work_tx |= (1 << 2); if (int_events & FEC_ENET_TXF_1) fep->work_tx |= (1 << 0); -- cgit From 7063c163cd4a20184b3bbada503dab8f254a8c56 Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:08 -0700 Subject: net: fec: remove unused interrupt FEC_ENET_TS_TIMER FEC_ENET_TS_TIMER is not checked in the interrupt routine so there is no need to enable it. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 38c7b21e5d63..ede1876a9a19 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -374,8 +374,8 @@ struct bufdesc_ex { #define FEC_ENET_TS_AVAIL ((uint)0x00010000) #define FEC_ENET_TS_TIMER ((uint)0x00008000) -#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII | FEC_ENET_TS_TIMER) -#define FEC_NAPI_IMASK (FEC_ENET_MII | FEC_ENET_TS_TIMER) +#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII) +#define FEC_NAPI_IMASK FEC_ENET_MII #define FEC_RX_DISABLED_IMASK (FEC_DEFAULT_IMASK & (~FEC_ENET_RXF)) /* ENET interrupt coalescing macro define */ -- cgit From e24ee2780a1d6786b94139ebf95b44c9ae62bf13 Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:09 -0700 Subject: net: fec: return IRQ_HANDLED if fec_ptp_check_pps_event handled it fec_ptp_check_pps_event will return 1 if FEC_T_TF_MASK caused an interrupt. Don't return IRQ_NONE in this case. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 464055fb33d5..3dc2d771a222 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1604,8 +1604,8 @@ fec_enet_interrupt(int irq, void *dev_id) } if (fep->ptp_clock) - fec_ptp_check_pps_event(fep); - + if (fec_ptp_check_pps_event(fep)) + ret = IRQ_HANDLED; return ret; } -- cgit From 02388bf87f72e1d47174cd8f81c34443920eb5a0 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 21:49:55 -0400 Subject: isdn/i4l: fetch the ppp_write buffer in one shot In isdn_ppp_write(), the header (i.e., protobuf) of the buffer is fetched twice from userspace. The first fetch is used to peek at the protocol of the message and reset the huptimer if necessary; while the second fetch copies in the whole buffer. However, given that buf resides in userspace memory, a user process can race to change its memory content across fetches. By doing so, we can either avoid resetting the huptimer for any type of packets (by first setting proto to PPP_LCP and later change to the actual type) or force resetting the huptimer for LCP packets. This patch changes this double-fetch behavior into two single fetches decided by condition (lp->isdn_device < 0 || lp->isdn_channel <0). A more detailed discussion can be found at https://marc.info/?l=linux-kernel&m=150586376926123&w=2 Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- drivers/isdn/i4l/isdn_ppp.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 6c44609fd83a..cd2b3c69771a 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -825,7 +825,6 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) isdn_net_local *lp; struct ippp_struct *is; int proto; - unsigned char protobuf[4]; is = file->private_data; @@ -839,24 +838,28 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) if (!lp) printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n"); else { - /* - * Don't reset huptimer for - * LCP packets. (Echo requests). - */ - if (copy_from_user(protobuf, buf, 4)) - return -EFAULT; - proto = PPP_PROTOCOL(protobuf); - if (proto != PPP_LCP) - lp->huptimer = 0; + if (lp->isdn_device < 0 || lp->isdn_channel < 0) { + unsigned char protobuf[4]; + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + if (copy_from_user(protobuf, buf, 4)) + return -EFAULT; + + proto = PPP_PROTOCOL(protobuf); + if (proto != PPP_LCP) + lp->huptimer = 0; - if (lp->isdn_device < 0 || lp->isdn_channel < 0) return 0; + } if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) && lp->dialstate == 0 && (lp->flags & ISDN_NET_CONNECTED)) { unsigned short hl; struct sk_buff *skb; + unsigned char *cpy_buf; /* * we need to reserve enough space in front of * sk_buff. old call to dev_alloc_skb only reserved @@ -869,11 +872,21 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) return count; } skb_reserve(skb, hl); - if (copy_from_user(skb_put(skb, count), buf, count)) + cpy_buf = skb_put(skb, count); + if (copy_from_user(cpy_buf, buf, count)) { kfree_skb(skb); return -EFAULT; } + + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + proto = PPP_PROTOCOL(cpy_buf); + if (proto != PPP_LCP) + lp->huptimer = 0; + if (is->debug & 0x40) { printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len); isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); -- cgit From e92a0843795779678397ac0790a76de20f79cc13 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:50 +0800 Subject: net: hns3: Cleanup for ROCE capability flag in ae_dev This patch add the ROCE supported flag in the driver_data field of pci_device_id, delete roce_pci_tbl and change HNAE_DEV_SUPPORT_ROCE_B to HNAE3_DEV_SUPPORT_ROCE_B. This cleanup is done in order to support adding capability in pci_device_id and to fix initialization failure when cmd is not supported. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 5 ++++- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 25 ++++------------------ .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 16 +++++++++----- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index b2f28ae81273..0f7b61a92f44 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -49,7 +49,10 @@ #define HNAE3_CLASS_NAME_SIZE 16 #define HNAE3_DEV_INITED_B 0x0 -#define HNAE_DEV_SUPPORT_ROCE_B 0x1 +#define HNAE3_DEV_SUPPORT_ROCE_B 0x1 + +#define hnae3_dev_roce_supported(hdev) \ + hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B) #define ring_ptr_move_fw(ring, p) \ ((ring)->p = ((ring)->p + 1) % (ring)->desc_num) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 74008ef23169..6953d19c6475 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -46,17 +46,7 @@ static const struct pci_device_id ae_algo_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, - /* Required last entry */ - {0, } -}; - -static const struct pci_device_id roce_pci_tbl[] = { - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, - /* Required last entry */ + /* required last entry */ {0, } }; @@ -894,7 +884,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) hdev->num_tqps = __le16_to_cpu(req->tqp_num); hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S; - if (hnae_get_bit(hdev->ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B)) { + if (hnae3_dev_roce_supported(hdev)) { hdev->num_roce_msix = hnae_get_field(__le16_to_cpu(req->pf_intr_vector_number), HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S); @@ -3932,8 +3922,7 @@ static int hclge_init_client_instance(struct hnae3_client *client, goto err; if (hdev->roce_client && - hnae_get_bit(hdev->ae_dev->flag, - HNAE_DEV_SUPPORT_ROCE_B)) { + hnae3_dev_roce_supported(hdev)) { struct hnae3_client *rc = hdev->roce_client; ret = hclge_init_roce_base_info(vport); @@ -3956,8 +3945,7 @@ static int hclge_init_client_instance(struct hnae3_client *client, break; case HNAE3_CLIENT_ROCE: - if (hnae_get_bit(hdev->ae_dev->flag, - HNAE_DEV_SUPPORT_ROCE_B)) { + if (hnae3_dev_roce_supported(hdev)) { hdev->roce_client = client; vport->roce.client = client; } @@ -4069,7 +4057,6 @@ static void hclge_pci_uninit(struct hclge_dev *hdev) static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) { struct pci_dev *pdev = ae_dev->pdev; - const struct pci_device_id *id; struct hclge_dev *hdev; int ret; @@ -4084,10 +4071,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hdev->ae_dev = ae_dev; ae_dev->priv = hdev; - id = pci_match_id(roce_pci_tbl, ae_dev->pdev); - if (id) - hnae_set_bit(ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B, 1); - ret = hclge_pci_init(hdev); if (ret) { dev_err(&pdev->dev, "PCI init failed\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 4d68d6ea5143..94d8bb5b92f0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -41,11 +41,16 @@ static struct hnae3_client client; static const struct pci_device_id hns3_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, /* required last entry */ {0, } }; @@ -1348,6 +1353,7 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } ae_dev->pdev = pdev; + ae_dev->flag = ent->driver_data; ae_dev->dev_type = HNAE3_DEV_KNIC; pci_set_drvdata(pdev, ae_dev); -- cgit From 2daf4a6536f3109ed0ed758cec14743e0e5c20ea Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:51 +0800 Subject: net: hns3: Fix initialization when cmd is not supported When ae_dev doesn't support DCB, rx_priv_wl_config, common_thrd_config and tm_qs_bp_cfg can't be called, otherwise cmd return fail, which causes the hclge module initialization process to fail. This patch fix it by adding a DCB capability flag to check if the ae_dev support DCB. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 7 ++++++ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 26 +++++++++++++--------- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 4 ++++ .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 10 ++++----- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 0f7b61a92f44..ad685f5aa6d1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -50,10 +50,17 @@ #define HNAE3_DEV_INITED_B 0x0 #define HNAE3_DEV_SUPPORT_ROCE_B 0x1 +#define HNAE3_DEV_SUPPORT_DCB_B 0x2 + +#define HNAE3_DEV_SUPPORT_ROCE_DCB_BITS (BIT(HNAE3_DEV_SUPPORT_DCB_B) |\ + BIT(HNAE3_DEV_SUPPORT_ROCE_B)) #define hnae3_dev_roce_supported(hdev) \ hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B) +#define hnae3_dev_dcb_supported(hdev) \ + hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_DCB_B) + #define ring_ptr_move_fw(ring, p) \ ((ring)->p = ((ring)->p + 1) % (ring)->desc_num) #define ring_ptr_move_bw(ring, p) \ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6953d19c6475..903f43a8c2a1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1772,18 +1772,22 @@ int hclge_buffer_alloc(struct hclge_dev *hdev) return ret; } - ret = hclge_rx_priv_wl_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "could not configure rx private waterline %d\n", ret); - return ret; - } + if (hnae3_dev_dcb_supported(hdev)) { + ret = hclge_rx_priv_wl_config(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not configure rx private waterline %d\n", + ret); + return ret; + } - ret = hclge_common_thrd_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "could not configure common threshold %d\n", ret); - return ret; + ret = hclge_common_thrd_config(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not configure common threshold %d\n", + ret); + return ret; + } } ret = hclge_common_wl_config(hdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 1c577d268f00..c91dbf19c4b1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -976,6 +976,10 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev) if (ret) return ret; + /* Only DCB-supported dev supports qset back pressure setting */ + if (!hnae3_dev_dcb_supported(hdev)) + return 0; + for (i = 0; i < hdev->tm_info.num_tc; i++) { ret = hclge_tm_qs_bp_cfg(hdev, i); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 94d8bb5b92f0..35369e1c8036 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -42,15 +42,15 @@ static const struct pci_device_id hns3_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, /* required last entry */ {0, } }; -- cgit From d221df4e0faae2b9cc8ad78f3e5e777461b6b542 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:52 +0800 Subject: net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB When ae_dev doesn't support DCB, DEFAULT_DV must be set to a lower value, otherwise the buffer allocation process will fail. This patch fix it by setting it to 30K bytes. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index c2b613b40509..30e2ad5ac0da 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -688,6 +688,7 @@ struct hclge_reset_tqp_queue { #define HCLGE_DEFAULT_TX_BUF 0x4000 /* 16k bytes */ #define HCLGE_TOTAL_PKT_BUF 0x108000 /* 1.03125M bytes */ #define HCLGE_DEFAULT_DV 0xA000 /* 40k byte */ +#define HCLGE_DEFAULT_NON_DCB_DV 0x7800 /* 30K byte */ #define HCLGE_TYPE_CRQ 0 #define HCLGE_TYPE_CSQ 1 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 903f43a8c2a1..796370adf99c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1444,7 +1444,11 @@ static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) tc_num = hclge_get_tc_num(hdev); pfc_enable_num = hclge_get_pfc_enalbe_num(hdev); - shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV; + if (hnae3_dev_dcb_supported(hdev)) + shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV; + else + shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_NON_DCB_DV; + shared_buf_tc = pfc_enable_num * hdev->mps + (tc_num - pfc_enable_num) * hdev->mps / 2 + hdev->mps; -- cgit From bb1fe9ea6371e075d3d1448cd3ff6441d31307be Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:53 +0800 Subject: net: hns3: Fix for not setting rx private buffer size to zero When rx private buffer is disabled, there may be some case that the rx private buffer is not set to zero, which may cause buffer allocation process to fail. This patch fixes this problem by setting priv->enable to 0 and priv->buf_size to zero when rx private buffer is disabled. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 796370adf99c..a7d8fb1e15f6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1504,6 +1504,11 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) priv->wl.high = 2 * hdev->mps; priv->buf_size = priv->wl.high; } + } else { + priv->enable = 0; + priv->wl.low = 0; + priv->wl.high = 0; + priv->buf_size = 0; } } @@ -1516,8 +1521,15 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { priv = &hdev->priv_buf[i]; - if (hdev->hw_tc_map & BIT(i)) - priv->enable = 1; + priv->enable = 0; + priv->wl.low = 0; + priv->wl.high = 0; + priv->buf_size = 0; + + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + priv->enable = 1; if (hdev->tm_info.hw_pfc_map & BIT(i)) { priv->wl.low = 128; -- cgit From b8c8bf47da5576657370798da6f18a8cb0245d5b Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:54 +0800 Subject: net: hns3: Fix for rx_priv_buf_alloc not setting rx shared buffer rx_priv_buf_alloc is used to tell hardware how much buffer is used for rx direction, right now only the private buffer is assigned. For ae_dev that doesn't support DCB, private rx buffer is assigned to zero, only shared rx buffer is used. So not setting the shared rx buffer cause dropping of packet in SSU. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 3 ++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 30e2ad5ac0da..758cf3948131 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -270,7 +270,8 @@ struct hclge_tx_buff_alloc { struct hclge_rx_priv_buff { __le16 buf_num[HCLGE_TC_NUM]; - u8 rsv[8]; + __le16 shared_buf; + u8 rsv[6]; }; struct hclge_query_version { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a7d8fb1e15f6..e313552bb23d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1622,6 +1622,10 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) cpu_to_le16(true << HCLGE_TC0_PRI_BUF_EN_B); } + req->shared_buf = + cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) | + (1 << HCLGE_TC0_PRI_BUF_EN_B)); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { dev_err(&hdev->pdev->dev, -- cgit From d602a52540c9b92e0dd152cfe1d0848c23f08894 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:55 +0800 Subject: net: hns3: Fix for rx priv buf allocation when DCB is not supported When hdev doesn't support DCB, rx private buffer is not allocated, otherwise there is not enough buffer for rx shared buffer, causing buffer allocation process to fail. This patch fixes by checking the dcb capability in hclge_rx_buffer_calc. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e313552bb23d..c660f0caf709 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1489,6 +1489,16 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) struct hclge_priv_buf *priv; int i; + /* When DCB is not supported, rx private + * buffer is not allocated. + */ + if (!hnae3_dev_dcb_supported(hdev)) { + if (!hclge_is_rx_buf_ok(hdev, rx_all)) + return -ENOMEM; + + return 0; + } + /* step 1, try to alloc private buffer for all enabled tc */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { priv = &hdev->priv_buf[i]; -- cgit From c4726338d928c824f56c27734d837b8244132705 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:56 +0800 Subject: net: hns3: Fix typo error for feild in hclge_tm This patch fixes a typo error for feild, which should be field. Fixes: 848440544b41f ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 20 ++++++++++---------- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index c91dbf19c4b1..fe659f752237 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -280,11 +280,11 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pg_id = pg_id; - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -307,11 +307,11 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pri_id = pri_id; - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); return hclge_cmd_send(&hdev->hw, &desc, 1); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 7e67337dfaf2..85158b0d73fe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -94,10 +94,10 @@ struct hclge_bp_to_qs_map_cmd { u32 rsvd1; }; -#define hclge_tm_set_feild(dest, string, val) \ +#define hclge_tm_set_field(dest, string, val) \ hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH), val) -#define hclge_tm_get_feild(src, string) \ +#define hclge_tm_get_field(src, string) \ hnae_get_field((src), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH)) -- cgit From 68ece54efd417d415462adbaa2700cba50de3ff6 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:57 +0800 Subject: net: hns3: Fix for setting rss_size incorrectly rss_size is 1, 2, 4, 8, 16, 32, 64, 128, but acutal tc queue size can be any u16 less than 128. If tc queue size is 5, we set the rss_size to 8, indirection table will be used to limit the size of actual queue size. It may cause dropping of receiving packet in hardware if rss_size is not set correctly. For now, each TC has the same rss size. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 76 ++++++++++------------ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 + .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 1 + 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c660f0caf709..e0685e630afe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2606,6 +2606,7 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) u16 tc_valid[HCLGE_MAX_TC_NUM]; u16 tc_size[HCLGE_MAX_TC_NUM]; u32 *rss_indir = NULL; + u16 rss_size = 0, roundup_size; const u8 *key; int i, ret, j; @@ -2620,7 +2621,13 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) for (j = 0; j < hdev->num_vmdq_vport + 1; j++) { for (i = 0; i < HCLGE_RSS_IND_TBL_SIZE; i++) { vport[j].rss_indirection_tbl[i] = - i % hdev->rss_size_max; + i % vport[j].alloc_rss_size; + + /* vport 0 is for PF */ + if (j != 0) + continue; + + rss_size = vport[j].alloc_rss_size; rss_indir[i] = vport[j].rss_indirection_tbl[i]; } } @@ -2637,42 +2644,31 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) if (ret) goto err; + /* Each TC have the same queue size, and tc_size set to hardware is + * the log2 of roundup power of two of rss_size, the acutal queue + * size is limited by indirection table. + */ + if (rss_size > HCLGE_RSS_TC_SIZE_7 || rss_size == 0) { + dev_err(&hdev->pdev->dev, + "Configure rss tc size failed, invalid TC_SIZE = %d\n", + rss_size); + return -EINVAL; + } + + roundup_size = roundup_pow_of_two(rss_size); + roundup_size = ilog2(roundup_size); + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - if (hdev->hw_tc_map & BIT(i)) - tc_valid[i] = 1; - else - tc_valid[i] = 0; + tc_valid[i] = 0; - switch (hdev->rss_size_max) { - case HCLGE_RSS_TC_SIZE_0: - tc_size[i] = 0; - break; - case HCLGE_RSS_TC_SIZE_1: - tc_size[i] = 1; - break; - case HCLGE_RSS_TC_SIZE_2: - tc_size[i] = 2; - break; - case HCLGE_RSS_TC_SIZE_3: - tc_size[i] = 3; - break; - case HCLGE_RSS_TC_SIZE_4: - tc_size[i] = 4; - break; - case HCLGE_RSS_TC_SIZE_5: - tc_size[i] = 5; - break; - case HCLGE_RSS_TC_SIZE_6: - tc_size[i] = 6; - break; - case HCLGE_RSS_TC_SIZE_7: - tc_size[i] = 7; - break; - default: - break; - } - tc_offset[i] = hdev->rss_size_max * i; + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + tc_valid[i] = 1; + tc_size[i] = roundup_size; + tc_offset[i] = rss_size * i; } + ret = hclge_set_rss_tc_mode(hdev, tc_valid, tc_size, tc_offset); err: @@ -4167,12 +4163,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } - ret = hclge_rss_init_hw(hdev); - if (ret) { - dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret); - return ret; - } - ret = hclge_init_vlan_config(hdev); if (ret) { dev_err(&pdev->dev, "VLAN init fail, ret =%d\n", ret); @@ -4185,6 +4175,12 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } + ret = hclge_rss_init_hw(hdev); + if (ret) { + dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret); + return ret; + } + setup_timer(&hdev->service_timer, hclge_service_timer, (unsigned long)hdev); INIT_WORK(&hdev->service_task, hclge_service_task); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index edb10ad075eb..7f8dd129c10d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -477,6 +477,7 @@ struct hclge_vport { u8 rss_hash_key[HCLGE_RSS_KEY_SIZE]; /* User configured hash keys */ /* User configured lookup table entries */ u8 rss_indirection_tbl[HCLGE_RSS_IND_TBL_SIZE]; + u16 alloc_rss_size; u16 qs_offset; u16 bw_limit; /* VSI BW Limit (0 = disabled) */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index fe659f752237..b7ba7aa66620 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -397,6 +397,7 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) kinfo->num_tqps / kinfo->num_tc); vport->qs_offset = hdev->tm_info.num_tc * vport->vport_id; vport->dwrr = 100; /* 100 percent as init */ + vport->alloc_rss_size = kinfo->rss_size; for (i = 0; i < kinfo->num_tc; i++) { if (hdev->hw_tc_map & BIT(i)) { -- cgit From c5795c5308af81568d1573598716091120c85a38 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:58 +0800 Subject: net: hns3: Fix for pri to tc mapping in TM Current mapping between pri and tc is one to one, so user can't map multi priorities to the same tc. This patch changes the mapping to many to one. Fixes: 848440544b41f ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 ++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index ad685f5aa6d1..1a01cadfe5f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -376,12 +376,12 @@ struct hnae3_ae_algo { struct hnae3_tc_info { u16 tqp_offset; /* TQP offset from base TQP */ u16 tqp_count; /* Total TQPs */ - u8 up; /* user priority */ u8 tc; /* TC index */ bool enable; /* If this TC is enable or not */ }; #define HNAE3_MAX_TC 8 +#define HNAE3_MAX_USER_PRIO 8 struct hnae3_knic_private_info { struct net_device *netdev; /* Set by KNIC client when init instance */ u16 rss_size; /* Allocated RSS queues */ @@ -389,6 +389,7 @@ struct hnae3_knic_private_info { u16 num_desc; u8 num_tc; /* Total number of enabled TCs */ + u8 prio_tc[HNAE3_MAX_USER_PRIO]; /* TC indexed by prio */ struct hnae3_tc_info tc_info[HNAE3_MAX_TC]; /* Idx of array is HW TC */ u16 num_tqps; /* total number of TQPs in this handle */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 7f8dd129c10d..9fcfd9395424 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -176,7 +176,6 @@ struct hclge_pg_info { struct hclge_tc_info { u8 tc_id; u8 tc_sch_mode; /* 0: sp; 1: dwrr */ - u8 up; u8 pgid; u32 bw_limit; }; @@ -197,6 +196,7 @@ struct hclge_tm_info { u8 num_tc; u8 num_pg; /* It must be 1 if vNET-Base schd */ u8 pg_dwrr[HCLGE_PG_NUM]; + u8 prio_tc[HNAE3_MAX_USER_PRIO]; struct hclge_pg_info pg_info[HCLGE_PG_NUM]; struct hclge_tc_info tc_info[HNAE3_MAX_TC]; enum hclge_fc_mode fc_mode; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index b7ba7aa66620..73a75d7cc551 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -128,9 +128,7 @@ static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id) { u8 tc; - for (tc = 0; tc < hdev->tm_info.num_tc; tc++) - if (hdev->tm_info.tc_info[tc].up == pri_id) - break; + tc = hdev->tm_info.prio_tc[pri_id]; if (tc >= hdev->tm_info.num_tc) return -EINVAL; @@ -158,7 +156,7 @@ static int hclge_up_to_tc_map(struct hclge_dev *hdev) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_PRI_TO_TC_MAPPING, false); - for (pri_id = 0; pri_id < hdev->tm_info.num_tc; pri_id++) { + for (pri_id = 0; pri_id < HNAE3_MAX_USER_PRIO; pri_id++) { ret = hclge_fill_pri_array(hdev, pri, pri_id); if (ret) return ret; @@ -405,16 +403,17 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) kinfo->tc_info[i].tqp_offset = i * kinfo->rss_size; kinfo->tc_info[i].tqp_count = kinfo->rss_size; kinfo->tc_info[i].tc = i; - kinfo->tc_info[i].up = hdev->tm_info.tc_info[i].up; } else { /* Set to default queue if TC is disable */ kinfo->tc_info[i].enable = false; kinfo->tc_info[i].tqp_offset = 0; kinfo->tc_info[i].tqp_count = 1; kinfo->tc_info[i].tc = 0; - kinfo->tc_info[i].up = 0; } } + + memcpy(kinfo->prio_tc, hdev->tm_info.prio_tc, + FIELD_SIZEOF(struct hnae3_knic_private_info, prio_tc)); } static void hclge_tm_vport_info_update(struct hclge_dev *hdev) @@ -436,12 +435,15 @@ static void hclge_tm_tc_info_init(struct hclge_dev *hdev) for (i = 0; i < hdev->tm_info.num_tc; i++) { hdev->tm_info.tc_info[i].tc_id = i; hdev->tm_info.tc_info[i].tc_sch_mode = HCLGE_SCH_MODE_DWRR; - hdev->tm_info.tc_info[i].up = i; hdev->tm_info.tc_info[i].pgid = 0; hdev->tm_info.tc_info[i].bw_limit = hdev->tm_info.pg_info[0].bw_limit; } + for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) + hdev->tm_info.prio_tc[i] = + (i >= hdev->tm_info.num_tc) ? 0 : i; + hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; } -- cgit From 4d61eda812041ef9c820d7f147884133fd3307bc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Sep 2017 16:27:39 +0100 Subject: CIFS: make arrays static const, reduces object code size Don't populate the read-only arrays types[] on the stack, instead make them both static const. Makes the object code smaller by over 200 bytes: Before: text data bss dec hex filename 111503 37696 448 149647 2488f fs/cifs/file.o After: text data bss dec hex filename 111140 37856 448 149444 247c4 fs/cifs/file.o Signed-off-by: Colin Ian King Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/file.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0786f19d288f..822311919163 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1102,8 +1102,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifs_tcon *tcon; unsigned int num, max_num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; - int types[] = {LOCKING_ANDX_LARGE_FILES, - LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + static const int types[] = { + LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES + }; int i; xid = get_xid(); @@ -1434,8 +1436,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int xid) { int rc = 0, stored_rc; - int types[] = {LOCKING_ANDX_LARGE_FILES, - LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + static const int types[] = { + LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES + }; unsigned int i; unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; -- cgit From 94183331e815617246b1baa97e0916f358c794bb Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Sep 2017 16:03:27 +0800 Subject: cifs: release cifs root_cred after exit_cifs memory leak was found by kmemleak. exit_cifs_spnego should be called before cifs module removed, or cifs root_cred will not be released. kmemleak report: unreferenced object 0xffff880070a3ce40 (size 192): backtrace: kmemleak_alloc+0x4a/0xa0 kmem_cache_alloc+0xc7/0x1d0 prepare_kernel_cred+0x20/0x120 init_cifs_spnego+0x2d/0x170 [cifs] 0xffffffffc07801f3 do_one_initcall+0x51/0x1b0 do_init_module+0x60/0x1fd load_module+0x161e/0x1b60 SYSC_finit_module+0xa9/0x100 SyS_finit_module+0xe/0x10 Signed-off-by: Shu Wang Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a864e6575309..8c8b75d33f31 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1449,7 +1449,7 @@ exit_cifs(void) exit_cifs_idmap(); #endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); + exit_cifs_spnego(); #endif cifs_destroy_request_bufs(); cifs_destroy_mids(); -- cgit From f5c4ba816315d3b813af16f5571f86c8d4e897bd Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Fri, 8 Sep 2017 18:48:33 +0800 Subject: cifs: release auth_key.response for reconnect. There is a race that cause cifs reconnect in cifs_mount, - cifs_mount - cifs_get_tcp_session - [ start thread cifs_demultiplex_thread - cifs_read_from_socket: -ECONNABORTED - DELAY_WORK smb2_reconnect_server ] - cifs_setup_session - [ smb2_reconnect_server ] auth_key.response was allocated in cifs_setup_session, and will release when the session destoried. So when session re- connect, auth_key.response should be check and released. Tested with my system: CIFS VFS: Free previous auth_key.response = ffff8800320bbf80 A simple auth_key.response allocation call trace: - cifs_setup_session - SMB2_sess_setup - SMB2_sess_auth_rawntlmssp_authenticate - build_ntlmssp_auth_blob - setup_ntlmv2_rsp Signed-off-by: Shu Wang Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg --- fs/cifs/connect.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8d38b22afb2b..0bfc2280436d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4154,6 +4154,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); + if (ses->auth_key.response) { + cifs_dbg(VFS, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); -- cgit From 0603c96f3af50e2f9299fa410c224ab1d465e0f9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Sep 2017 19:57:18 -0500 Subject: SMB: Validate negotiate (to protect against downgrade) even if signing off As long as signing is supported (ie not a guest user connection) and connection is SMB3 or SMB3.02, then validate negotiate (protect against man in the middle downgrade attacks). We had been doing this only when signing was required, not when signing was just enabled, but this more closely matches recommended SMB3 behavior and is better security. Suggested by Metze. Signed-off-by: Steve French Reviewed-by: Jeremy Allison Acked-by: Stefan Metzmacher Reviewed-by: Ronnie Sahlberg CC: Stable --- fs/cifs/smb2pdu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index d499ce265c3b..6f0e6343c15e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -656,15 +656,22 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* * validation ioctl must be signed, so no point sending this if we - * can not sign it. We could eventually change this to selectively + * can not sign it (ie are not known user). Even if signing is not + * required (enabled but not negotiated), in those cases we selectively * sign just this, the first and only signed request on a connection. - * This is good enough for now since a user who wants better security - * would also enable signing on the mount. Having validation of - * negotiate info for signed connections helps reduce attack vectors + * Having validation of negotiate info helps reduce attack vectors. */ - if (tcon->ses->server->sign == false) + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) return 0; /* validation requires signing */ + if (tcon->ses->user_name == NULL) { + cifs_dbg(FYI, "Can't validate negotiate: null user mount\n"); + return 0; /* validation requires signing */ + } + + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); + vneg_inbuf.Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, -- cgit From a90bcb86ae700c12432446c4aa1819e7b8e172ec Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Tue, 29 Aug 2017 11:20:32 -0700 Subject: iov_iter: fix page_copy_sane for compound pages Issue is that if the data crosses a page boundary inside a compound page, this check will incorrectly trigger a WARN_ON. To fix this, compute the order using the head of the compound page and adjust the offset to be relative to that head. Fixes: 72e809ed81ed ("iov_iter: sanity checks for copy to/from page primitives") Signed-off-by: Petar Penkov CC: Al Viro CC: Eric Dumazet Signed-off-by: Al Viro --- lib/iov_iter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 52c8dd6d8e82..1c1c06ddc20a 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -687,8 +687,10 @@ EXPORT_SYMBOL(_copy_from_iter_full_nocache); static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { - size_t v = n + offset; - if (likely(n <= v && v <= (PAGE_SIZE << compound_order(page)))) + struct page *head = compound_head(page); + size_t v = n + offset + page_address(page) - page_address(head); + + if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) return true; WARN_ON(1); return false; -- cgit From 58aff0af757356065f33290d96a9cd46dfbcae88 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 Sep 2017 17:47:38 +0100 Subject: ipc/shm: Fix order of parameters when calling copy_compat_shmid_to_user Commit 553f770ef71b ("ipc: move compat shmctl to native") moved the compat IPC syscall handling into ipc/shm.c and refactored the struct accessors in the process. Unfortunately, the call to copy_compat_shmid_to_user when handling a compat {IPC,SHM}_STAT command gets the arguments the wrong way round, passing a kernel stack address as the user buffer (destination) and the user buffer as the kernel stack address (source). This patch fixes the parameter ordering so the buffers are accessed correctly. Cc: Al Viro Cc: Andrew Morton Signed-off-by: Will Deacon Signed-off-by: Al Viro --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 1b3adfe3c60e..1e2b1692ba2c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1237,7 +1237,7 @@ COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr) err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; - if (copy_compat_shmid_to_user(&sem64, uptr, version)) + if (copy_compat_shmid_to_user(uptr, &sem64, version)) err = -EFAULT; return err; -- cgit From 3e77adeea3c5393c9b624832f65441e92867f618 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 7 Sep 2017 16:35:40 +1000 Subject: powerpc/eeh: Create PHB PEs after EEH is initialized Otherwise we end up not yet having computed the right diag data size on powernv where EEH initialization is delayed, thus causing memory corruption later on when calling OPAL. Fixes: 5cb1f8fdddb7 ("powerpc/powernv/pci: Dynamically allocate PHB diag data") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Benjamin Herrenschmidt Acked-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 4 ++++ arch/powerpc/kernel/eeh_dev.c | 18 ------------------ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9e816787c0d4..116000b45531 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1019,6 +1019,10 @@ int eeh_init(void) } else if ((ret = eeh_ops->init())) return ret; + /* Initialize PHB PEs */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + eeh_dev_phb_init_dynamic(hose); + /* Initialize EEH event */ ret = eeh_event_init(); if (ret) diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index ad04ecd63c20..a34e6912c15e 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -78,21 +78,3 @@ void eeh_dev_phb_init_dynamic(struct pci_controller *phb) /* EEH PE for PHB */ eeh_phb_pe_create(phb); } - -/** - * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs - * - * Scan all the existing PHBs and create EEH devices for their OF - * nodes and their children OF nodes - */ -static int __init eeh_dev_phb_init(void) -{ - struct pci_controller *phb, *tmp; - - list_for_each_entry_safe(phb, tmp, &hose_list, list_node) - eeh_dev_phb_init_dynamic(phb); - - return 0; -} - -core_initcall(eeh_dev_phb_init); -- cgit From 087ff6a5ae3052bb2835e191094b793789cb8817 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Sep 2017 17:02:51 -0400 Subject: powerpc/pseries: Fix "OF: ERROR: Bad of_node_put() on /cpus" during DLPAR Commit 215ee763f8cb ("powerpc: pseries: remove dlpar_attach_node dependency on full path") reworked dlpar_attach_node() to no longer look up the parent node "/cpus", but instead to have the parent node passed by the caller in the function parameter list. As a result dlpar_attach_node() is no longer responsible for freeing the reference to the parent node. However, commit 215ee763f8cb failed to remove the of_node_put(parent) call in dlpar_attach_node(), or to take into account that the reference to the parent in the caller dlpar_cpu_add() needs to be held until after dlpar_attach_node() returns. As a result doing repeated cpu add/remove dlpar operations will eventually result in the following error: OF: ERROR: Bad of_node_put() on /cpus CPU: 0 PID: 10896 Comm: drmgr Not tainted 4.13.0-autotest #1 Call Trace: dump_stack+0x15c/0x1f8 (unreliable) of_node_release+0x1a4/0x1c0 kobject_put+0x1a8/0x310 kobject_del+0xbc/0xf0 __of_detach_node_sysfs+0x144/0x210 of_detach_node+0xf0/0x180 dlpar_detach_node+0xc4/0x120 dlpar_cpu_remove+0x280/0x560 dlpar_cpu_release+0xbc/0x1b0 arch_cpu_release+0x6c/0xb0 cpu_release_store+0xa0/0x100 dev_attr_store+0x68/0xa0 sysfs_kf_write+0xa8/0xf0 kernfs_fop_write+0x2cc/0x400 __vfs_write+0x5c/0x340 vfs_write+0x1a8/0x3d0 SyS_write+0xa8/0x1a0 system_call+0x58/0x6c Fix the issue by removing the of_node_put(parent) call from dlpar_attach_node(), and ensuring that the reference to the parent node is properly held and released by the caller dlpar_cpu_add(). Fixes: 215ee763f8cb ("powerpc: pseries: remove dlpar_attach_node dependency on full path") Signed-off-by: Tyrel Datwyler Reported-by: Abdul Haleem [mpe: Add a comment in the code and frob the change log slightly] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dlpar.c | 1 - arch/powerpc/platforms/pseries/hotplug-cpu.c | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 783f36364690..e45b5f10645a 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -266,7 +266,6 @@ int dlpar_attach_node(struct device_node *dn, struct device_node *parent) return rc; } - of_node_put(dn->parent); return 0; } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fc0d8f97c03a..fadb95efbb9e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -462,15 +462,19 @@ static ssize_t dlpar_cpu_add(u32 drc_index) } dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent); - of_node_put(parent); if (!dn) { pr_warn("Failed call to configure-connector, drc index: %x\n", drc_index); dlpar_release_drc(drc_index); + of_node_put(parent); return -EINVAL; } rc = dlpar_attach_node(dn, parent); + + /* Regardless we are done with parent now */ + of_node_put(parent); + if (rc) { saved_rc = rc; pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n", -- cgit From b537ca6fede69a281dc524983e5e633d79a10a08 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Sep 2017 17:02:52 -0400 Subject: powerpc/pseries: Fix parent_dn reference leak in add_dt_node() A reference to the parent device node is held by add_dt_node() for the node to be added. If the call to dlpar_configure_connector() fails add_dt_node() returns ENOENT and that reference is not freed. Add a call to of_node_put(parent_dn) prior to bailing out after a failed dlpar_configure_connector() call. Fixes: 8d5ff320766f ("powerpc/pseries: Make dlpar_configure_connector parent node aware") Cc: stable@vger.kernel.org # v3.12+ Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/mobility.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 210ce632d63e..f7042ad492ba 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -226,8 +226,10 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) return -ENOENT; dn = dlpar_configure_connector(drc_index, parent_dn); - if (!dn) + if (!dn) { + of_node_put(parent_dn); return -ENOENT; + } rc = dlpar_attach_node(dn, parent_dn); if (rc) -- cgit From 0551968add53777fddd18f4ffb4e3bbc1f646d79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Sep 2017 11:54:44 +0200 Subject: Revert "genirq: Restrict effective affinity to interrupts actually using it" This reverts commit 74def747bcd09692bdbf8c6a15350795b0f11ca8. The change to the helper function is only correct for the /proc/irq/ readout usage, but breaks the existing x86 usage of that function. Reported-by: Yanko Kaneti Signed-off-by: Thomas Gleixner Cc: Marc Zyngier --- include/linux/irq.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index b99a784635ff..d4728bf6a537 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -783,10 +783,7 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) static inline struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { - if (!cpumask_empty(d->common->effective_affinity)) - return d->common->effective_affinity; - - return d->common->affinity; + return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) -- cgit From 2bc84526d174a2a89c76438f049fc03ac259a159 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Sep 2017 16:44:35 -0600 Subject: Makefile: kselftest and kselftest-clean fail for make O=dir case kselftest and kselftest-clean targets fail when object directory is specified to relocate objects. Fix it so it can find the source tree to build from. make O=/tmp/kselftest_top kselftest make[1]: Entering directory '/tmp/kselftest_top' make[2]: Entering directory '/tmp/kselftest_top' make[2]: *** tools/testing/selftests: No such file or directory. Stop. make[2]: Leaving directory '/tmp/kselftest_top' ./linux-kselftest/Makefile:1185: recipe for target 'kselftest' failed make[1]: *** [kselftest] Error 2 make[1]: Leaving directory '/tmp/kselftest_top' Makefile:145: recipe for target 'sub-make' failed make: *** [sub-make] Error 2 Signed-off-by: Shuah Khan Acked-by: Masahiro Yamada --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 64cbc66cebca..9a70578922d3 100644 --- a/Makefile +++ b/Makefile @@ -1172,11 +1172,11 @@ headers_check: headers_install PHONY += kselftest kselftest: - $(Q)$(MAKE) -C tools/testing/selftests run_tests + $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests run_tests PHONY += kselftest-clean kselftest-clean: - $(Q)$(MAKE) -C tools/testing/selftests clean + $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests clean PHONY += kselftest-merge kselftest-merge: -- cgit From 8050ef2b83a18f628f9501af958fbff39443d58d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Sep 2017 18:36:22 -0600 Subject: selftests: lib.mk: kselftest and kselftest-clean fail for make O=dir case kselftest and kselftest-clean targets fail when object directory is specified to relocate objects. Main Makefile make O= path clears the built-in defines LINK.c, COMPILE.S, LINK.S, and RM that are used in lib.mk to build and clean targets. Define them. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 4665463779f5..266d3ed4bb41 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -7,6 +7,7 @@ OUTPUT := $(shell pwd) endif TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) +TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) @@ -62,6 +63,11 @@ endef emit_tests: $(EMIT_TESTS) +# define if isn't already. It is undefined in make O= case. +ifeq ($(RM),) +RM := rm -f +endif + define CLEAN $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) endef @@ -69,6 +75,15 @@ endef clean: $(CLEAN) +# When make O= with kselftest target from main level +# the following aren't defined. +# +ifneq ($(KBUILD_SRC),) +LINK.c = $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) +COMPILE.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c +LINK.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) +endif + $(OUTPUT)/%:%.c $(LINK.c) $^ $(LDLIBS) -o $@ -- cgit From 52fd1d082398b928a86d4fdf33c9f3abe1bf7914 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Sep 2017 19:57:43 -0600 Subject: selftests: Makefile: clear LDFLAGS for make O=dir use-case kselftest target fails when object directory is specified to relocate objects. Inherited "LDFLAGS = -m" fails the test builds. Clear it. Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 26ce4f7168be..f4368db011ea 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -52,6 +52,10 @@ override LDFLAGS = override MAKEFLAGS = endif +ifneq ($(KBUILD_SRC),) +override LDFLAGS = +endif + BUILD := $(O) ifndef BUILD BUILD := $(KBUILD_OUTPUT) -- cgit From e0a5696a23290c31c5ac2c76f0e7fe50a12c1fc6 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Sep 2017 20:04:26 -0600 Subject: selftests: lib.mk: fix test executable status check to use full path Fix test executable status check to use full path for make O=dir case,m when tests are relocated to user specified object directory. Without the full path, this check fails to find the file and fails the test. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 266d3ed4bb41..fd1cbbbca8d7 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -21,7 +21,7 @@ define RUN_TESTS test_num=`echo $$test_num+1 | bc`; \ echo "selftests: $$BASENAME_TEST"; \ echo "========================================"; \ - if [ ! -x $$BASENAME_TEST ]; then \ + if [ ! -x $$TEST ]; then \ echo "selftests: Warning: file $$BASENAME_TEST is not executable, correct this.";\ echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; \ else \ -- cgit From e2fb65594cae9a016fab4639a5d22a914f1e16c8 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 8 Sep 2017 16:05:50 -0600 Subject: selftests: watchdog: fix to use TEST_GEN_PROGS and remove clean TEST_PROGS should be used for test scripts that don't ned to be built. Use TEST_GEN_PROGS instead which is intended for test executables. Remove clean target and let the common clean take care of cleaning. Signed-off-by: Shuah Khan --- tools/testing/selftests/watchdog/Makefile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/testing/selftests/watchdog/Makefile b/tools/testing/selftests/watchdog/Makefile index f863c664e3d1..ee068511fd0b 100644 --- a/tools/testing/selftests/watchdog/Makefile +++ b/tools/testing/selftests/watchdog/Makefile @@ -1,8 +1,3 @@ -TEST_PROGS := watchdog-test - -all: $(TEST_PROGS) +TEST_GEN_PROGS := watchdog-test include ../lib.mk - -clean: - rm -fr $(TEST_PROGS) -- cgit From be16a244c199983656e58ddb10d80c67197e502f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 13:06:33 -0600 Subject: selftests: lib.mk: add TEST_CUSTOM_PROGS to allow custom test run/install Some tests such as sync can't use generic build rules in lib.mk and require custom rules. Currently there is no provision to allow custom builds and test such as sync use TEST_PROGS which is reserved for test shell scripts. Add TEST_CUSTOM_PROGS variable to lib.mk to run and install custom tests built by individual test make files. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index fd1cbbbca8d7..b4699c71aee4 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -6,6 +6,12 @@ ifeq (0,$(MAKELEVEL)) OUTPUT := $(shell pwd) endif +# The following are built by lib.mk common compile rules. +# TEST_CUSTOM_PROGS should be used by tests that require +# custom build rule and prevent common build rule use. +# TEST_PROGS are for test shell scripts. +# TEST_CUSTOM_PROGS and TEST_PROGS will be run by common run_tests +# and install targets. Common clean doesn't touch them. TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) @@ -31,7 +37,7 @@ define RUN_TESTS endef run_tests: all - $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_PROGS)) + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) define INSTALL_RULE @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ @@ -39,10 +45,10 @@ define INSTALL_RULE echo "rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/"; \ rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/; \ fi - @if [ "X$(TEST_GEN_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ + @if [ "X$(TEST_GEN_PROGS)$(TEST_CUSTOM_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ mkdir -p ${INSTALL_PATH}; \ - echo "rsync -a $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ - rsync -a $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ + echo "rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ + rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ fi endef @@ -54,7 +60,7 @@ else endif define EMIT_TESTS - @for TEST in $(TEST_GEN_PROGS) $(TEST_PROGS); do \ + @for TEST in $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS); do \ BASENAME_TEST=`basename $$TEST`; \ echo "(./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ done; -- cgit From 38f7251852a0cd47f34af4a6f84df0fadafa8ac7 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 13:46:44 -0600 Subject: selftests: sync: use TEST_CUSTOM_PROGS instead of TEST_PROGS lib.mk var TEST_CUSTOM_PROGS is for tests that need custom build rules. TEST_PROGS is used for test shell scripts. Fix it to use TEST_CUSTOM_PROGS. lib.mk will run and install them. Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile index 4981c6b6d050..43db80b71e80 100644 --- a/tools/testing/selftests/sync/Makefile +++ b/tools/testing/selftests/sync/Makefile @@ -2,9 +2,11 @@ CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra CFLAGS += -I../../../../usr/include/ LDFLAGS += -pthread -TEST_PROGS = sync_test +# lib.mk TEST_CUSTOM_PROGS var is for custome tests that need special +# build rules. lib.mk will run and install them. +TEST_CUSTOM_PROGS = sync_test -all: $(TEST_PROGS) +all: $(TEST_CUSTOM_PROGS) include ../lib.mk -- cgit From b2fc6ade9fe2118591e7f51e21b51f65e36f138f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 14:14:41 -0600 Subject: selftests: sync: kselftest and kselftest-clean fail for make O=dir case sync test fails to build when object directory is specified to relocate object files. Fix it to specify the correct path. Fix clean target to remove objects. Also include simplified logic to use TEST_CUSTOM_PROGS in build and clean targets instead of hard-coding the test name each time. Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/Makefile | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile index 43db80b71e80..8e04d0afcbd7 100644 --- a/tools/testing/selftests/sync/Makefile +++ b/tools/testing/selftests/sync/Makefile @@ -2,14 +2,16 @@ CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra CFLAGS += -I../../../../usr/include/ LDFLAGS += -pthread -# lib.mk TEST_CUSTOM_PROGS var is for custome tests that need special +.PHONY: all clean + +include ../lib.mk + +# lib.mk TEST_CUSTOM_PROGS var is for custom tests that need special # build rules. lib.mk will run and install them. -TEST_CUSTOM_PROGS = sync_test +TEST_CUSTOM_PROGS := $(OUTPUT)/sync_test all: $(TEST_CUSTOM_PROGS) -include ../lib.mk - OBJS = sync_test.o sync.o TESTS += sync_alloc.o @@ -20,6 +22,16 @@ TESTS += sync_stress_parallelism.o TESTS += sync_stress_consumer.o TESTS += sync_stress_merge.o -sync_test: $(OBJS) $(TESTS) +OBJS := $(patsubst %,$(OUTPUT)/%,$(OBJS)) +TESTS := $(patsubst %,$(OUTPUT)/%,$(TESTS)) + +$(TEST_CUSTOM_PROGS): $(TESTS) $(OBJS) + $(CC) -o $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) $(CFLAGS) $(LDFLAGS) + +$(OBJS): $(OUTPUT)/%.o: %.c + $(CC) -c $^ -o $@ + +$(TESTS): $(OUTPUT)/%.o: %.c + $(CC) -c $^ -o $@ -EXTRA_CLEAN := sync_test $(OBJS) $(TESTS) +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) -- cgit From 1a940687e424080a6ed285f2de8b2f0d018edf3e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 19:03:11 -0600 Subject: selftests: lib.mk: copy test scripts and test files for make O=dir run For make O=dir run_tests to work, test scripts, test files, and other dependencies need to be copied over to the object directory. Running tests from the object directory is necessary to avoid making the source tree dirty. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index b4699c71aee4..f65886af7c0c 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -37,7 +37,18 @@ define RUN_TESTS endef run_tests: all +ifneq ($(KBUILD_SRC),) + @if [ "X$(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)" != "X" ]; then + @rsync -aq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT) + fi + @if [ "X$(TEST_PROGS)" != "X" ]; then + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(OUTPUT)/$(TEST_PROGS)) + else + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS)) + fi +else $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) +endif define INSTALL_RULE @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ -- cgit From 9bbe7dc05c1f80200621a626ec2266987f4b42b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Sep 2017 17:55:24 +0200 Subject: MIPS: MSP71xx: Include asm/setup.h msp71xx_defconfig can not be built at the in v4.14-rc1 arch/mips/pmcs-msp71xx/msp_smp.c:72:2: error: implicit declaration of function 'set_vi_handler' [-Werror=implicit-function-declaration] I don't know what caused the regression, but including the right header is the obvious fix. Signed-off-by: Arnd Bergmann Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17309/ Signed-off-by: Ralf Baechle --- arch/mips/pmcs-msp71xx/msp_smp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/pmcs-msp71xx/msp_smp.c b/arch/mips/pmcs-msp71xx/msp_smp.c index ffa0f7101a97..2b08242ade62 100644 --- a/arch/mips/pmcs-msp71xx/msp_smp.c +++ b/arch/mips/pmcs-msp71xx/msp_smp.c @@ -22,6 +22,8 @@ #include #include +#include + #ifdef CONFIG_MIPS_MT_SMP #define MIPS_CPU_IPI_RESCHED_IRQ 0 /* SW int 0 for resched */ #define MIPS_CPU_IPI_CALL_IRQ 1 /* SW int 1 for call */ -- cgit From c22c8043105591a8b74142cf837604087cdba40b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 19 Sep 2017 14:11:22 +0100 Subject: MIPS: Fix input modify in __write_64bit_c0_split() The inline asm in __write_64bit_c0_split() modifies the 64-bit input operand by shifting the high register left by 32, and constructing the full 64-bit value in the low register (even on a 32-bit kernel), so if that value is used again it could cause breakage as GCC would assume the registers haven't changed when they have. To quote the GCC extended asm documentation: > Warning: Do not modify the contents of input-only operands (except for > inputs tied to outputs). The compiler assumes that on exit from the > asm statement these operands contain the same values as they had > before executing the statement. Avoid modifying the input by using a temporary variable as an output which is modified instead of the input and not otherwise used. The asm is always __volatile__ so GCC shouldn't optimise it out. The low register of the temporary output is written before the high register of the input is read, so we have two constraint alternatives, one where both use the same registers (for when the input value isn't subsequently used), and one with an early clobber on the output in case the low output uses the same register as the high input. This allows the resulting assembly to remain mostly unchanged. A diff of a MIPS32r6 kernel reveals only three differences, two in relation to write_c0_r10k_diag() in cpu_probe() (register allocation rearranged slightly but otherwise identical), and one in relation to write_c0_cvmmemctl2() in kvm_vz_local_flush_guesttlb_all(), but the octeon CPU is only supported on 64-bit kernels where __write_64bit_c0_split() isn't used so that shouldn't matter in practice. So there currently doesn't appear to be anything broken by this bug. Signed-off-by: James Hogan Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17315/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mipsregs.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index e4ed1bc9a734..a6810923b3f0 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -1377,29 +1377,32 @@ do { \ #define __write_64bit_c0_split(source, sel, val) \ do { \ + unsigned long long __tmp; \ unsigned long __flags; \ \ local_irq_save(__flags); \ if (sel == 0) \ __asm__ __volatile__( \ ".set\tmips64\n\t" \ - "dsll\t%L0, %L0, 32\n\t" \ + "dsll\t%L0, %L1, 32\n\t" \ "dsrl\t%L0, %L0, 32\n\t" \ - "dsll\t%M0, %M0, 32\n\t" \ + "dsll\t%M0, %M1, 32\n\t" \ "or\t%L0, %L0, %M0\n\t" \ "dmtc0\t%L0, " #source "\n\t" \ ".set\tmips0" \ - : : "r" (val)); \ + : "=&r,r" (__tmp) \ + : "r,0" (val)); \ else \ __asm__ __volatile__( \ ".set\tmips64\n\t" \ - "dsll\t%L0, %L0, 32\n\t" \ + "dsll\t%L0, %L1, 32\n\t" \ "dsrl\t%L0, %L0, 32\n\t" \ - "dsll\t%M0, %M0, 32\n\t" \ + "dsll\t%M0, %M1, 32\n\t" \ "or\t%L0, %L0, %M0\n\t" \ "dmtc0\t%L0, " #source ", " #sel "\n\t" \ ".set\tmips0" \ - : : "r" (val)); \ + : "=&r,r" (__tmp) \ + : "r,0" (val)); \ local_irq_restore(__flags); \ } while (0) -- cgit From 8eba3651f1dad49c83bb7f8d672301dac4c6add6 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Tue, 12 Sep 2017 20:36:28 +0200 Subject: MIPS: PCI: fix pcibios_map_irq section mismatch Drop the __init from pcibios_map_irq() to make this section mis- match go away: WARNING: vmlinux.o(.text+0x56acd4): Section mismatch in reference from the function pcibios_scanbus() to the function .init.text:pcibios_map_irq() The function pcibios_scanbus() references the function __init pcibios_map_irq(). This is often because pcibios_scanbus lacks a __init annotation or the annotation of pcibios_map_irq is wrong. Run-Tested only on Alchemy. Signed-off-by: Manuel Lauss Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17267/ Signed-off-by: Ralf Baechle --- arch/mips/pci/fixup-capcella.c | 2 +- arch/mips/pci/fixup-cobalt.c | 2 +- arch/mips/pci/fixup-emma2rh.c | 2 +- arch/mips/pci/fixup-fuloong2e.c | 2 +- arch/mips/pci/fixup-ip32.c | 2 +- arch/mips/pci/fixup-lantiq.c | 2 +- arch/mips/pci/fixup-lemote2f.c | 2 +- arch/mips/pci/fixup-loongson3.c | 2 +- arch/mips/pci/fixup-malta.c | 2 +- arch/mips/pci/fixup-mpc30x.c | 2 +- arch/mips/pci/fixup-pmcmsp.c | 2 +- arch/mips/pci/fixup-sni.c | 2 +- arch/mips/pci/fixup-tb0219.c | 2 +- arch/mips/pci/fixup-tb0226.c | 2 +- arch/mips/pci/fixup-tb0287.c | 2 +- arch/mips/pci/pci-alchemy.c | 2 +- arch/mips/pci/pci-bcm47xx.c | 2 +- arch/mips/pci/pci-lasat.c | 2 +- arch/mips/pci/pci-mt7620.c | 2 +- arch/mips/pci/pci-octeon.c | 4 ++-- arch/mips/pci/pci-rt2880.c | 2 +- arch/mips/pci/pci-rt3883.c | 2 +- arch/mips/pci/pci-xlp.c | 2 +- arch/mips/pci/pci-xlr.c | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/mips/pci/fixup-capcella.c b/arch/mips/pci/fixup-capcella.c index 1c02f5737367..35a671595a37 100644 --- a/arch/mips/pci/fixup-capcella.c +++ b/arch/mips/pci/fixup-capcella.c @@ -38,7 +38,7 @@ static char irq_tab_capcella[][5] __initdata = { [14] = { -1, INTA, INTB, INTC, INTD } }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_capcella[slot][pin]; } diff --git a/arch/mips/pci/fixup-cobalt.c b/arch/mips/pci/fixup-cobalt.c index b3ab59318d91..62810d3fe99b 100644 --- a/arch/mips/pci/fixup-cobalt.c +++ b/arch/mips/pci/fixup-cobalt.c @@ -174,7 +174,7 @@ static char irq_tab_raq2[] __initdata = { [COBALT_PCICONF_ETH1] = ETH1_IRQ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (cobalt_board_id <= COBALT_BRD_ID_QUBE1) return irq_tab_qube1[slot]; diff --git a/arch/mips/pci/fixup-emma2rh.c b/arch/mips/pci/fixup-emma2rh.c index 19caf775c206..4832ac9f118a 100644 --- a/arch/mips/pci/fixup-emma2rh.c +++ b/arch/mips/pci/fixup-emma2rh.c @@ -85,7 +85,7 @@ static void emma2rh_pci_host_fixup(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_EMMA2RH, emma2rh_pci_host_fixup); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_map[slot][pin]; } diff --git a/arch/mips/pci/fixup-fuloong2e.c b/arch/mips/pci/fixup-fuloong2e.c index 50da773faede..b47c2771dc99 100644 --- a/arch/mips/pci/fixup-fuloong2e.c +++ b/arch/mips/pci/fixup-fuloong2e.c @@ -19,7 +19,7 @@ /* South bridge slot number is set by the pci probe process */ static u8 sb_slot = 5; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = 0; diff --git a/arch/mips/pci/fixup-ip32.c b/arch/mips/pci/fixup-ip32.c index 133685e215ee..ea29f5450be3 100644 --- a/arch/mips/pci/fixup-ip32.c +++ b/arch/mips/pci/fixup-ip32.c @@ -39,7 +39,7 @@ static char irq_tab_mace[][5] __initdata = { * irqs. I suppose a device without a pin A will thank us for doing it * right if there exists such a broken piece of crap. */ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_mace[slot][pin]; } diff --git a/arch/mips/pci/fixup-lantiq.c b/arch/mips/pci/fixup-lantiq.c index 2b5427d3f35c..81530a13b349 100644 --- a/arch/mips/pci/fixup-lantiq.c +++ b/arch/mips/pci/fixup-lantiq.c @@ -23,7 +23,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/fixup-lemote2f.c b/arch/mips/pci/fixup-lemote2f.c index 95ab9a1bd010..7e5991e0e323 100644 --- a/arch/mips/pci/fixup-lemote2f.c +++ b/arch/mips/pci/fixup-lemote2f.c @@ -51,7 +51,7 @@ static char irq_tab[][5] __initdata = { {0, 0, 0, 0, 0}, /* 27: Unused */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; diff --git a/arch/mips/pci/fixup-loongson3.c b/arch/mips/pci/fixup-loongson3.c index 2b6d5e196f99..8a741c2c6685 100644 --- a/arch/mips/pci/fixup-loongson3.c +++ b/arch/mips/pci/fixup-loongson3.c @@ -32,7 +32,7 @@ static void print_fixup_info(const struct pci_dev *pdev) pdev->vendor, pdev->device, pdev->irq); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { print_fixup_info(dev); return dev->irq; diff --git a/arch/mips/pci/fixup-malta.c b/arch/mips/pci/fixup-malta.c index 40e920c653cc..1f5f25e39590 100644 --- a/arch/mips/pci/fixup-malta.c +++ b/arch/mips/pci/fixup-malta.c @@ -38,7 +38,7 @@ static char irq_tab[][5] __initdata = { {0, PCID, PCIA, PCIB, PCIC } /* 21: PCI Slot 4 */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; virq = irq_tab[slot][pin]; diff --git a/arch/mips/pci/fixup-mpc30x.c b/arch/mips/pci/fixup-mpc30x.c index 8e4f8288eca2..5da62c76e271 100644 --- a/arch/mips/pci/fixup-mpc30x.c +++ b/arch/mips/pci/fixup-mpc30x.c @@ -34,7 +34,7 @@ static const int irq_tab_mpc30x[] __initconst = { [29] = MQ200_IRQ, }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (slot == 30) return internal_func_irqs[PCI_FUNC(dev->devfn)]; diff --git a/arch/mips/pci/fixup-pmcmsp.c b/arch/mips/pci/fixup-pmcmsp.c index fab405c21c2f..f2b7b1e4395b 100644 --- a/arch/mips/pci/fixup-pmcmsp.c +++ b/arch/mips/pci/fixup-pmcmsp.c @@ -202,7 +202,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) * RETURNS: IRQ number * ****************************************************************************/ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { #if !defined(CONFIG_PMC_MSP7120_GW) && !defined(CONFIG_PMC_MSP7120_EVAL) printk(KERN_WARNING "PCI: unknown board, no PCI IRQs assigned.\n"); diff --git a/arch/mips/pci/fixup-sni.c b/arch/mips/pci/fixup-sni.c index f67ebeeb4200..309e1c562959 100644 --- a/arch/mips/pci/fixup-sni.c +++ b/arch/mips/pci/fixup-sni.c @@ -130,7 +130,7 @@ static inline int is_rm300_revd(void) return (csmsr & 0xa0) == 0x20; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (sni_brd_type) { case SNI_BRD_PCI_TOWER_CPLUS: diff --git a/arch/mips/pci/fixup-tb0219.c b/arch/mips/pci/fixup-tb0219.c index d0b0083fbd27..cc581535f257 100644 --- a/arch/mips/pci/fixup-tb0219.c +++ b/arch/mips/pci/fixup-tb0219.c @@ -23,7 +23,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0226.c b/arch/mips/pci/fixup-tb0226.c index 4196ccf3ea3d..b827b5cad5fd 100644 --- a/arch/mips/pci/fixup-tb0226.c +++ b/arch/mips/pci/fixup-tb0226.c @@ -23,7 +23,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0287.c b/arch/mips/pci/fixup-tb0287.c index 8c5039ed75d7..98f26285f2e3 100644 --- a/arch/mips/pci/fixup-tb0287.c +++ b/arch/mips/pci/fixup-tb0287.c @@ -22,7 +22,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char bus; int irq = -1; diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c index e99ca7702d8a..f15ec98de2de 100644 --- a/arch/mips/pci/pci-alchemy.c +++ b/arch/mips/pci/pci-alchemy.c @@ -522,7 +522,7 @@ static int __init alchemy_pci_init(void) arch_initcall(alchemy_pci_init); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct alchemy_pci_context *ctx = dev->sysdata; if (ctx && ctx->board_map_irq) diff --git a/arch/mips/pci/pci-bcm47xx.c b/arch/mips/pci/pci-bcm47xx.c index 76f16eaed0ad..230d7dd273e2 100644 --- a/arch/mips/pci/pci-bcm47xx.c +++ b/arch/mips/pci/pci-bcm47xx.c @@ -28,7 +28,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return 0; } diff --git a/arch/mips/pci/pci-lasat.c b/arch/mips/pci/pci-lasat.c index 40d2797d2bc4..47f4ee6bbb3b 100644 --- a/arch/mips/pci/pci-lasat.c +++ b/arch/mips/pci/pci-lasat.c @@ -61,7 +61,7 @@ arch_initcall(lasat_pci_setup); #define LASAT_IRQ_PCIC (LASAT_IRQ_BASE + 7) #define LASAT_IRQ_PCID (LASAT_IRQ_BASE + 8) -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (slot) { case 1: diff --git a/arch/mips/pci/pci-mt7620.c b/arch/mips/pci/pci-mt7620.c index 4e633c1e7ff3..90fba9bf98da 100644 --- a/arch/mips/pci/pci-mt7620.c +++ b/arch/mips/pci/pci-mt7620.c @@ -361,7 +361,7 @@ static int mt7620_pci_probe(struct platform_device *pdev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; u32 val; diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c index 9ee01936862e..771f5de6362d 100644 --- a/arch/mips/pci/pci-octeon.c +++ b/arch/mips/pci/pci-octeon.c @@ -59,7 +59,7 @@ union octeon_pci_address { } s; }; -int __initconst (*octeon_pcibios_map_irq)(const struct pci_dev *dev, +int (*octeon_pcibios_map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; @@ -74,7 +74,7 @@ enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; * as it goes through each bridge. * Returns Interrupt number for the device */ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (octeon_pcibios_map_irq) return octeon_pcibios_map_irq(dev, slot, pin); diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c index d6360fe73d05..711cdccdf65b 100644 --- a/arch/mips/pci/pci-rt2880.c +++ b/arch/mips/pci/pci-rt2880.c @@ -181,7 +181,7 @@ static inline void rt2880_pci_write_u32(unsigned long reg, u32 val) spin_unlock_irqrestore(&rt2880_pci_lock, flags); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; int irq = -1; diff --git a/arch/mips/pci/pci-rt3883.c b/arch/mips/pci/pci-rt3883.c index 04f8ea953297..958899ffe99c 100644 --- a/arch/mips/pci/pci-rt3883.c +++ b/arch/mips/pci/pci-rt3883.c @@ -564,7 +564,7 @@ err_put_intc_node: return err; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/pci-xlp.c b/arch/mips/pci/pci-xlp.c index 7babf01600cb..9eff9137f78e 100644 --- a/arch/mips/pci/pci-xlp.c +++ b/arch/mips/pci/pci-xlp.c @@ -205,7 +205,7 @@ int xlp_socdev_to_node(const struct pci_dev *lnkdev) return PCI_SLOT(lnkdev->devfn) / 8; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct pci_dev *lnkdev; int lnkfunc, node; diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c index 26d2dabef281..2a1c81a129ba 100644 --- a/arch/mips/pci/pci-xlr.c +++ b/arch/mips/pci/pci-xlr.c @@ -315,7 +315,7 @@ static void xls_pcie_ack_b(struct irq_data *d) } } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return get_irq_vector(dev); } -- cgit From 51a9a8284e43642fc3e85810fd54f4c245d23a14 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 10:03:12 +0100 Subject: x86/xen: clean up clang build warning In the case where sizeof(maddr) != sizeof(long) p is initialized and never read and clang throws a warning on this. Move declaration of p to clean up the clang build warning: warning: Value stored to 'p' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/hypercall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9606688caa4b..e089c1675a7c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,13 +552,13 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { - u32 *p = (u32 *) &desc; - mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; mcl->args[1] = *(unsigned long *)&desc; } else { + u32 *p = (u32 *)&desc; + mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; mcl->args[2] = *p++; -- cgit From c8e1812960eeae42e2183154927028511c4bc566 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:45:36 +0300 Subject: net_sched: always reset qdisc backlog in qdisc_reset() SKB stored in qdisc->gso_skb also counted into backlog. Some qdiscs don't reset backlog to zero in ->reset(), for example sfq just dequeue and free all queued skb. Signed-off-by: Konstantin Khlebnikov Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..bf8c81e07c70 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -685,6 +685,7 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); -- cgit From 21f4d5cc25ec0e6e8eb8420dd2c399e6d2fc7d14 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:46:11 +0300 Subject: net_sched/hfsc: fix curve activation in hfsc_change_class() If real-time or fair-share curves are enabled in hfsc_change_class() class isn't inserted into rb-trees yet. Thus init_ed() and init_vf() must be called in place of update_ed() and update_vf(). Remove isn't required because for now curves cannot be disabled. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index daaf214e5201..3f88b75488b0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -958,6 +958,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } if (cl != NULL) { + int old_flags; + if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) @@ -978,6 +980,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_lock(sch); + old_flags = cl->cl_flags; + if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) @@ -986,10 +990,21 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { - if (cl->cl_flags & HFSC_RSC) - update_ed(cl, qdisc_peek_len(cl->qdisc)); - if (cl->cl_flags & HFSC_FSC) - update_vf(cl, 0, cur_time); + int len = qdisc_peek_len(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) { + if (old_flags & HFSC_RSC) + update_ed(cl, len); + else + init_ed(cl, len); + } + + if (cl->cl_flags & HFSC_FSC) { + if (old_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + else + init_vf(cl, len); + } } sch_tree_unlock(sch); -- cgit From fe2502e49b58606580c77b3d84e42f946de182d8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 20 Sep 2017 09:18:45 -0700 Subject: net_sched: remove cls_flower idr on failure Fixes: c15ab236d69d ("net/sched: Change cls_flower to use IDR") Cc: Chris Mi Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a267e77c6de..d230cb4c8094 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -922,28 +922,28 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; - goto errout; + goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) - goto errout; + goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; - goto errout; + goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) - goto errout; + goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { @@ -952,7 +952,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &mask.key, fnew); if (err) - goto errout; + goto errout_idr; } if (!tc_in_hw(fnew->flags)) @@ -981,6 +981,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(tb); return 0; +errout_idr: + if (fnew->handle) + idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); -- cgit From 0ab09befdbb7ca9b969d6206108629ddff43876e Mon Sep 17 00:00:00 2001 From: Alex Ng Date: Wed, 20 Sep 2017 11:17:35 -0700 Subject: hv_netvsc: fix send buffer failure on MTU change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If MTU is changed the host would reject the send buffer change. This problem is result of recent change to allow changing send buffer size. Every time we change the MTU, we store the previous net_device section count before destroying the buffer, but we don’t store the previous section size. When we reinitialize the buffer, its size is calculated by multiplying the previous count and previous size. Since we continuously increase the MTU, the host returns us a decreasing count value while the section size is reinitialized to 1728 bytes every time. This eventually leads to a condition where the calculated buf_size is so small that the host rejects it. Fixes: 8b5327975ae1 ("netvsc: allow controlling send/recv buffer size") Signed-off-by: Alex Ng Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc.c | 7 ++----- drivers/net/hyperv/netvsc_drv.c | 8 ++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d98cdfb1536b..5176be76ca7d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -150,6 +150,8 @@ struct netvsc_device_info { u32 num_chn; u32 send_sections; u32 recv_sections; + u32 send_section_size; + u32 recv_section_size; }; enum rndis_device_state { diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index a5511b7326af..8d5077fb0492 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -76,9 +76,6 @@ static struct netvsc_device *alloc_net_device(void) net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT; net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT; - net_device->recv_section_size = NETVSC_RECV_SECTION_SIZE; - net_device->send_section_size = NETVSC_SEND_SECTION_SIZE; - init_completion(&net_device->channel_init_wait); init_waitqueue_head(&net_device->subchan_open); INIT_WORK(&net_device->subchan_work, rndis_set_subchannel); @@ -262,7 +259,7 @@ static int netvsc_init_buf(struct hv_device *device, int ret = 0; /* Get receive buffer area. */ - buf_size = device_info->recv_sections * net_device->recv_section_size; + buf_size = device_info->recv_sections * device_info->recv_section_size; buf_size = roundup(buf_size, PAGE_SIZE); net_device->recv_buf = vzalloc(buf_size); @@ -344,7 +341,7 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; /* Now setup the send buffer. */ - buf_size = device_info->send_sections * net_device->send_section_size; + buf_size = device_info->send_sections * device_info->send_section_size; buf_size = round_up(buf_size, PAGE_SIZE); net_device->send_buf = vzalloc(buf_size); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d4902ee5f260..a32ae02e1b6c 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -848,7 +848,9 @@ static int netvsc_set_channels(struct net_device *net, device_info.num_chn = count; device_info.ring_size = ring_size; device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(dev, nvdev); @@ -963,7 +965,9 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) device_info.ring_size = ring_size; device_info.num_chn = nvdev->num_chn; device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(hdev, nvdev); @@ -1485,7 +1489,9 @@ static int netvsc_set_ringparam(struct net_device *ndev, device_info.num_chn = nvdev->num_chn; device_info.ring_size = ring_size; device_info.send_sections = new_tx; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = new_rx; + device_info.recv_section_size = nvdev->recv_section_size; netif_device_detach(ndev); was_opened = rndis_filter_opened(nvdev); @@ -1934,7 +1940,9 @@ static int netvsc_probe(struct hv_device *dev, device_info.ring_size = ring_size; device_info.num_chn = VRSS_CHANNEL_DEFAULT; device_info.send_sections = NETVSC_DEFAULT_TX; + device_info.send_section_size = NETVSC_SEND_SECTION_SIZE; device_info.recv_sections = NETVSC_DEFAULT_RX; + device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE; nvdev = rndis_filter_device_add(dev, &device_info); if (IS_ERR(nvdev)) { -- cgit From 4a7a3860caac1a8779e8c459d8abe21b111798d6 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Wed, 20 Sep 2017 15:32:53 -0500 Subject: net: qcom/emac: add software control for pause frame mode The EMAC has the option of sending only a single pause frame when flow control is enabled and the RX queue is full. Although sending only one pause frame has little value, this would allow admins to enable automatic flow control without having to worry about the EMAC flooding nearby switches with pause frames if the kernel hangs. The option is enabled by using the single-pause-mode private flag. Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-ethtool.c | 30 +++++++++++++++++++++++ drivers/net/ethernet/qualcomm/emac/emac-mac.c | 22 +++++++++++++++++ drivers/net/ethernet/qualcomm/emac/emac.c | 3 +++ drivers/net/ethernet/qualcomm/emac/emac.h | 3 +++ 4 files changed, 58 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c index bbe24639aa5a..c8c6231b87f3 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c @@ -88,6 +88,8 @@ static void emac_set_msglevel(struct net_device *netdev, u32 data) static int emac_get_sset_count(struct net_device *netdev, int sset) { switch (sset) { + case ETH_SS_PRIV_FLAGS: + return 1; case ETH_SS_STATS: return EMAC_STATS_LEN; default: @@ -100,6 +102,10 @@ static void emac_get_strings(struct net_device *netdev, u32 stringset, u8 *data) unsigned int i; switch (stringset) { + case ETH_SS_PRIV_FLAGS: + strcpy(data, "single-pause-mode"); + break; + case ETH_SS_STATS: for (i = 0; i < EMAC_STATS_LEN; i++) { strlcpy(data, emac_ethtool_stat_strings[i], @@ -230,6 +236,27 @@ static int emac_get_regs_len(struct net_device *netdev) return EMAC_MAX_REG_SIZE * sizeof(u32); } +#define EMAC_PRIV_ENABLE_SINGLE_PAUSE BIT(0) + +static int emac_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct emac_adapter *adpt = netdev_priv(netdev); + + adpt->single_pause_mode = !!(flags & EMAC_PRIV_ENABLE_SINGLE_PAUSE); + + if (netif_running(netdev)) + return emac_reinit_locked(adpt); + + return 0; +} + +static u32 emac_get_priv_flags(struct net_device *netdev) +{ + struct emac_adapter *adpt = netdev_priv(netdev); + + return adpt->single_pause_mode ? EMAC_PRIV_ENABLE_SINGLE_PAUSE : 0; +} + static const struct ethtool_ops emac_ethtool_ops = { .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, @@ -253,6 +280,9 @@ static const struct ethtool_ops emac_ethtool_ops = { .get_regs_len = emac_get_regs_len, .get_regs = emac_get_regs, + + .set_priv_flags = emac_set_priv_flags, + .get_priv_flags = emac_get_priv_flags, }; void emac_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index bcd4708b3745..0ea3ca09c689 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -551,6 +551,28 @@ static void emac_mac_start(struct emac_adapter *adpt) mac &= ~(HUGEN | VLAN_STRIP | TPAUSE | SIMR | HUGE | MULTI_ALL | DEBUG_MODE | SINGLE_PAUSE_MODE); + /* Enable single-pause-frame mode if requested. + * + * If enabled, the EMAC will send a single pause frame when the RX + * queue is full. This normally leads to packet loss because + * the pause frame disables the remote MAC only for 33ms (the quanta), + * and then the remote MAC continues sending packets even though + * the RX queue is still full. + * + * If disabled, the EMAC sends a pause frame every 31ms until the RX + * queue is no longer full. Normally, this is the preferred + * method of operation. However, when the system is hung (e.g. + * cores are halted), the EMAC interrupt handler is never called + * and so the RX queue fills up quickly and stays full. The resuling + * non-stop "flood" of pause frames sometimes has the effect of + * disabling nearby switches. In some cases, other nearby switches + * are also affected, shutting down the entire network. + * + * The user can enable or disable single-pause-frame mode + * via ethtool. + */ + mac |= adpt->single_pause_mode ? SINGLE_PAUSE_MODE : 0; + writel_relaxed(csr1, adpt->csr + EMAC_EMAC_WRAPPER_CSR1); writel_relaxed(mac, adpt->base + EMAC_MAC_CTRL); diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 60850bfa3d32..759543512117 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -443,6 +443,9 @@ static void emac_init_adapter(struct emac_adapter *adpt) /* default to automatic flow control */ adpt->automatic = true; + + /* Disable single-pause-frame mode by default */ + adpt->single_pause_mode = false; } /* Get the clock */ diff --git a/drivers/net/ethernet/qualcomm/emac/emac.h b/drivers/net/ethernet/qualcomm/emac/emac.h index 8ee4ec6aef2e..d7c9f44209d4 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.h +++ b/drivers/net/ethernet/qualcomm/emac/emac.h @@ -363,6 +363,9 @@ struct emac_adapter { bool tx_flow_control; bool rx_flow_control; + /* True == use single-pause-frame mode. */ + bool single_pause_mode; + /* Ring parameter */ u8 tpd_burst; u8 rfd_burst; -- cgit From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:13 -0700 Subject: net: ethtool: Add back transceiver type Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") deprecated the ethtool_cmd::transceiver field, which was fine in premise, except that the PHY library was actually using it to report the type of transceiver: internal or external. Use the first word of the reserved field to put this __u8 transceiver field back in. It is made read-only, and we don't expect the ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this is mostly for the legacy path where we do: ethtool_get_settings() -> dev->ethtool_ops->get_link_ksettings() -> convert_link_ksettings_to_legacy_settings() to have no information loss compared to the legacy get_settings API. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++++- net/core/ethtool.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c041dae8e2c..5bd1b1de4ea0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1753,6 +1753,8 @@ enum ethtool_reset_flags { * %ethtool_link_mode_bit_indices for the link modes, and other * link features that the link partner advertised through * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. * * If autonegotiation is disabled, the speed and @duplex represent the * fixed link mode and are writable if the driver supports multiple @@ -1804,7 +1806,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix; __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; - __u32 reserved[8]; + __u8 transceiver; + __u8 reserved1[3]; + __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } -- cgit From ceb628134a75564d7bfa8e4ef902e6e588339e11 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:14 -0700 Subject: net: phy: Keep reporting transceiver type With commit 2d55173e71b0 ("phy: add generic function to support ksetting support"), we lost the ability to report the transceiver type like we used to. Now that we have added back the transceiver type to ethtool_link_settings, we can report it back like we used to and have no loss of information. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Fixes: 2d55173e71b0 ("phy: add generic function to support ksetting support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e842d2cd1ee7..2b1e67bc1e73 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -373,7 +373,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.port = PORT_BNC; else cmd->base.port = PORT_MII; - + cmd->base.transceiver = phy_is_internal(phydev) ? + XCVR_INTERNAL : XCVR_EXTERNAL; cmd->base.phy_address = phydev->mdio.addr; cmd->base.autoneg = phydev->autoneg; cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl; -- cgit From 8a7ffeb795f864dd605b579c05934cba95dc8ad3 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:36 +0530 Subject: lan78xx: Fix for eeprom read/write when device auto suspend Fix for eeprom read/write when device auto suspend Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index b99a7fb09f8e..fcf85ae37435 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1265,30 +1265,46 @@ static int lan78xx_ethtool_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, u8 *data) { struct lan78xx_net *dev = netdev_priv(netdev); + int ret; + + ret = usb_autopm_get_interface(dev->intf); + if (ret) + return ret; ee->magic = LAN78XX_EEPROM_MAGIC; - return lan78xx_read_raw_eeprom(dev, ee->offset, ee->len, data); + ret = lan78xx_read_raw_eeprom(dev, ee->offset, ee->len, data); + + usb_autopm_put_interface(dev->intf); + + return ret; } static int lan78xx_ethtool_set_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, u8 *data) { struct lan78xx_net *dev = netdev_priv(netdev); + int ret; + + ret = usb_autopm_get_interface(dev->intf); + if (ret) + return ret; /* Allow entire eeprom update only */ if ((ee->magic == LAN78XX_EEPROM_MAGIC) && (ee->offset == 0) && (ee->len == 512) && (data[0] == EEPROM_INDICATOR)) - return lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); + ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); else if ((ee->magic == LAN78XX_OTP_MAGIC) && (ee->offset == 0) && (ee->len == 512) && (data[0] == OTP_INDICATOR_1)) - return lan78xx_write_raw_otp(dev, ee->offset, ee->len, data); + ret = lan78xx_write_raw_otp(dev, ee->offset, ee->len, data); - return -EINVAL; + usb_autopm_put_interface(dev->intf); + + return ret; } static void lan78xx_get_strings(struct net_device *netdev, u32 stringset, -- cgit From c077682282dd34c75e8477c22dffe7c9aebc6e98 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:37 +0530 Subject: lan78xx: Allow EEPROM write for less than MAX_EEPROM_SIZE Allow EEPROM write for less than MAX_EEPROM_SIZE Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index fcf85ae37435..f8c63eec8353 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1290,11 +1290,10 @@ static int lan78xx_ethtool_set_eeprom(struct net_device *netdev, if (ret) return ret; - /* Allow entire eeprom update only */ - if ((ee->magic == LAN78XX_EEPROM_MAGIC) && - (ee->offset == 0) && - (ee->len == 512) && - (data[0] == EEPROM_INDICATOR)) + /* Invalid EEPROM_INDICATOR at offset zero will result in a failure + * to load data from EEPROM + */ + if (ee->magic == LAN78XX_EEPROM_MAGIC) ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); else if ((ee->magic == LAN78XX_OTP_MAGIC) && (ee->offset == 0) && -- cgit From e365280521029c9366bab038915274ddaa1b7195 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:38 +0530 Subject: lan78xx: Use default values loaded from EEPROM/OTP after reset Use default value of auto duplex and auto speed values loaded from EEPROM/OTP after reset. The LAN78xx allows platform configurations to be loaded from EEPROM/OTP. Ex: When external phy is connected, the MAC can be configured to have correct auto speed, auto duplex, auto polarity configured from the EEPROM/OTP. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index f8c63eec8353..0161f77641fa 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2449,7 +2449,6 @@ static int lan78xx_reset(struct lan78xx_net *dev) /* LAN7801 only has RGMII mode */ if (dev->chipid == ID_REV_CHIP_ID_7801_) buf &= ~MAC_CR_GMII_EN_; - buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_; ret = lan78xx_write_reg(dev, MAC_CR, buf); ret = lan78xx_read_reg(dev, MAC_TX, &buf); -- cgit From f0ef1f4f2b772c0a1c8b35a6ae3edf974cc110dd Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 08:24:27 +0200 Subject: net: stmmac: Cocci spatch "of_table" Make sure (of/i2c/platform)_device_id tables are NULL terminated. Found by coccinelle spatch "misc/of_table.cocci" Signed-off-by: Thomas Meyer Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index a366b3747eeb..8a280b48e3a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -315,6 +315,7 @@ static int stmmac_dt_phy(struct plat_stmmacenet_data *plat, { .compatible = "allwinner,sun8i-h3-emac" }, { .compatible = "allwinner,sun8i-v3s-emac" }, { .compatible = "allwinner,sun50i-a64-emac" }, + {}, }; /* If phy-handle property is passed from DT, use it as the PHY */ -- cgit From 09579ac803a3638344b8544b5940793d5358673e Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:26 +0200 Subject: net/smc: add missing dev_put In the infiniband part, SMC currently uses get_netdev which calls dev_hold on the returned net device. However, the SMC code never calls dev_put on that net device resulting in a wrong reference count. This patch adds a dev_put after the usage of the net device to fix the issue. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 547e0e113b17..0b5852299158 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); if (ndev) { memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + dev_put(ndev); } else if (!rc) { memcpy(&smcibdev->mac[ibport - 1][0], &smcibdev->gid[ibport - 1].raw[8], 3); -- cgit From 846e344eb7229018457d6d6fc1ab0cc0a167692f Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:27 +0200 Subject: net/smc: add receive timeout check The SMC receive function currently lacks a timeout check under the condition that no data were received and no data are available. This patch adds such a check. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_rx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b17a333e9bb0..3e631ae4b6b6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -148,6 +148,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, read_done = sock_intr_errno(timeo); break; } + if (!timeo) + return -EAGAIN; } if (!atomic_read(&conn->bytes_to_rcv)) { -- cgit From 731b008560e6dfaf5fb297543f17bbe9bb868f3c Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:28 +0200 Subject: net/smc: take RCU read lock for routing cache lookup smc_netinfo_by_tcpsk() looks up the routing cache. Such a lookup requires protection by an RCU read lock. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..2e8d2dabac0c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, u8 *prefix_len) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; int len; @@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); /* analyze IPv4 specific data of net_device belonging to TCP socket */ - for_ifa(dst->dev->ip_ptr) { - if (ifa->ifa_address != addr.sin_addr.s_addr) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) continue; *prefix_len = inet_mask_len(ifa->ifa_mask); *subnet = ifa->ifa_address & ifa->ifa_mask; rc = 0; break; - } endfor_ifa(dst->dev->ip_ptr); + } endfor_ifa(in_dev); + rcu_read_unlock(); out_rel: dst_release(dst); -- cgit From a6832c3acdb2ceb099ec3c385777fbaa6d5a5fd6 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:29 +0200 Subject: net/smc: adjust net_device refcount smc_pnet_fill_entry() uses dev_get_by_name() adding a refcount to ndev. The following smc_pnet_enter() has to reduce the refcount if the entry to be added exists already in the pnet table. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 78f7af28ae4f..31f8453c25c5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -181,8 +181,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) sizeof(new_pnetelem->ndev->name)) || smc_pnet_same_ibname(pnetelem, new_pnetelem->smcibdev->ibdev->name, - new_pnetelem->ib_port)) + new_pnetelem->ib_port)) { + dev_put(pnetelem->ndev); goto found; + } } list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); rc = 0; -- cgit From 8301fa44b41b1ca46a0547809eb173112559dc51 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:30 +0200 Subject: net/smc: adapt send request completion notification The solicited flag is meaningful for the receive completion queue. Ask for next work completion of any type on the send queue. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab56bda66783..525d91e0d57e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -244,7 +244,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) int rc; ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); -- cgit From 5bc11ddbdf7fc6681db5c3f9a92cdee0f19cee1e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:31 +0200 Subject: net/smc: longer delay for client link group removal Client link group creation always follows the server linkgroup creation. If peer creates a new server link group, client has to create a new client link group. If peer reuses a server link group for a new connection, client has to reuse its client link group as well. This patch introduces a longer delay for client link group removal to make sure this link group still exists, once the peer decides to reuse a server link group. This avoids out-of-sync conditions for link groups. If already scheduled, modify the delay. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1a16d51e2330..20b66e79c5d6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,8 +25,9 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_LGR_NUM_INCR 256 -#define SMC_LGR_FREE_DELAY (600 * HZ) +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ @@ -107,8 +108,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (reduced && !lgr->conns_num) - schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); + if (!reduced || lgr->conns_num) + return; + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) -- cgit From bfbedfd38378c1ad9df84469403d69c17b074b66 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:32 +0200 Subject: net/smc: terminate link group if out-of-sync is received An out-of-sync condition can just be detected by the client. If the server receives a CLC DECLINE message indicating an out-of-sync condition for the link groups, the server must clean up the out-of-sync link group. There is no need for an extra third parameter in smc_clc_send_decline(). Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_clc.c | 10 +++++----- net/smc/smc_clc.h | 3 +-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2e8d2dabac0c..745f145d4c4d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -513,7 +513,7 @@ decline_rdma: /* RDMA setup failed, switch back to TCP */ smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code, 0); + rc = smc_clc_send_decline(smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } @@ -808,8 +808,6 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ goto decline_rdma; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -903,7 +901,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code, 0); + rc = smc_clc_send_decline(new_smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3934913ab835..b7dd2743fb5c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -95,9 +95,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; - if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) - == SMC_CLC_DECL_SYNCERR) + if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = true; + smc_lgr_terminate(smc->conn.lgr); + } } out: @@ -105,8 +106,7 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) { struct smc_clc_msg_decline dclc; struct msghdr msg; @@ -118,7 +118,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = out_of_sync ? 1 : 0; + dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 13db8ce177c9..1c55414041d4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -106,8 +106,7 @@ struct smc_ib_device; int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport); int smc_clc_send_confirm(struct smc_sock *smc); -- cgit From 18e537cd58e8d6932719bfa79cb96a1fbc639199 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:33 +0200 Subject: net/smc: introduce a delay The number of outstanding work requests is limited. If all work requests are in use, tx processing is postponed to another scheduling of the tx worker. Switch to a delayed worker to have a gap for tx completion queue events before the next retry. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 2 +- net/smc/smc_close.c | 12 +++++++----- net/smc/smc_tx.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 6e44313e4467..0ccd6fa387ad 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -149,7 +149,7 @@ struct smc_connection { atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ - struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 3c2e166b5d22..5201bc103bd8 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -208,7 +208,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ @@ -234,7 +234,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_err != ECONNABORTED) { /* confirm close from peer */ @@ -263,7 +263,9 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - cancel_work_sync(&conn->tx_work); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; smc_close_wait_tx_pends(smc); @@ -425,7 +427,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* send close wr request */ rc = smc_close_wr(conn); @@ -439,7 +441,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* confirm close from peer */ rc = smc_close_wr(conn); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..3866573288dd 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,8 @@ #include "smc_cdc.h" #include "smc_tx.h" +#define SMC_TX_WORK_DELAY HZ + /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() @@ -406,7 +408,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -430,7 +433,7 @@ out_unlock: */ static void smc_tx_work(struct work_struct *work) { - struct smc_connection *conn = container_of(work, + struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -468,7 +471,8 @@ void smc_tx_consumer_update(struct smc_connection *conn) if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); return; } smc_curs_write(&conn->rx_curs_confirmed, @@ -487,6 +491,6 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); spin_lock_init(&smc->conn.send_lock); } -- cgit From 8c96feeeb39ba0b89c6121f71e8f7aa2a4d46671 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:34 +0200 Subject: net/smc: no close wait in case of process shut down Usually socket closing is delayed if there is still data available in the send buffer to be transmitted. If a process is killed, the delay should be avoided. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_close.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 5201bc103bd8..f0d16fb825f7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -174,15 +174,15 @@ int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -413,13 +413,14 @@ void smc_close_sock_put_work(struct work_struct *work) int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; -- cgit From e8b95728f724797f958912fd9b765a695595d3a6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 1 Sep 2017 17:13:43 -0700 Subject: Input: uinput - avoid FF flush when destroying device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normally, when input device supporting force feedback effects is being destroyed, we try to "flush" currently playing effects, so that the physical device does not continue vibrating (or executing other effects). Unfortunately this does not work well for uinput as flushing of the effects deadlocks with the destroy action: - if device is being destroyed because the file descriptor is being closed, then there is noone to even service FF requests; - if device is being destroyed because userspace sent UI_DEV_DESTROY, while theoretically it could be possible to service FF requests, userspace is unlikely to do so (they'd need to make sure FF handling happens on a separate thread) even if kernel solves the issue with FF ioctls deadlocking with UI_DEV_DESTROY ioctl on udev->mutex. To avoid lockups like the one below, let's install a custom input device flush handler, and avoid trying to flush force feedback effects when we destroying the device, and instead rely on uinput to shut off the device properly. NMI watchdog: Watchdog detected hard LOCKUP on cpu 3 ... <> [] _raw_spin_lock_irqsave+0x37/0x40 [] complete+0x1d/0x50 [] uinput_request_done+0x3c/0x40 [uinput] [] uinput_request_submit.part.7+0x47/0xb0 [uinput] [] uinput_dev_erase_effect+0x5b/0x76 [uinput] [] erase_effect+0xad/0xf0 [] flush_effects+0x4d/0x90 [] input_flush_device+0x40/0x60 [] evdev_cleanup+0xac/0xc0 [] evdev_disconnect+0x2b/0x60 [] __input_unregister_device+0xac/0x150 [] input_unregister_device+0x47/0x70 [] uinput_destroy_device+0xb5/0xc0 [uinput] [] uinput_ioctl_handler.isra.9+0x65e/0x740 [uinput] [] ? do_futex+0x12b/0xad0 [] uinput_ioctl+0x18/0x20 [uinput] [] do_vfs_ioctl+0x298/0x480 [] ? security_file_ioctl+0x43/0x60 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x71 Reported-by: Rodrigo Rivas Costa Reported-by: Clément VUCHENER Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=193741 Signed-off-by: Dmitry Torokhov --- drivers/input/ff-core.c | 13 ++++++++++--- drivers/input/misc/uinput.c | 18 ++++++++++++++++++ include/linux/input.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/input/ff-core.c b/drivers/input/ff-core.c index 8f2042432c85..66a46c84e28f 100644 --- a/drivers/input/ff-core.c +++ b/drivers/input/ff-core.c @@ -237,9 +237,15 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) EXPORT_SYMBOL_GPL(input_ff_erase); /* - * flush_effects - erase all effects owned by a file handle + * input_ff_flush - erase all effects owned by a file handle + * @dev: input device to erase effect from + * @file: purported owner of the effects + * + * This function erases all force-feedback effects associated with + * the given owner from specified device. Note that @file may be %NULL, + * in which case all effects will be erased. */ -static int flush_effects(struct input_dev *dev, struct file *file) +int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; @@ -255,6 +261,7 @@ static int flush_effects(struct input_dev *dev, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events @@ -343,7 +350,7 @@ int input_ff_create(struct input_dev *dev, unsigned int max_effects) mutex_init(&ff->mutex); dev->ff = ff; - dev->flush = flush_effects; + dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 022be0e22eba..2cff40be8860 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -230,6 +230,18 @@ static int uinput_dev_erase_effect(struct input_dev *dev, int effect_id) return uinput_request_submit(udev, &request); } +static int uinput_dev_flush(struct input_dev *dev, struct file *file) +{ + /* + * If we are called with file == NULL that means we are tearing + * down the device, and therefore we can not handle FF erase + * requests: either we are handling UI_DEV_DESTROY (and holding + * the udev->mutex), or the file descriptor is closed and there is + * nobody on the other side anymore. + */ + return file ? input_ff_flush(dev, file) : 0; +} + static void uinput_destroy_device(struct uinput_device *udev) { const char *name, *phys; @@ -297,6 +309,12 @@ static int uinput_create_device(struct uinput_device *udev) dev->ff->playback = uinput_dev_playback; dev->ff->set_gain = uinput_dev_set_gain; dev->ff->set_autocenter = uinput_dev_set_autocenter; + /* + * The standard input_ff_flush() implementation does + * not quite work for uinput as we can't reasonably + * handle FF requests during device teardown. + */ + dev->flush = uinput_dev_flush; } error = input_register_device(udev->dev); diff --git a/include/linux/input.h b/include/linux/input.h index a65e3b24fb18..fb5e23c7ed98 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -529,6 +529,7 @@ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file); int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file); +int input_ff_flush(struct input_dev *dev, struct file *file); int input_ff_create_memless(struct input_dev *dev, void *data, int (*play_effect)(struct input_dev *, void *, struct ff_effect *)); -- cgit From 6b4877c7bdc6ae39ce03716df7caeecf204697eb Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 6 Sep 2017 16:22:59 -0700 Subject: Input: uinput - avoid crash when sending FF request to device going away If FF request comes in while uinput device is going away, uinput_request_send() will fail with -ENODEV, and uinput_request_submit() will attempt to mark the slot as unused by calling uinput_request_done(). Unfortunately in this case we haven't initialized request->done completion yet, and we get a crash: [ 39.402036] BUG: spinlock bad magic on CPU#1, fftest/3108 [ 39.402046] lock: 0xffff88006a93bb00, .magic: 00000000, .owner: /39, .owner_cpu: 1217155072 [ 39.402055] CPU: 1 PID: 3108 Comm: fftest Tainted: G W 4.13.0+ #15 [ 39.402059] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET37W (1.22 ) 07/04/2017 [ 39.402064] 0000000000000086 f0fad82f3ceaa120 ffff88006a93b9a0 ffffffff9de941bb [ 39.402077] ffff88026df8ae00 ffff88006a93bb00 ffff88006a93b9c0 ffffffff9dca62b7 [ 39.402088] ffff88006a93bb00 ffff88006a93baf8 ffff88006a93b9e0 ffffffff9dca62e7 [ 39.402099] Call Trace: [ 39.402112] [] dump_stack+0x4d/0x63 [ 39.402123] [] spin_dump+0x97/0x9c [ 39.402130] [] spin_bug+0x2b/0x2d [ 39.402138] [] do_raw_spin_lock+0x28/0xfd [ 39.402147] [] _raw_spin_lock_irqsave+0x19/0x1f [ 39.402154] [] complete+0x1d/0x48 [ 39.402162] [] 0xffffffffc04f30af [ 39.402167] [] 0xffffffffc04f468c [ 39.402177] [] ? __slab_free+0x22f/0x359 [ 39.402184] [] ? tk_clock_read+0xc/0xe [ 39.402189] [] 0xffffffffc04f471f [ 39.402195] [] ? __wake_up+0x44/0x4b [ 39.402200] [] ? 0xffffffffc04f3240 [ 39.402207] [] erase_effect+0xa1/0xd2 [ 39.402214] [] input_ff_flush+0x43/0x5c [ 39.402219] [] 0xffffffffc04f32ad [ 39.402227] [] input_flush_device+0x3d/0x51 [ 39.402234] [] evdev_flush+0x49/0x5c [ 39.402243] [] filp_close+0x3f/0x65 [ 39.402253] [] put_files_struct+0x66/0xc1 [ 39.402261] [] exit_files+0x47/0x4e [ 39.402270] [] do_exit+0x483/0x969 [ 39.402278] [] ? recalc_sigpending_tsk+0x3d/0x44 [ 39.402285] [] do_group_exit+0x42/0xb0 [ 39.402293] [] get_signal+0x58d/0x5bf [ 39.402300] [] do_signal+0x37/0x53e [ 39.402307] [] ? evdev_ioctl_handler+0xac8/0xb04 [ 39.402314] [] ? evdev_ioctl+0x10/0x12 [ 39.402321] [] ? do_vfs_ioctl+0x42e/0x501 [ 39.402328] [] prepare_exit_to_usermode+0x66/0x90 [ 39.402333] [] syscall_return_slowpath+0xe3/0xec [ 39.402339] [] int_ret_from_sys_call+0x25/0x8f While we could solve this by simply initializing the completion earlier, we are better off rearranging the code a bit so we avoid calling complete() on requests that we did not send out. This patch consolidates marking request slots as free in one place (in uinput_request_submit(), the same place where we acquire them) and having everyone else simply signal completion of the requests. Fixes: 00ce756ce53a ("Input: uinput - mark failed submission requests as free") Signed-off-by: Dmitry Torokhov --- drivers/input/misc/uinput.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 2cff40be8860..443151de90c6 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -98,14 +98,15 @@ static int uinput_request_reserve_slot(struct uinput_device *udev, uinput_request_alloc_id(udev, request)); } -static void uinput_request_done(struct uinput_device *udev, - struct uinput_request *request) +static void uinput_request_release_slot(struct uinput_device *udev, + unsigned int id) { /* Mark slot as available */ - udev->requests[request->id] = NULL; - wake_up(&udev->requests_waitq); + spin_lock(&udev->requests_lock); + udev->requests[id] = NULL; + spin_unlock(&udev->requests_lock); - complete(&request->done); + wake_up(&udev->requests_waitq); } static int uinput_request_send(struct uinput_device *udev, @@ -138,20 +139,22 @@ static int uinput_request_send(struct uinput_device *udev, static int uinput_request_submit(struct uinput_device *udev, struct uinput_request *request) { - int error; + int retval; - error = uinput_request_reserve_slot(udev, request); - if (error) - return error; + retval = uinput_request_reserve_slot(udev, request); + if (retval) + return retval; - error = uinput_request_send(udev, request); - if (error) { - uinput_request_done(udev, request); - return error; - } + retval = uinput_request_send(udev, request); + if (retval) + goto out; wait_for_completion(&request->done); - return request->retval; + retval = request->retval; + + out: + uinput_request_release_slot(udev, request->id); + return retval; } /* @@ -169,7 +172,7 @@ static void uinput_flush_requests(struct uinput_device *udev) request = udev->requests[i]; if (request) { request->retval = -ENODEV; - uinput_request_done(udev, request); + complete(&request->done); } } @@ -957,7 +960,7 @@ static long uinput_ioctl_handler(struct file *file, unsigned int cmd, } req->retval = ff_up.retval; - uinput_request_done(udev, req); + complete(&req->done); goto out; case UI_END_FF_ERASE: @@ -973,7 +976,7 @@ static long uinput_ioctl_handler(struct file *file, unsigned int cmd, } req->retval = ff_erase.retval; - uinput_request_done(udev, req); + complete(&req->done); goto out; } -- cgit From 127b8e2692b6c145955bb71e482ca1bc6ce10027 Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Tue, 19 Sep 2017 13:44:06 +0200 Subject: dt-bindings: clk: stm32h7: fix clock-cell size The clock-cell size is 1 on stm32h7 plaform. Signed-off-by: Gabriel Fernandez Fixes: 3e4d618b0722 ("clk: stm32h7: Add stm32h743 clock driver") Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt b/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt index a135504c7d57..cac24ee10b72 100644 --- a/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt +++ b/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt @@ -32,7 +32,7 @@ Example: compatible = "st,stm32h743-rcc", "st,stm32-rcc"; reg = <0x58024400 0x400>; #reset-cells = <1>; - #clock-cells = <2>; + #clock-cells = <1>; clocks = <&clk_hse>, <&clk_lse>, <&clk_i2s_ckin>; st,syscfg = <&pwrcfg>; -- cgit From 059fbe8b5171960bd5c2371bb327ac18733773de Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 21 Sep 2017 13:27:02 +0200 Subject: net: phy: Fix truncation of large IRQ numbers in phy_attached_print() Given NR_IRQS is 2048 on sparc64, and even 32784 on alpha, 3 digits is not enough to represent interrupt numbers on all architectures. Hence PHY interrupt numbers may be truncated during printing. Increase the buffer size from 4 to 8 bytes to fix this. Fixes: 5e369aefdce4818c ("net: stmmac: Delete dead code for MDIO registration") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8cf0c5901f95..67f25ac29025 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -879,7 +879,7 @@ void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) { const char *drv_name = phydev->drv ? phydev->drv->name : "unbound"; char *irq_str; - char irq_num[4]; + char irq_num[8]; switch(phydev->irq) { case PHY_POLL: -- cgit From 222d7dbd258dad4cd5241c43ef818141fad5a87a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2017 09:15:46 -0700 Subject: net: prevent dst uses after free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In linux-4.13, Wei worked hard to convert dst to a traditional refcounted model, removing GC. We now want to make sure a dst refcount can not transition from 0 back to 1. The problem here is that input path attached a not refcounted dst to an skb. Then later, because packet is forwarded and hits skb_dst_force() before exiting RCU section, we might try to take a refcount on one dst that is about to be freed, if another cpu saw 1 -> 0 transition in dst_release() and queued the dst for freeing after one RCU grace period. Lets unify skb_dst_force() and skb_dst_force_safe(), since we should always perform the complete check against dst refcount, and not assume it is not zero. Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005 [ 989.919496] skb_dst_force+0x32/0x34 [ 989.919498] __dev_queue_xmit+0x1ad/0x482 [ 989.919501] ? eth_header+0x28/0xc6 [ 989.919502] dev_queue_xmit+0xb/0xd [ 989.919504] neigh_connected_output+0x9b/0xb4 [ 989.919507] ip_finish_output2+0x234/0x294 [ 989.919509] ? ipt_do_table+0x369/0x388 [ 989.919510] ip_finish_output+0x12c/0x13f [ 989.919512] ip_output+0x53/0x87 [ 989.919513] ip_forward_finish+0x53/0x5a [ 989.919515] ip_forward+0x2cb/0x3e6 [ 989.919516] ? pskb_trim_rcsum.part.9+0x4b/0x4b [ 989.919518] ip_rcv_finish+0x2e2/0x321 [ 989.919519] ip_rcv+0x26f/0x2eb [ 989.919522] ? vlan_do_receive+0x4f/0x289 [ 989.919523] __netif_receive_skb_core+0x467/0x50b [ 989.919526] ? tcp_gro_receive+0x239/0x239 [ 989.919529] ? inet_gro_receive+0x226/0x238 [ 989.919530] __netif_receive_skb+0x4d/0x5f [ 989.919532] netif_receive_skb_internal+0x5c/0xaf [ 989.919533] napi_gro_receive+0x45/0x81 [ 989.919536] ixgbe_poll+0xc8a/0xf09 [ 989.919539] ? kmem_cache_free_bulk+0x1b6/0x1f7 [ 989.919540] net_rx_action+0xf4/0x266 [ 989.919543] __do_softirq+0xa8/0x19d [ 989.919545] irq_exit+0x5d/0x6b [ 989.919546] do_IRQ+0x9c/0xb5 [ 989.919548] common_interrupt+0x93/0x93 [ 989.919548] Similarly dst_clone() can use dst_hold() helper to have additional debugging, as a follow up to commit 44ebe79149ff ("net: add debug atomic_inc_not_zero() in dst_hold()") In net-next we will convert dst atomic_t to refcount_t for peace of mind. Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag") Signed-off-by: Eric Dumazet Cc: Wei Wang Reported-by: Paweł Staszewski Bisected-by: Paweł Staszewski Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 22 ++++------------------ include/net/route.h | 2 +- include/net/sock.h | 2 +- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 93568bd0a352..06a6765da074 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) - atomic_inc(&dst->__refcnt); + dst_hold(dst); return dst; } @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb __skb_dst_copy(nskb, oskb->_skb_refdst); } -/** - * skb_dst_force - makes sure skb dst is refcounted - * @skb: buffer - * - * If dst is not yet refcounted, let's do it - */ -static inline void skb_dst_force(struct sk_buff *skb) -{ - if (skb_dst_is_noref(skb)) { - WARN_ON(!rcu_read_lock_held()); - skb->_skb_refdst &= ~SKB_DST_NOREF; - dst_clone(skb_dst(skb)); - } -} - /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry @@ -339,16 +324,17 @@ static inline bool dst_hold_safe(struct dst_entry *dst) } /** - * skb_dst_force_safe - makes sure skb dst is refcounted + * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. */ -static inline void skb_dst_force_safe(struct sk_buff *skb) +static inline void skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); + WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; diff --git a/include/net/route.h b/include/net/route.h index 1b09a9368c68..57dfc6850d37 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -190,7 +190,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) { - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!skb_dst(skb)) err = -EINVAL; } diff --git a/include/net/sock.h b/include/net/sock.h index 03a362568357..a6b9a8d1a6df 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -856,7 +856,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; -- cgit From 4ba72fc080ad44a5c1e93449ec070cd4d331803f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 21 Sep 2017 22:34:54 +0200 Subject: drm/sun4i: cec: Enable back CEC-pin framework Now that the cec-pin framework has been merged, we can remove the safeguard that were preventing the CEC part of the sun4i HDMI driver and actually start to use it. Signed-off-by: Hans Verkuil Signed-off-by: Maxime Ripard --- drivers/gpu/drm/sun4i/Kconfig | 2 +- drivers/gpu/drm/sun4i/sun4i_hdmi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/sun4i/Kconfig b/drivers/gpu/drm/sun4i/Kconfig index 06f05302ee75..882d85db9053 100644 --- a/drivers/gpu/drm/sun4i/Kconfig +++ b/drivers/gpu/drm/sun4i/Kconfig @@ -26,7 +26,7 @@ config DRM_SUN4I_HDMI_CEC bool "Allwinner A10 HDMI CEC Support" depends on DRM_SUN4I_HDMI select CEC_CORE - depends on CEC_PIN + select CEC_PIN help Choose this option if you have an Allwinner SoC with an HDMI controller and want to use CEC. diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi.h b/drivers/gpu/drm/sun4i/sun4i_hdmi.h index 1457750988da..a1f8cba251a2 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi.h +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi.h @@ -15,7 +15,7 @@ #include #include -#include +#include #define SUN4I_HDMI_CTRL_REG 0x004 #define SUN4I_HDMI_CTRL_ENABLE BIT(31) -- cgit From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Fri, 22 Sep 2017 07:53:15 +0200 Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache For nested virt we maintain multiple VMCS that can run on a vCPU. So it is incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in vCPU-wide data structures. Hyper-V nested on KVM runs into this consistently for me with PCID enabled. CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3) fires, and the currently loaded VMCS is updated. Then we switch from L2 to L1 and the next exit reverts CR3 to its old value. Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant") Signed-off-by: Ladi Prosek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0726ca7a1b02..c83d28b0ab05 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -200,6 +200,8 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ + unsigned long vmcs_host_cr4; /* May not match real cr4 */ struct list_head loaded_vmcss_on_cpu_link; }; @@ -600,8 +602,6 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -5178,12 +5178,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -9274,15 +9274,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the -- cgit From e001fa78d44d0b5c7b1498d1e4a038740efa3b1e Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 15 Sep 2017 15:26:14 +1000 Subject: KVM: PPC: Book3S HV: Check for updated HDSISR on P9 HDSI exception On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage Interrupt (HDSI) the HDSISR is not be updated at all. To work around this we put a canary value into the HDSISR before returning to a guest and then check for this canary when we take a HDSI. If we find the canary on a HDSI, we know the hardware didn't update the HDSISR. In this case we return to the guest to retake the HDSI which should correctly update the HDSISR the second time HDSI entry. After talking to Paulus we've applied this workaround to all POWER9 CPUs. The workaround of returning to the guest shouldn't ever be triggered on well behaving CPU. The extra instructions should have negligible performance impact. Signed-off-by: Michael Neuling Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 17936f82d3c7..ec69fa45d5a2 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1121,6 +1121,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION mtspr SPRN_PPR, r0 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + +/* Move canary into DSISR to check for later */ +BEGIN_FTR_SECTION + li r0, 0x7fff + mtspr SPRN_HDSISR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) @@ -1956,9 +1963,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) kvmppc_hdsi: ld r3, VCPU_KVM(r9) lbz r0, KVM_RADIX(r3) - cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR +BEGIN_FTR_SECTION + /* Look for DSISR canary. If we find it, retry instruction */ + cmpdi r6, 0x7fff + beq 6f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + cmpwi r0, 0 bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h -- cgit From e87be9b29c22852ec300826e3b1d551b00c1eb7a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Sep 2017 14:30:43 +0200 Subject: mmc: tmio: remove broken and noisy debug macro Some change for v4.14 broke the debug output for TMIO. But since it was not helpful to me and too noisy for my taste anyhow, let's just remove it instead of fixing it. We'll find something better if we'd need it... Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 47 ---------------------------------------- 1 file changed, 47 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 12cf8288d663..a7293e186e03 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -129,50 +129,6 @@ static int tmio_mmc_next_sg(struct tmio_mmc_host *host) #define CMDREQ_TIMEOUT 5000 -#ifdef CONFIG_MMC_DEBUG - -#define STATUS_TO_TEXT(a, status, i) \ - do { \ - if ((status) & TMIO_STAT_##a) { \ - if ((i)++) \ - printk(KERN_DEBUG " | "); \ - printk(KERN_DEBUG #a); \ - } \ - } while (0) - -static void pr_debug_status(u32 status) -{ - int i = 0; - - pr_debug("status: %08x = ", status); - STATUS_TO_TEXT(CARD_REMOVE, status, i); - STATUS_TO_TEXT(CARD_INSERT, status, i); - STATUS_TO_TEXT(SIGSTATE, status, i); - STATUS_TO_TEXT(WRPROTECT, status, i); - STATUS_TO_TEXT(CARD_REMOVE_A, status, i); - STATUS_TO_TEXT(CARD_INSERT_A, status, i); - STATUS_TO_TEXT(SIGSTATE_A, status, i); - STATUS_TO_TEXT(CMD_IDX_ERR, status, i); - STATUS_TO_TEXT(STOPBIT_ERR, status, i); - STATUS_TO_TEXT(ILL_FUNC, status, i); - STATUS_TO_TEXT(CMD_BUSY, status, i); - STATUS_TO_TEXT(CMDRESPEND, status, i); - STATUS_TO_TEXT(DATAEND, status, i); - STATUS_TO_TEXT(CRCFAIL, status, i); - STATUS_TO_TEXT(DATATIMEOUT, status, i); - STATUS_TO_TEXT(CMDTIMEOUT, status, i); - STATUS_TO_TEXT(RXOVERFLOW, status, i); - STATUS_TO_TEXT(TXUNDERRUN, status, i); - STATUS_TO_TEXT(RXRDY, status, i); - STATUS_TO_TEXT(TXRQ, status, i); - STATUS_TO_TEXT(ILL_ACCESS, status, i); - printk("\n"); -} - -#else -#define pr_debug_status(s) do { } while (0) -#endif - static void tmio_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) { struct tmio_mmc_host *host = mmc_priv(mmc); @@ -762,9 +718,6 @@ irqreturn_t tmio_mmc_irq(int irq, void *devid) status = sd_ctrl_read16_and_16_as_32(host, CTL_STATUS); ireg = status & TMIO_MASK_IRQ & ~host->sdcard_irq_mask; - pr_debug_status(status); - pr_debug_status(ireg); - /* Clear the status except the interrupt status */ sd_ctrl_write32_as_16_and_16(host, CTL_STATUS, TMIO_MASK_IRQ); -- cgit From 6ae033689d7b1a419def78e8e990b0eab8bb6419 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Sep 2017 15:16:08 +0300 Subject: mmc: sdhci-pci: Fix voltage switch for some Intel host controllers Some Intel host controllers (e.g. CNP) use an ACPI device-specific method to ensure correct voltage switching. Fix voltage switch for those, by adding a call to the DSM. Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index bbaddf18a1b3..d0ccc6729fd2 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -392,6 +392,7 @@ static const struct sdhci_pci_fixes sdhci_intel_pch_sdio = { enum { INTEL_DSM_FNS = 0, + INTEL_DSM_V18_SWITCH = 3, INTEL_DSM_DRV_STRENGTH = 9, INTEL_DSM_D3_RETUNE = 10, }; @@ -557,6 +558,19 @@ static void intel_hs400_enhanced_strobe(struct mmc_host *mmc, sdhci_writel(host, val, INTEL_HS400_ES_REG); } +static void sdhci_intel_voltage_switch(struct sdhci_host *host) +{ + struct sdhci_pci_slot *slot = sdhci_priv(host); + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct device *dev = &slot->chip->pdev->dev; + u32 result = 0; + int err; + + err = intel_dsm(intel_host, dev, INTEL_DSM_V18_SWITCH, &result); + pr_debug("%s: %s DSM error %d result %u\n", + mmc_hostname(host->mmc), __func__, err, result); +} + static const struct sdhci_ops sdhci_intel_byt_ops = { .set_clock = sdhci_set_clock, .set_power = sdhci_intel_set_power, @@ -565,6 +579,7 @@ static const struct sdhci_ops sdhci_intel_byt_ops = { .reset = sdhci_reset, .set_uhs_signaling = sdhci_set_uhs_signaling, .hw_reset = sdhci_pci_hw_reset, + .voltage_switch = sdhci_intel_voltage_switch, }; static void byt_read_dsm(struct sdhci_pci_slot *slot) -- cgit From c0d05cde2a685cd6486390c62be684ce456d84d6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 21 Sep 2017 11:20:58 +0100 Subject: iommu/of: Remove PCI host bridge node check of_pci_iommu_init() tries to be clever and stop its alias walk at the device represented by master_np, in case of weird PCI topologies where the bridge to the IOMMU and the rest of the system is not at the root. It turns out this is a bit short-sighted, since there are plenty of other callers of pci_for_each_dma_alias() which would also need the same behaviour in that situation, and the only platform so far with such a topology (Cavium ThunderX2) already solves it more generally via a PCI quirk. As this check is effectively redundant, and returning a boolean value as an int is a bit broken anyway, let's just get rid of it. Reported-by: Jean-Philippe Brucker Fixes: d87beb749281 ("iommu/of: Handle PCI aliases properly") Signed-off-by: Robin Murphy Tested-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e60e3dba85a0..50947ebb6d17 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -157,10 +157,7 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) err = of_iommu_xlate(info->dev, &iommu_spec); of_node_put(iommu_spec.np); - if (err) - return err; - - return info->np == pdev->bus->dev.of_node; + return err; } const struct iommu_ops *of_iommu_configure(struct device *dev, -- cgit From a88dc7ba15cd4f4ef5102b5185f8fa7ff86e54e1 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 20 Sep 2017 12:26:38 +0530 Subject: drivers/perf: arm_pmu_acpi: Release memory obtained by kasprintf Free memory region, if arm_pmu_acpi_probe is not successful. Acked-by: Will Deacon Signed-off-by: Arvind Yadav Signed-off-by: Catalin Marinas --- drivers/perf/arm_pmu_acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 0a9b78705ee8..3303dd8d8eb5 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -235,6 +235,7 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn) ret = armpmu_register(pmu); if (ret) { pr_warn("Failed to register PMU for CPU%d\n", cpu); + kfree(pmu->name); return ret; } } -- cgit From e6f9bc34d3779cb7b6a337afed5de8be3f0fab77 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Thu, 31 Aug 2017 09:30:34 -0700 Subject: IB/core: Fix for core panic Build with the latest patches resulted in panic: 11384.486289] BUG: unable to handle kernel NULL pointer dereference at (null) [11384.486293] IP: (null) [11384.486295] PGD 0 [11384.486295] P4D 0 [11384.486296] [11384.486299] Oops: 0010 [#1] SMP ......... snip ...... [11384.486401] CPU: 0 PID: 968 Comm: kworker/0:1H Tainted: G W O 4.13.0-a-stream-20170825 #1 [11384.486402] Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0014.121820151719 12/18/2015 [11384.486418] Workqueue: ib-comp-wq ib_cq_poll_work [ib_core] [11384.486419] task: ffff880850579680 task.stack: ffffc90007fec000 [11384.486420] RIP: 0010: (null) [11384.486420] RSP: 0018:ffffc90007fef970 EFLAGS: 00010206 [11384.486421] RAX: ffff88084cfe8000 RBX: ffff88084dce4000 RCX: ffffc90007fef978 [11384.486422] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff88084cfe8000 [11384.486422] RBP: ffffc90007fefab0 R08: 0000000000000000 R09: ffff88084dce4080 [11384.486423] R10: ffffffffa02d7f60 R11: 0000000000000000 R12: ffff88105af65a00 [11384.486423] R13: ffff88084dce4000 R14: 000000000000c000 R15: 000000000000c000 [11384.486424] FS: 0000000000000000(0000) GS:ffff88085f400000(0000) knlGS:0000000000000000 [11384.486425] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [11384.486425] CR2: 0000000000000000 CR3: 0000000001c09000 CR4: 00000000001406f0 [11384.486426] Call Trace: [11384.486431] ? is_valid_mcast_lid.isra.21+0xfb/0x110 [ib_core] [11384.486436] ib_attach_mcast+0x6f/0xa0 [ib_core] [11384.486441] ipoib_mcast_attach+0x81/0x190 [ib_ipoib] [11384.486443] ipoib_mcast_join_complete+0x354/0xb40 [ib_ipoib] [11384.486448] mcast_work_handler+0x330/0x6c0 [ib_core] [11384.486452] join_handler+0x101/0x220 [ib_core] [11384.486455] ib_sa_mcmember_rec_callback+0x54/0x80 [ib_core] [11384.486459] recv_handler+0x3a/0x60 [ib_core] [11384.486462] ib_mad_recv_done+0x423/0x9b0 [ib_core] [11384.486466] __ib_process_cq+0x5d/0xb0 [ib_core] [11384.486469] ib_cq_poll_work+0x20/0x60 [ib_core] [11384.486472] process_one_work+0x149/0x360 [11384.486474] worker_thread+0x4d/0x3c0 [11384.486487] kthread+0x109/0x140 [11384.486488] ? rescuer_thread+0x380/0x380 [11384.486489] ? kthread_park+0x60/0x60 [11384.486490] ? kthread_park+0x60/0x60 [11384.486493] ret_from_fork+0x25/0x30 [11384.486493] Code: Bad RIP value. [11384.486493] Code: Bad RIP value. [11384.486496] RIP: (null) RSP: ffffc90007fef970 [11384.486497] CR2: 0000000000000000 [11384.486531] ---[ end trace b1acec6fb4ff6e75 ]--- [11384.532133] Kernel panic - not syncing: Fatal exception [11384.536541] Kernel Offset: disabled [11384.969491] ---[ end Kernel panic - not syncing: Fatal exception [11384.976875] sched: Unexpected reschedule of offline CPU#1! [11384.983646] ------------[ cut here ]------------ Rdma device driver may not have implemented (*get_link_layer)() so it can not be called directly. Should use appropriate helper function. Reviewed-by: Yuval Shaia Fixes: 523633359224 ("IB/core: Fix the validations of a multicast LID in attach or detach operations") Cc: stable@kernel.org # 4.13 Reviewed-by: Dennis Dalessandro Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ee9e27dc799b..de57d6c11a25 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1646,7 +1646,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) */ if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { if (attr.qp_state >= IB_QPS_INIT) { - if (qp->device->get_link_layer(qp->device, attr.port_num) != + if (rdma_port_get_link_layer(qp->device, attr.port_num) != IB_LINK_LAYER_INFINIBAND) return true; goto lid_check; @@ -1655,7 +1655,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) /* Can't get a quick answer, iterate over all ports */ for (port = 0; port < qp->device->phys_port_cnt; port++) - if (qp->device->get_link_layer(qp->device, port) != + if (rdma_port_get_link_layer(qp->device, port) != IB_LINK_LAYER_INFINIBAND) num_eth_ports++; -- cgit From 3d318605f5e32ff44fb290d9b67573b34213c4c8 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 13 Sep 2017 09:52:32 -0700 Subject: iw_cxgb4: put ep reference in pass_accept_req() The listening endpoint should always be dereferenced at the end of pass_accept_req(). Fixes: f86fac79afec ("RDMA/iw_cxgb4: atomic find and reference for listening endpoints") Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index ceaa2fa54d32..83322dbc4711 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2594,9 +2594,9 @@ fail: c4iw_put_ep(&child_ep->com); reject: reject_cr(dev, hwtid, skb); +out: if (parent_ep) c4iw_put_ep(&parent_ep->com); -out: return 0; } -- cgit From 3c8415cc7aff467faba25841fb859660ac14a04e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Sep 2017 11:52:33 -0700 Subject: iw_cxgb4: drop listen destroy replies if no ep found If the thread waiting for a CLOSE_LISTSRV_RPL times out and bails, then we need to handle a subsequent CPL if it arrives and the stid has been released. In this case silently drop it. Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 83322dbc4711..19297e408820 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2333,9 +2333,14 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int stid = GET_TID(rpl); struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); + if (!ep) { + pr_debug("%s stid %d lookup failure!\n", __func__, stid); + goto out; + } pr_debug("%s ep %p\n", __func__, ep); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); c4iw_put_ep(&ep->com); +out: return 0; } -- cgit From 8b1bbf36b7452c4acb20e91948eaa5e225ea6978 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Sep 2017 11:52:34 -0700 Subject: iw_cxgb4: remove the stid on listen create failure If a listen create fails, then the server tid (stid) is incorrectly left in the stid idr table, which can cause a touch-after-free if the stid is looked up and the already freed endpoint is touched. So make sure and remove it in the error path. Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 19297e408820..daf7a56e5d7e 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3462,7 +3462,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = ep; goto out; } - + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: -- cgit From 05f5c38576475439f3124a3e6d62db68de83f7be Mon Sep 17 00:00:00 2001 From: KT Liao Date: Fri, 22 Sep 2017 10:00:57 -0700 Subject: Input: elan_i2c - extend Flash-Write delay The original 20ms delay is only marginally enough delay after a block write operation during firmware update. Let's increase the delay to ensure that the controller finishes up storing the page to avoid failures in the firmware updates. Signed-off-by: KT Liao Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/elan_i2c_i2c.c b/drivers/input/mouse/elan_i2c_i2c.c index 15b1330606c1..e19eb60b3d2f 100644 --- a/drivers/input/mouse/elan_i2c_i2c.c +++ b/drivers/input/mouse/elan_i2c_i2c.c @@ -598,7 +598,7 @@ static int elan_i2c_write_fw_block(struct i2c_client *client, } /* Wait for F/W to update one page ROM data. */ - msleep(20); + msleep(35); error = elan_i2c_read_cmd(client, ETP_I2C_IAP_CTRL_CMD, val); if (error) { -- cgit From af3c79beff48fb2bd1c79d02688004cf57215acf Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 7 Sep 2017 13:38:18 +0300 Subject: IB/ipoib: Suppress the retry related completion errors IPoIB doesn't support transport/rnr retry schemes as per RFC so those errors are expected. No need to flood the log files with them. Tested-by: Michael Nowak Tested-by: Rafael Alejandro Peralez Tested-by: Liwen Huang Tested-by: Hong Liu Reviewed-by: Mukesh Kacker Reported-by: Rajiv Raja Signed-off-by: Santosh Shilimkar Signed-off-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 14b62f7472b4..7774654c2ccb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -823,12 +823,18 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR) { struct ipoib_neigh *neigh; - if (wc->status != IB_WC_RNR_RETRY_EXC_ERR) - ipoib_warn(priv, "failed cm send event (status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + /* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle, + * so don't make waves. + */ + if (wc->status == IB_WC_RNR_RETRY_EXC_ERR || + wc->status == IB_WC_RETRY_EXC_ERR) + ipoib_dbg(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err 0x%x)\n", + __func__, wc->status, wr_id, wc->vendor_err); else - ipoib_dbg(priv, "failed cm send event (status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + ipoib_warn(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err 0x%x)\n", + __func__, wc->status, wr_id, wc->vendor_err); spin_lock_irqsave(&priv->lock, flags); neigh = tx->neigh; -- cgit From 06564f60859bdf7e73d70ae35d7e285e96ae9c46 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 11 Sep 2017 17:03:13 +0100 Subject: IB/ocrdma: fix incorrect fall-through on switch statement In the case where mbox_status is OCRDMA_MBX_STATUS_FAILED and add_status is OCRDMA_MBX_STATUS_FAILED err_num is assigned -EAGAIN however the case OCRDMA_MBX_STATUS_FAILED is missing a break and falls through to the default case which then re-assigns err_num to -EFAULT. Fix this so that err_num is assigned to -EAGAIN for the add_status OCRDMA_MBX_STATUS_FAILED case and -EFAULT otherwise. Detected by CoverityScan CID#703125 ("Missing break in switch") Fixes: fe2caefcdf58 ("RDMA/ocrdma: Add driver for Emulex OneConnect IBoE RDMA adapter") Signed-off-by: Colin Ian King Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index dcb5942f9fb5..65b166cc7437 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -252,7 +252,10 @@ static int ocrdma_get_mbx_errno(u32 status) case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES: err_num = -EAGAIN; break; + default: + err_num = -EFAULT; } + break; default: err_num = -EFAULT; } -- cgit From cbafad87e1507044c7d442087d41d5e3d432cc4e Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 18 Sep 2017 12:28:48 +0100 Subject: IB/mlx5: fix debugfs cleanup If delay_drop_debugfs_init() fails in any of the operations to create debugfs, it is calling delay_drop_debugfs_cleanup() as part of its cleanup. But delay_drop_debugfs_cleanup() checks for 'dbg' and since we have not yet pointed 'dbg' to the debugfs we need to cleanup, the cleanup fails and we are left with stray debugfs elements and also a memory leak. Fixes: 4a5fd5d2965c ("IB/mlx5: Add necessary delay drop assignment") Signed-off-by: Sudip Mukherjee Acked-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ab3c562d5ba7..05fb4bdff6a0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3837,11 +3837,13 @@ static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) if (!dbg) return -ENOMEM; + dev->delay_drop.dbg = dbg; + dbg->dir_debugfs = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root); if (!dbg->dir_debugfs) - return -ENOMEM; + goto out_debugfs; dbg->events_cnt_debugfs = debugfs_create_atomic_t("num_timeout_events", 0400, @@ -3865,8 +3867,6 @@ static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) if (!dbg->timeout_debugfs) goto out_debugfs; - dev->delay_drop.dbg = dbg; - return 0; out_debugfs: -- cgit From e13547bc181ae6c279adf0df054717787f24ee89 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 19 Sep 2017 13:22:13 +0300 Subject: IB/bnxt_re: Fix frame stack compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce stack size by dynamically allocating memory instead of declaring large struct on the stack: drivers/infiniband/hw/bnxt_re/ib_verbs.c: In function ‘bnxt_re_query_qp’: drivers/infiniband/hw/bnxt_re/ib_verbs.c:1600:1: warning: the frame size of 1216 bytes is larger than 1024 bytes [-Wframe-larger-than=] } ^ Cc: Selvin Xavier Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Leon Romanovsky Acked-by: Selvin Xavier Reviewed-by: Jonathan Toppins Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 67 +++++++++++++++++--------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 01eee15bbd65..03caf48af8e2 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1551,43 +1551,46 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_re_dev *rdev = qp->rdev; - struct bnxt_qplib_qp qplib_qp; + struct bnxt_qplib_qp *qplib_qp; int rc; - memset(&qplib_qp, 0, sizeof(struct bnxt_qplib_qp)); - qplib_qp.id = qp->qplib_qp.id; - qplib_qp.ah.host_sgid_index = qp->qplib_qp.ah.host_sgid_index; + qplib_qp = kzalloc(sizeof(*qplib_qp), GFP_KERNEL); + if (!qplib_qp) + return -ENOMEM; + + qplib_qp->id = qp->qplib_qp.id; + qplib_qp->ah.host_sgid_index = qp->qplib_qp.ah.host_sgid_index; - rc = bnxt_qplib_query_qp(&rdev->qplib_res, &qplib_qp); + rc = bnxt_qplib_query_qp(&rdev->qplib_res, qplib_qp); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to query HW QP"); - return rc; + goto out; } - qp_attr->qp_state = __to_ib_qp_state(qplib_qp.state); - qp_attr->en_sqd_async_notify = qplib_qp.en_sqd_async_notify ? 1 : 0; - qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp.access); - qp_attr->pkey_index = qplib_qp.pkey_index; - qp_attr->qkey = qplib_qp.qkey; + qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state); + qp_attr->en_sqd_async_notify = qplib_qp->en_sqd_async_notify ? 1 : 0; + qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp->access); + qp_attr->pkey_index = qplib_qp->pkey_index; + qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; - rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp.ah.flow_label, - qplib_qp.ah.host_sgid_index, - qplib_qp.ah.hop_limit, - qplib_qp.ah.traffic_class); - rdma_ah_set_dgid_raw(&qp_attr->ah_attr, qplib_qp.ah.dgid.data); - rdma_ah_set_sl(&qp_attr->ah_attr, qplib_qp.ah.sl); - ether_addr_copy(qp_attr->ah_attr.roce.dmac, qplib_qp.ah.dmac); - qp_attr->path_mtu = __to_ib_mtu(qplib_qp.path_mtu); - qp_attr->timeout = qplib_qp.timeout; - qp_attr->retry_cnt = qplib_qp.retry_cnt; - qp_attr->rnr_retry = qplib_qp.rnr_retry; - qp_attr->min_rnr_timer = qplib_qp.min_rnr_timer; - qp_attr->rq_psn = qplib_qp.rq.psn; - qp_attr->max_rd_atomic = qplib_qp.max_rd_atomic; - qp_attr->sq_psn = qplib_qp.sq.psn; - qp_attr->max_dest_rd_atomic = qplib_qp.max_dest_rd_atomic; - qp_init_attr->sq_sig_type = qplib_qp.sig_type ? IB_SIGNAL_ALL_WR : - IB_SIGNAL_REQ_WR; - qp_attr->dest_qp_num = qplib_qp.dest_qpn; + rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->ah.flow_label, + qplib_qp->ah.host_sgid_index, + qplib_qp->ah.hop_limit, + qplib_qp->ah.traffic_class); + rdma_ah_set_dgid_raw(&qp_attr->ah_attr, qplib_qp->ah.dgid.data); + rdma_ah_set_sl(&qp_attr->ah_attr, qplib_qp->ah.sl); + ether_addr_copy(qp_attr->ah_attr.roce.dmac, qplib_qp->ah.dmac); + qp_attr->path_mtu = __to_ib_mtu(qplib_qp->path_mtu); + qp_attr->timeout = qplib_qp->timeout; + qp_attr->retry_cnt = qplib_qp->retry_cnt; + qp_attr->rnr_retry = qplib_qp->rnr_retry; + qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->rq_psn = qplib_qp->rq.psn; + qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; + qp_attr->sq_psn = qplib_qp->sq.psn; + qp_attr->max_dest_rd_atomic = qplib_qp->max_dest_rd_atomic; + qp_init_attr->sq_sig_type = qplib_qp->sig_type ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + qp_attr->dest_qp_num = qplib_qp->dest_qpn; qp_attr->cap.max_send_wr = qp->qplib_qp.sq.max_wqe; qp_attr->cap.max_send_sge = qp->qplib_qp.sq.max_sge; @@ -1596,7 +1599,9 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->cap.max_inline_data = qp->qplib_qp.max_inline_data; qp_init_attr->cap = qp_attr->cap; - return 0; +out: + kfree(qplib_qp); + return rc; } /* Routine for sending QP1 packets for RoCE V1 an V2 -- cgit From 01df7f5a77b926c2d790cf66d942d2a68d4ad5de Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Thu, 21 Sep 2017 15:56:21 -0700 Subject: RDMA/vmw_pvrdma: Fix reporting correct opcodes for completion Since the IB_WC_BIND_MW opcode has been dropped, set the correct IB WC opcode explicitly. Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Reviewed-by: Aditya Sarwade Reviewed-by: Jorgen Hansen Signed-off-by: Adit Ranadive Signed-off-by: Bryan Tan Signed-off-by: Doug Ledford --- drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 663a0c301c43..984aa3484928 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -416,9 +416,34 @@ static inline enum ib_wc_status pvrdma_wc_status_to_ib( return (enum ib_wc_status)status; } -static inline int pvrdma_wc_opcode_to_ib(int opcode) -{ - return opcode; +static inline int pvrdma_wc_opcode_to_ib(unsigned int opcode) +{ + switch (opcode) { + case PVRDMA_WC_SEND: + return IB_WC_SEND; + case PVRDMA_WC_RDMA_WRITE: + return IB_WC_RDMA_WRITE; + case PVRDMA_WC_RDMA_READ: + return IB_WC_RDMA_READ; + case PVRDMA_WC_COMP_SWAP: + return IB_WC_COMP_SWAP; + case PVRDMA_WC_FETCH_ADD: + return IB_WC_FETCH_ADD; + case PVRDMA_WC_LOCAL_INV: + return IB_WC_LOCAL_INV; + case PVRDMA_WC_FAST_REG_MR: + return IB_WC_REG_MR; + case PVRDMA_WC_MASKED_COMP_SWAP: + return IB_WC_MASKED_COMP_SWAP; + case PVRDMA_WC_MASKED_FETCH_ADD: + return IB_WC_MASKED_FETCH_ADD; + case PVRDMA_WC_RECV: + return IB_WC_RECV; + case PVRDMA_WC_RECV_RDMA_WITH_IMM: + return IB_WC_RECV_RDMA_WITH_IMM; + default: + return IB_WC_SEND; + } } static inline int pvrdma_wc_flags_to_ib(int flags) -- cgit From cd9100ca9e78eb03bf5a0ee2ba98ebdc8cc5880e Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:09 -0500 Subject: i40iw: Fail open if there are no available MSI-X vectors Check number of available MSI-X vectors for i40iw. If there are no available vectors, fail the open. Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index cc742c3132c6..5965b59a2f83 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1400,6 +1400,11 @@ static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev, u32 i; u32 size; + if (!ldev->msix_count) { + i40iw_pr_err("No MSI-X vectors\n"); + return I40IW_ERR_CONFIG; + } + iwdev->msix_count = ldev->msix_count; size = sizeof(struct i40iw_msix_vector) * iwdev->msix_count; @@ -1550,7 +1555,7 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, status = i40iw_save_msix_info(iwdev, ldev); if (status) - goto exit; + return status; iwdev->hw.dev_context = (void *)ldev->pcidev; iwdev->hw.hw_addr = ldev->hw_addr; status = i40iw_allocate_dma_mem(&iwdev->hw, -- cgit From 47fb3c1610b28dd435b892762280eebf04fe9adf Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:10 -0500 Subject: i40iw: Prevent multiple netdev event notifier registrations Netdev event notifier registration/de-registration is not synchronized with a lock and there is a possibility of a duplicate registration of notifier before the unregister completes. Register netdev event notifiers during module init and de-register them at module exit. This avoids the need to tie the registration to first netdev client interface open and de-registration to last client interface close and the synchronization to achieve it. This also fixes a crash due to duplicate registration. BUG: unable to handle kernel paging request at ffffffffa0d60388 IP: [] notifier_call_chain+0x3d/0x70 PGD 190d067 PUD 190e063 PMD 76c840067 PTE 0 Oops: 0000 [#1] SMP Modules linked in: i40e(OF-) fuse btrfs zlib_deflate raid6_pq xor vfat msdos [..] e1000e vxlan ip_tunnel ptp pps_core i2c_core video [last unloaded: i40iw] CPU: 1 PID: 27101 Comm: modprobe Tainted: GF W O-------------- 3.10.0-229.el7.x86_64 #1 Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./Q87M-D2H, BIOS F7 01/17/2014 task: ffff88076e8a96c0 ti: ffff8806959c8000 task.ti: ffff8806959c8000 RIP: 0010:[] [] notifier_call_chain+0x3d/0x70 RSP: 0018:ffff8806959cbb38 EFLAGS: 00010282 RAX: ffffffffa0d60380 RBX: 00000000fffffffd RCX: 0000000000000000 0708] RDX: 0000000000000000 RSI: ffff88081227a000 RDI: 0000000000000002 RBP: ffff8806959cbb60 R08: 0000000000000246 R09: 000000000000700c R10: ffff88080e16ea40 R11: 00000000000ae8df R12: ffffffffa0d60380 R13: 0000000000000002 R14: ffff88076e738800 R15: 0000000000000000 FS: 00007f604ef4a740(0000) GS:ffff88083e240000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa0d60388 CR3: 0000000753cd2000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffffffff819e73a0 0000000000000000 0000000000000002 ffff88076e738800 00000000ffffffff ffff8806959cbba0 ffffffff8109d61d 0000000000000000 0000000000000000 ffff88076e738800 0000000000000000 ffff88076e738800 Call Trace: [] __blocking_notifier_call_chain+0x4d/0x70 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0x154/0x2b0 [] inetdev_event+0x182/0x530 [] notifier_call_chain+0x4c/0x70 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x2d/0x60 [] rollback_registered_many+0x105/0x220 [] rollback_registered+0x40/0x70 [] unregister_netdevice_queue+0x48/0x80 [] unregister_netdev+0x1c/0x30 [] i40e_vsi_release+0x2a9/0x2b0 [i40e] [] i40e_remove+0x128/0x2b0 [i40e] [] pci_device_remove+0x3b/0xb0 [] __device_release_driver+0x7f/0xf0 [] driver_detach+0xb8/0xc0 [] bus_remove_driver+0x9b/0x120 [] driver_unregister+0x2c/0x50 [] pci_unregister_driver+0x2c/0x90 [] i40e_exit_module+0x10/0x23 [i40e] [] SyS_delete_module+0x16b/0x2d0 [] ? do_notify_resume+0x9c/0xb0 [] system_call_fastpath+0x16/0x1b Code: e5 41 57 4d 89 c7 41 56 49 89 d6 41 55 49 89 f5 41 54 53 89 cb 75 14 eb 3d 0f 1f 44 00 00 83 eb 01 74 25 4d 85 e4 74 20 4c 89 e0 <4c> 8b 60 08 4c 89 f2 4c 89 ee 48 89 c7 ff 10 4d 85 ff 74 04 41 RIP [] notifier_call_chain+0x3d/0x70 Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw.h | 1 - drivers/infiniband/hw/i40iw/i40iw_main.c | 32 ++++++++++++++++--------------- drivers/infiniband/hw/i40iw/i40iw_utils.c | 6 +++--- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 9b1566468744..a65e4cbdce2f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -201,7 +201,6 @@ enum init_completion_state { CEQ_CREATED, ILQ_CREATED, IEQ_CREATED, - INET_NOTIFIER, IP_ADDR_REGISTERED, RDMA_DEV_REGISTERED }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 5965b59a2f83..27590ae21881 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -99,8 +99,6 @@ static struct notifier_block i40iw_net_notifier = { .notifier_call = i40iw_net_event }; -static atomic_t i40iw_notifiers_registered; - /** * i40iw_find_i40e_handler - find a handler given a client info * @ldev: pointer to a client info @@ -1376,11 +1374,20 @@ error: */ static void i40iw_register_notifiers(void) { - if (atomic_inc_return(&i40iw_notifiers_registered) == 1) { - register_inetaddr_notifier(&i40iw_inetaddr_notifier); - register_inet6addr_notifier(&i40iw_inetaddr6_notifier); - register_netevent_notifier(&i40iw_net_notifier); - } + register_inetaddr_notifier(&i40iw_inetaddr_notifier); + register_inet6addr_notifier(&i40iw_inetaddr6_notifier); + register_netevent_notifier(&i40iw_net_notifier); +} + +/** + * i40iw_unregister_notifiers - unregister tcp ip notifiers + */ + +static void i40iw_unregister_notifiers(void) +{ + unregister_netevent_notifier(&i40iw_net_notifier); + unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); + unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); } /** @@ -1467,12 +1474,6 @@ static void i40iw_deinit_device(struct i40iw_device *iwdev) if (!iwdev->reset) i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx); /* fallthrough */ - case INET_NOTIFIER: - if (!atomic_dec_return(&i40iw_notifiers_registered)) { - unregister_netevent_notifier(&i40iw_net_notifier); - unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); - unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); - } /* fallthrough */ case PBLE_CHUNK_MEM: i40iw_destroy_pble_pool(dev, iwdev->pble_rsrc); @@ -1672,8 +1673,6 @@ static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client) break; iwdev->init_state = PBLE_CHUNK_MEM; iwdev->virtchnl_wq = alloc_ordered_workqueue("iwvch", WQ_MEM_RECLAIM); - i40iw_register_notifiers(); - iwdev->init_state = INET_NOTIFIER; status = i40iw_add_mac_ip(iwdev); if (status) break; @@ -2023,6 +2022,8 @@ static int __init i40iw_init_module(void) i40iw_client.type = I40E_CLIENT_IWARP; spin_lock_init(&i40iw_handler_lock); ret = i40e_register_client(&i40iw_client); + i40iw_register_notifiers(); + return ret; } @@ -2034,6 +2035,7 @@ static int __init i40iw_init_module(void) */ static void __exit i40iw_exit_module(void) { + i40iw_unregister_notifiers(); i40e_unregister_client(&i40iw_client); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 62f1f45b8737..e52dbbb4165e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -160,7 +160,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, return NOTIFY_DONE; iwdev = &hdl->device; - if (iwdev->init_state < INET_NOTIFIER) + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) return NOTIFY_DONE; netdev = iwdev->ldev->netdev; @@ -217,7 +217,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, return NOTIFY_DONE; iwdev = &hdl->device; - if (iwdev->init_state < INET_NOTIFIER) + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) return NOTIFY_DONE; netdev = iwdev->ldev->netdev; @@ -266,7 +266,7 @@ int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void * if (!iwhdl) return NOTIFY_DONE; iwdev = &iwhdl->device; - if (iwdev->init_state < INET_NOTIFIER) + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) return NOTIFY_DONE; p = (__be32 *)neigh->primary_key; i40iw_copy_ip_ntohl(local_ipaddr, p); -- cgit From 471b370d52a4a461bf855ff542b544f3e4a5cb3a Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:11 -0500 Subject: i40iw: Call i40iw_cm_disconn on modify QP to disconnect If QP modify to closing/terminate/error fails, connection is not torn down as there is no corresponding asynchronous event that will initiate the teardown. Add explicit call to i40iw_cm_disconn if not waiting in modify QP, otherwise schedule it in CM timer. Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1aa411034a27..28b3d02d511b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1027,7 +1027,19 @@ int i40iw_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSED; iwqp->last_aeq = I40IW_AE_RESET_SENT; spin_unlock_irqrestore(&iwqp->lock, flags); + i40iw_cm_disconn(iwqp); } + } else { + spin_lock_irqsave(&iwqp->lock, flags); + if (iwqp->cm_id) { + if (atomic_inc_return(&iwqp->close_timer_started) == 1) { + iwqp->cm_id->add_ref(iwqp->cm_id); + i40iw_schedule_cm_timer(iwqp->cm_node, + (struct i40iw_puda_buf *)iwqp, + I40IW_TIMER_TYPE_CLOSE, 1, 0); + } + } + spin_unlock_irqrestore(&iwqp->lock, flags); } } return 0; -- cgit From dfc612b3407e88913a58db00b3bca93685d4f4f9 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 19 Sep 2017 09:19:12 -0500 Subject: i40iw: Add missing VLAN priority Set the VLAN priority which is in the upper 3 bits of the VLAN tag field in the QP context. Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 3 ++- drivers/infiniband/hw/i40iw/i40iw_cm.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 14f36ba4e5be..b7215448bb63 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -3255,7 +3255,8 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, tcp_info->snd_mss = cpu_to_le32(((u32)cm_node->tcp_cntxt.mss)); if (cm_node->vlan_id < VLAN_TAG_PRESENT) { tcp_info->insert_vlan_tag = true; - tcp_info->vlan_tag = cpu_to_le16(cm_node->vlan_id); + tcp_info->vlan_tag = cpu_to_le16(((u16)cm_node->user_pri << I40IW_VLAN_PRIO_SHIFT) | + cm_node->vlan_id); } if (cm_node->ipv4) { tcp_info->src_port = cpu_to_le16(cm_node->loc_port); diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 2e52e38ffcf3..8626e7f1fdd3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -71,6 +71,8 @@ #define I40IW_HW_IRD_SETTING_32 32 #define I40IW_HW_IRD_SETTING_64 64 +#define I40IW_VLAN_PRIO_SHIFT 13 + enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ -- cgit From f16dc0aa5ea20a2cf173e82ade5f05bfecaa850a Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:13 -0500 Subject: i40iw: Add support for port reuse on active side connections During OpenMPI scale up testing, we observe rdma_connect failures if ports are reused on multiple connections. This is because the Control Queue-Pair (CQP) command to add the reused port to Accelerated Port Bit VectorTable (APBVT) fails as there already exists an entry. Check for duplicate port before invoking the CQP command to add APBVT entry and delete the entry only if the port is not in use. Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 151 ++++++++++++++++----------------- drivers/infiniband/hw/i40iw/i40iw_cm.h | 3 + 2 files changed, 78 insertions(+), 76 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index b7215448bb63..5230dd3c938c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1504,23 +1504,40 @@ static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, } /** - * listen_port_in_use - determine if port is in use - * @port: Listen port number + * i40iw_port_in_use - determine if port is in use + * @port: port number + * @active_side: flag for listener side vs active side */ -static bool i40iw_listen_port_in_use(struct i40iw_cm_core *cm_core, u16 port) +static bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port, bool active_side) { struct i40iw_cm_listener *listen_node; + struct i40iw_cm_node *cm_node; unsigned long flags; bool ret = false; - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { - if (listen_node->loc_port == port) { - ret = true; - break; + if (active_side) { + /* search connected node list */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_entry(cm_node, &cm_core->connected_nodes, list) { + if (cm_node->loc_port == port) { + ret = true; + break; + } + } + if (!ret) + clear_bit(port, cm_core->active_side_ports); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } else { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + if (listen_node->loc_port == port) { + ret = true; + break; + } } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); } - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return ret; } @@ -1868,7 +1885,7 @@ static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); if (listener->iwdev) { - if (apbvt_del && !i40iw_listen_port_in_use(cm_core, listener->loc_port)) + if (apbvt_del && !i40iw_port_in_use(cm_core, listener->loc_port, false)) i40iw_manage_apbvt(listener->iwdev, listener->loc_port, I40IW_MANAGE_APBVT_DEL); @@ -2247,21 +2264,21 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) if (cm_node->listener) { i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); } else { - if (!i40iw_listen_port_in_use(cm_core, cm_node->loc_port) && - cm_node->apbvt_set) { + if (!i40iw_port_in_use(cm_core, cm_node->loc_port, true) && cm_node->apbvt_set) { i40iw_manage_apbvt(cm_node->iwdev, cm_node->loc_port, I40IW_MANAGE_APBVT_DEL); - i40iw_get_addr_info(cm_node, &nfo); - if (cm_node->qhash_set) { - i40iw_manage_qhash(cm_node->iwdev, - &nfo, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - cm_node->qhash_set = 0; - } + cm_node->apbvt_set = 0; + } + i40iw_get_addr_info(cm_node, &nfo); + if (cm_node->qhash_set) { + i40iw_manage_qhash(cm_node->iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + cm_node->qhash_set = 0; } } @@ -3738,10 +3755,8 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in *raddr; struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; - bool qhash_set = false; - int apbvt_set = 0; - int err = 0; - enum i40iw_status_code status; + int ret = 0; + unsigned long flags; ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -3790,32 +3805,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_info.user_pri = rt_tos2priority(cm_id->tos); i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "%s TOS:[%d] UP:[%d]\n", __func__, cm_id->tos, cm_info.user_pri); - if ((cm_info.ipv4 && (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr)) || - (!cm_info.ipv4 && memcmp(laddr6->sin6_addr.in6_u.u6_addr32, - raddr6->sin6_addr.in6_u.u6_addr32, - sizeof(laddr6->sin6_addr.in6_u.u6_addr32)))) { - status = i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_ADD, - NULL, - true); - if (status) - return -EINVAL; - qhash_set = true; - } - status = i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD); - if (status) { - i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - return -EINVAL; - } - - apbvt_set = 1; cm_id->add_ref(cm_id); cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev, conn_param->private_data_len, @@ -3823,17 +3812,40 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) &cm_info); if (IS_ERR(cm_node)) { - err = PTR_ERR(cm_node); - goto err_out; + ret = PTR_ERR(cm_node); + cm_id->rem_ref(cm_id); + return ret; + } + + if ((cm_info.ipv4 && (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr)) || + (!cm_info.ipv4 && memcmp(laddr6->sin6_addr.in6_u.u6_addr32, + raddr6->sin6_addr.in6_u.u6_addr32, + sizeof(laddr6->sin6_addr.in6_u.u6_addr32)))) { + if (i40iw_manage_qhash(iwdev, &cm_info, I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_ADD, NULL, true)) { + ret = -EINVAL; + goto err; + } + cm_node->qhash_set = true; } + spin_lock_irqsave(&iwdev->cm_core.ht_lock, flags); + if (!test_and_set_bit(cm_info.loc_port, iwdev->cm_core.active_side_ports)) { + spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD)) { + ret = -EINVAL; + goto err; + } + } else { + spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + } + + cm_node->apbvt_set = true; i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && !cm_node->ord_size) cm_node->ord_size = 1; - cm_node->apbvt_set = apbvt_set; - cm_node->qhash_set = qhash_set; iwqp->cm_node = cm_node; cm_node->iwqp = iwqp; iwqp->cm_id = cm_id; @@ -3841,11 +3853,9 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (cm_node->state != I40IW_CM_STATE_OFFLOADED) { cm_node->state = I40IW_CM_STATE_SYN_SENT; - err = i40iw_send_syn(cm_node, 0); - if (err) { - i40iw_rem_ref_cm_node(cm_node); - goto err_out; - } + ret = i40iw_send_syn(cm_node, 0); + if (ret) + goto err; } i40iw_debug(cm_node->dev, @@ -3854,9 +3864,10 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->rem_port, cm_node, cm_node->cm_id); + return 0; -err_out: +err: if (cm_info.ipv4) i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, @@ -3868,22 +3879,10 @@ err_out: "Api - connect() FAILED: dest addr=%pI6", cm_info.rem_addr); - if (qhash_set) - i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - - if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core, - cm_info.loc_port)) - i40iw_manage_apbvt(iwdev, - cm_info.loc_port, - I40IW_MANAGE_APBVT_DEL); + i40iw_rem_ref_cm_node(cm_node); cm_id->rem_ref(cm_id); iwdev->cm_core.stats_connect_errs++; - return err; + return ret; } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 8626e7f1fdd3..45abef76295b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -71,6 +71,7 @@ #define I40IW_HW_IRD_SETTING_32 32 #define I40IW_HW_IRD_SETTING_64 64 +#define MAX_PORTS 65536 #define I40IW_VLAN_PRIO_SHIFT 13 enum ietf_mpa_flags { @@ -413,6 +414,8 @@ struct i40iw_cm_core { spinlock_t ht_lock; /* manage hash table */ spinlock_t listen_list_lock; /* listen list */ + unsigned long active_side_ports[BITS_TO_LONGS(MAX_PORTS)]; + u64 stats_nodes_created; u64 stats_nodes_destroyed; u64 stats_listen_created; -- cgit From 432654df90f2a7d8ac8438905208074604ed3e00 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 11 Sep 2017 21:41:43 +0200 Subject: parisc: Fix too large frame size warnings The parisc architecture has larger stack frames than most other architectures on 32-bit kernels. Increase the maximum allowed stack frame to 1280 bytes for parisc to avoid warnings in the do_sys_poll() and pat_memconfig() functions. Signed-off-by: Helge Deller --- lib/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b19c491cbc4e..2689b7c50c52 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -219,7 +219,8 @@ config FRAME_WARN range 0 8192 default 0 if KASAN default 2048 if GCC_PLUGIN_LATENT_ENTROPY - default 1024 if !64BIT + default 1280 if (!64BIT && PARISC) + default 1024 if (!64BIT && !PARISC) default 2048 if 64BIT help Tell gcc to warn at build time for stack frames larger than this. -- cgit From e77900abfd8be4e207412d8b7752dbb9838e2571 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:05:02 +0200 Subject: parisc: Stop unwinding at start of stack Check stack pointer if we are reaching the stack end and stop unwinding if we do. This fixes early backtraces and avoids showing unrealistic call stacks. Signed-off-by: Helge Deller --- arch/parisc/kernel/unwind.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 48dc7d4d20bb..caab39dfa95d 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -279,6 +280,17 @@ static void unwind_frame_regs(struct unwind_frame_info *info) info->prev_sp = sp - 64; info->prev_ip = 0; + + /* The stack is at the end inside the thread_union + * struct. If we reach data, we have reached the + * beginning of the stack and should stop unwinding. */ + if (info->prev_sp >= (unsigned long) task_thread_info(info->t) && + info->prev_sp < ((unsigned long) task_thread_info(info->t) + + THREAD_SZ_ALGN)) { + info->prev_sp = 0; + break; + } + if (get_user(tmp, (unsigned long *)(info->prev_sp - RP_OFFSET))) break; info->prev_ip = tmp; -- cgit From 08b8a99b2c5ea8da4d3dd55056881d12baea1e04 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:17:10 +0200 Subject: parisc: Move start_parisc() into init section Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index dee6f9d6a153..a31e91c1782b 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -398,9 +399,8 @@ static int __init parisc_init(void) } arch_initcall(parisc_init); -void start_parisc(void) +void __init start_parisc(void) { - extern void start_kernel(void); extern void early_trap_init(void); int ret, cpunum; -- cgit From 77089c5274fe2f72db5a2cd956d0d308aed08e68 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:15:09 +0200 Subject: parisc: Add wrapper for pdc_instr() firmware function Signed-off-by: Helge Deller --- arch/parisc/include/asm/pdc.h | 1 + arch/parisc/kernel/firmware.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 26b4455baa83..510341f62d97 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -280,6 +280,7 @@ void setup_pdc(void); /* in inventory.c */ /* wrapper-functions from pdc.c */ int pdc_add_valid(unsigned long address); +int pdc_instr(unsigned int *instr); int pdc_chassis_info(struct pdc_chassis_info *chassis_info, void *led_info, unsigned long len); int pdc_chassis_disp(unsigned long disp); int pdc_chassis_warn(unsigned long *warn); diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index ab80e5c6f651..6d471c00c71a 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -232,6 +232,26 @@ int pdc_add_valid(unsigned long address) } EXPORT_SYMBOL(pdc_add_valid); +/** + * pdc_instr - Get instruction that invokes PDCE_CHECK in HPMC handler. + * @instr: Pointer to variable which will get instruction opcode. + * + * The return value is PDC_OK (0) in case call succeeded. + */ +int __init pdc_instr(unsigned int *instr) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_INSTR, 0UL, __pa(pdc_result)); + convert_to_wide(pdc_result); + *instr = pdc_result[0]; + spin_unlock_irqrestore(&pdc_lock, flags); + + return retval; +} + /** * pdc_chassis_info - Return chassis information. * @result: The return buffer. -- cgit From 8d771b143fe2e3941fc8a32926d21410004578c0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:28:11 +0200 Subject: parisc: Add PDCE_CHECK instruction to HPMC handler According to the programming note at page 1-31 of the PA 1.1 Firmware Architecture document, one should use the PDC_INSTR firmware function to get the instruction that invokes a PDCE_CHECK in the HPMC handler. This patch follows this note and sets the instruction which has been a nop up until now. Testing on a C3000 and C8000 showed that this firmware call isn't implemented on those machines, so maybe it's only needed on older ones. Signed-off-by: Helge Deller --- arch/parisc/kernel/traps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 991654c88eec..230333157fe3 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -817,7 +817,7 @@ void __init initialize_ivt(const void *iva) u32 check = 0; u32 *ivap; u32 *hpmcp; - u32 length; + u32 length, instr; if (strcmp((const char *)iva, "cows can fly")) panic("IVT invalid"); @@ -827,6 +827,14 @@ void __init initialize_ivt(const void *iva) for (i = 0; i < 8; i++) *ivap++ = 0; + /* + * Use PDC_INSTR firmware function to get instruction that invokes + * PDCE_CHECK in HPMC handler. See programming note at page 1-31 of + * the PA 1.1 Firmware Architecture document. + */ + if (pdc_instr(&instr) == PDC_OK) + ivap[0] = instr; + /* Compute Checksum for HPMC handler */ length = os_hpmc_size; ivap[7] = length; -- cgit From ea6976483fb0ced259fbaa9e4f68a2cdcee7e312 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Sep 2017 17:55:24 +0200 Subject: parisc: Check if initrd was loaded into broken RAM While scanning the PDT for reported broken memory modules, warn if the initrd was coincidentally loaded into bad memory. Signed-off-by: Helge Deller --- arch/parisc/kernel/pdt.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index 05730a83895c..00aed082969b 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -216,8 +217,16 @@ void __init pdc_pdt_init(void) } for (i = 0; i < pdt_status.pdt_entries; i++) { + unsigned long addr; + report_mem_err(pdt_entry[i]); + addr = pdt_entry[i] & PDT_ADDR_PHYS_MASK; + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && + addr >= initrd_start && addr < initrd_end) + pr_crit("CRITICAL: initrd possibly broken " + "due to bad memory!\n"); + /* mark memory page bad */ memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); } -- cgit From a7e6601f70a53957b1d01c321319f0237bba5202 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 21 Sep 2017 21:22:27 +0200 Subject: parisc: Move init_per_cpu() into init section Signed-off-by: Helge Deller --- arch/parisc/include/asm/smp.h | 1 + arch/parisc/kernel/processor.c | 2 +- arch/parisc/kernel/setup.c | 2 +- arch/parisc/kernel/smp.c | 3 +-- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h index a5dc9066c6d8..ad9c9c3b4136 100644 --- a/arch/parisc/include/asm/smp.h +++ b/arch/parisc/include/asm/smp.h @@ -1,6 +1,7 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H +extern int init_per_cpu(int cpuid); #if defined(CONFIG_SMP) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index a778bd3c107c..e120d63c1b28 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -317,7 +317,7 @@ void __init collect_boot_cpu_data(void) * * o Enable CPU profiling hooks. */ -int init_per_cpu(int cpunum) +int __init init_per_cpu(int cpunum) { int ret; struct pdc_coproc_cfg coproc_cfg; diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index a31e91c1782b..f7d0c3b33d70 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -49,6 +49,7 @@ #include #include #include +#include static char __initdata command_line[COMMAND_LINE_SIZE]; @@ -116,7 +117,6 @@ void __init dma_ops_init(void) } #endif -extern int init_per_cpu(int cpuid); extern void collect_boot_cpu_data(void); void __init setup_arch(char **cmdline_p) diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 63365106ea19..30c28ab14540 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -255,12 +255,11 @@ void arch_send_call_function_single_ipi(int cpu) static void __init smp_cpu_init(int cpunum) { - extern int init_per_cpu(int); /* arch/parisc/kernel/processor.c */ extern void init_IRQ(void); /* arch/parisc/kernel/irq.c */ extern void start_cpu_itimer(void); /* arch/parisc/kernel/time.c */ /* Set modes and Enable floating point coprocessor */ - (void) init_per_cpu(cpunum); + init_per_cpu(cpunum); disable_sr_hashing(); -- cgit From 606f95e4255845155f62504a9e1f12665b1853c8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 21 Sep 2017 21:52:08 +0200 Subject: parisc: Add HWPOISON page fault handler code Commit 24587380f61d ("parisc: Add MADV_HWPOISON and MADV_SOFT_OFFLINE") added the necessary constants to handle hardware-poisoning. Those were needed to support the page deallocation feature from firmware. But I completely missed to add the relevant fault handler code. This now showed up when I ran the madvise07 testcase from the Linux Test Project, which failed with a kernel BUG at arch/parisc/mm/fault.c:320. With this patch the parisc kernel now behaves like other platforms and gives the same kernel syslog warnings when poisoning pages. Signed-off-by: Helge Deller --- arch/parisc/mm/fault.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 5b101f6a5607..e247edbca68e 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -261,7 +262,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, struct task_struct *tsk; struct mm_struct *mm; unsigned long acc_type; - int fault; + int fault = 0; unsigned int flags; if (faulthandler_disabled()) @@ -315,7 +316,8 @@ good_area: goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; - else if (fault & VM_FAULT_SIGBUS) + else if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) goto bad_area; BUG(); } @@ -352,8 +354,7 @@ bad_area: if (user_mode(regs)) { struct siginfo si; - - show_signal_msg(regs, code, address, tsk, vma); + unsigned int lsb = 0; switch (code) { case 15: /* Data TLB miss fault/Data page fault */ @@ -386,6 +387,30 @@ bad_area: si.si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR; break; } + +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %08lx\n", + tsk->comm, tsk->pid, address); + si.si_signo = SIGBUS; + si.si_code = BUS_MCEERR_AR; + } +#endif + + /* + * Either small page or large page may be poisoned. + * In other words, VM_FAULT_HWPOISON_LARGE and + * VM_FAULT_HWPOISON are mutually exclusive. + */ + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + else if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + else + show_signal_msg(regs, code, address, tsk, vma); + si.si_addr_lsb = lsb; + si.si_errno = 0; si.si_addr = (void __user *) address; force_sig_info(si.si_signo, &si, current); -- cgit From f9b941baa4b78dafcbc4790c0ed2fbbe2aafed06 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 31 Aug 2017 09:27:28 +0530 Subject: bnxt_re: Fix update of qplib_qp.mtu when modified The MTU value in the qplib_qp.mtu should be consistent with whatever mtu was set during INIT to RTR.The Next PSN and number of packets are calculated based on this member in the qplib_qp structure. Signed-off-by: Narender Reddy Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 03caf48af8e2..11bbf3e748b0 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1436,11 +1436,14 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); } else if (qp_attr->qp_state == IB_QPS_RTR) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; qp->qplib_qp.path_mtu = __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); + qp->qplib_qp.mtu = + ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); } if (qp_attr_mask & IB_QP_TIMEOUT) { -- cgit From 2b6376305dcb2cb79d0599cf3b9ce98aa860cb4f Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:29 +0530 Subject: bnxt_re: Stop issuing further cmds to FW once a cmd times out Once a cmd to FW times out(after 20s) it is reasonable to assume the FW or atleast the control path is dead. No point issuing further cmds to the FW as each subsequent cmd with another 20s timeout will cascade resulting in unnecessary traces and/or NMI Lockups. Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 4 ++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 391bb7006e8f..2bdb1562bd21 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -107,6 +107,9 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, return -EINVAL; } + if (test_bit(FIRMWARE_TIMED_OUT, &rcfw->flags)) + return -ETIMEDOUT; + /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ @@ -226,6 +229,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* timed out */ dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec", cookie, opcode, RCFW_CMD_WAIT_TIME_MS); + set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags); return rc; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 0ed312f17c8d..85b16da287f9 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -162,8 +162,9 @@ struct bnxt_qplib_rcfw { unsigned long *cmdq_bitmap; u32 bmap_size; unsigned long flags; -#define FIRMWARE_INITIALIZED_FLAG 1 +#define FIRMWARE_INITIALIZED_FLAG BIT(0) #define FIRMWARE_FIRST_FLAG BIT(31) +#define FIRMWARE_TIMED_OUT BIT(3) wait_queue_head_t waitq; int (*aeq_handler)(struct bnxt_qplib_rcfw *, struct creq_func_event *); -- cgit From 55311d055175cada8249d39436371afe790df699 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 31 Aug 2017 09:27:30 +0530 Subject: bnxt_re: Fix compare and swap atomic operands Driver must assign the user supplied compare/swap values in the wqe to successfully complete the atomic compare and swap operation. Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 11bbf3e748b0..0dbdbe1616ab 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1916,6 +1916,7 @@ static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP; + wqe->atomic.cmp_data = atomic_wr(wr)->compare_add; wqe->atomic.swap_data = atomic_wr(wr)->swap; break; case IB_WR_ATOMIC_FETCH_AND_ADD: -- cgit From 027c892924eac22f5ca8db4100bccde423be797d Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:31 +0530 Subject: bnxt_re: Free up devices in module_exit path Clean up all devices added to the bnxt_re_dev_list in the module_exit entry point. Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 82d1cbc27aee..00a3b743c156 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1375,6 +1375,22 @@ err_netdev: static void __exit bnxt_re_mod_exit(void) { + struct bnxt_re_dev *rdev; + LIST_HEAD(to_be_deleted); + + mutex_lock(&bnxt_re_dev_lock); + /* Free all adapter allocated resources */ + if (!list_empty(&bnxt_re_dev_list)) + list_splice_init(&bnxt_re_dev_list, &to_be_deleted); + mutex_unlock(&bnxt_re_dev_lock); + + list_for_each_entry(rdev, &to_be_deleted, list) { + dev_info(rdev_to_dev(rdev), "Unregistering Device"); + bnxt_re_dev_stop(rdev); + bnxt_re_ib_unreg(rdev, true); + bnxt_re_remove_one(rdev); + bnxt_re_dev_unreg(rdev); + } unregister_netdevice_notifier(&bnxt_re_netdev_notifier); if (bnxt_re_wq) destroy_workqueue(bnxt_re_wq); -- cgit From d5917307bb1caa9cb0a915951c57f4cdbacca443 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:32 +0530 Subject: bnxt_re: Fix race between the netdev register and unregister events Upon receipt of the NETDEV_REGISTER event from the netdev notifier chain, the IB stack registration is spawned off to a workqueue since that also requires an rtnl lock. There could be 2 kinds of races between the NETDEV_REGISTER and the NETDEV_UNREGISTER event handling. a)The NETDEV_UNREGISTER event is received in rapid succession after the NETDEV_REGISTER event even before the work queue got a chance to run. b)The NETDEV_UNREGISTER event is received while the workqueue that handles registration with the IB stack is still in progress. Handle both the races with a bit flag that is set just before the work item is queued and cleared in the workqueue after the event is handled just before the workqueue item is freed. While adding the new flag, it was noted that the flags are all used in *_bit() operations which expect a bit number and not a literal constant with a bit set. So change the numbers to be bit numbers. Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 12 +++++++----- drivers/infiniband/hw/bnxt_re/main.c | 8 ++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b3ad37fec578..a25f9d240880 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -93,11 +93,13 @@ struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; unsigned long flags; -#define BNXT_RE_FLAG_NETDEV_REGISTERED 0 -#define BNXT_RE_FLAG_IBDEV_REGISTERED 1 -#define BNXT_RE_FLAG_GOT_MSIX 2 -#define BNXT_RE_FLAG_RCFW_CHANNEL_EN 8 -#define BNXT_RE_FLAG_QOS_WORK_REG 16 +#define BNXT_RE_FLAG_NETDEV_REGISTERED 0 +#define BNXT_RE_FLAG_IBDEV_REGISTERED 1 +#define BNXT_RE_FLAG_GOT_MSIX 2 +#define BNXT_RE_FLAG_HAVE_L2_REF 3 +#define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 +#define BNXT_RE_FLAG_QOS_WORK_REG 5 +#define BNXT_RE_FLAG_TASK_IN_PROG 6 struct net_device *netdev; unsigned int version, major, minor; struct bnxt_en_dev *en_dev; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 00a3b743c156..29c3d7e254af 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1259,6 +1259,8 @@ static void bnxt_re_task(struct work_struct *work) default: break; } + smp_mb__before_atomic(); + clear_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags); kfree(re_work); } @@ -1317,6 +1319,11 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, break; case NETDEV_UNREGISTER: + /* netdev notifier will call NETDEV_UNREGISTER again later since + * we are still holding the reference to the netdev + */ + if (test_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags)) + goto exit; bnxt_re_ib_unreg(rdev, false); bnxt_re_remove_one(rdev); bnxt_re_dev_unreg(rdev); @@ -1335,6 +1342,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, re_work->vlan_dev = (real_dev == netdev ? NULL : netdev); INIT_WORK(&re_work->work, bnxt_re_task); + set_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags); queue_work(bnxt_re_wq, &re_work->work); } } -- cgit From 74828b128115033ff25d4140d732a05a36eaeaf0 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:33 +0530 Subject: bnxt_re: Remove RTNL lock dependency in bnxt_re_query_port When there is a NETDEV_UNREGISTER event, bnxt_re driver calls ib_unregister_device() (RTNL lock held). ib_unregister_device attempts to flush a worker queue scheduled by ib_core and that queue might have a pending ib_query_port(). ib_query_port in turn calls bnxt_re_query_port(), which while querying the link speed using ib_get_eth_speed(), tries to acquire the rtnl_lock() which was already held by NETDEV_UNREGISTER. Fixing the issue by removing the link speed query from bnxt_re_query_port() Now the speed is queried post a successful ib_register_device or whenever there is a NETDEV_CHANGE event. Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 11 +++-------- drivers/infiniband/hw/bnxt_re/main.c | 4 ++++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index a25f9d240880..ecbac91b2e14 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -110,6 +110,8 @@ struct bnxt_re_dev { struct delayed_work worker; u8 cur_prio_map; + u8 active_speed; + u8 active_width; /* FP Notification Queue (CQ & SRQ) */ struct tasklet_struct nq_task; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 0dbdbe1616ab..7430ef07a0e1 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -259,14 +259,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, port_attr->sm_sl = 0; port_attr->subnet_timeout = 0; port_attr->init_type_reply = 0; - /* call the underlying netdev's ethtool hooks to query speed settings - * for which we acquire rtnl_lock _only_ if it's registered with - * IB stack to avoid race in the NETDEV_UNREG path - */ - if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) - if (ib_get_eth_speed(ibdev, port_num, &port_attr->active_speed, - &port_attr->active_width)) - return -EINVAL; + port_attr->active_speed = rdev->active_speed; + port_attr->active_width = rdev->active_width; + return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 29c3d7e254af..e7450ea92aa9 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1161,6 +1161,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } } set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); + ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, + &rdev->active_width); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE); @@ -1255,6 +1257,8 @@ static void bnxt_re_task(struct work_struct *work) else if (netif_carrier_ok(rdev->netdev)) bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); + ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, + &rdev->active_width); break; default: break; -- cgit From 1993519be8bc86342687c61d8d11c3ade62b3b84 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 31 Aug 2017 09:27:34 +0530 Subject: bnxt_re: Fix memory leak in FRMR path This patch fixes a memory leak issue when alloc_mr is used. mr->pages and mr->npages are used only in alloc_mr path. mr->pages is allocated when alloc_mr is called or in the case of FRMR, while creating the MR. mr->npages is updated only when the MR created is used i.e. after invoking map_mr_sg verb, before data transfer. In the dereg_mr path, if mr->npages is 0, driver ends up not freeing the memory created. Removing the npages check from the dereg_mr path for kernel consumers. Signed-off-by: Selvin Xavier Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 7430ef07a0e1..d7be4e4227ce 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3066,7 +3066,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr) return rc; } - if (mr->npages && mr->pages) { + if (mr->pages) { rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, &mr->qplib_frpl); kfree(mr->pages); -- cgit From 89aaca54ba60e91f02c1c168fbef5d71f71a6d43 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:35 +0530 Subject: bnxt_re: Don't issue cmd to delete GID for QP1 GID entry before the QP is destroyed FW needs the 0th GID Entry in the Table to be preserved before it's corresponding QP1 is deleted, else it will fail the cmd. Check for the same and return to prevent error msg being logged for cmd failure. Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index d7be4e4227ce..0d89621d9fe8 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -314,6 +314,7 @@ int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, struct bnxt_re_gid_ctx *ctx, **ctx_tbl; struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + struct bnxt_qplib_gid *gid_to_del; /* Delete the entry from the hardware */ ctx = *context; @@ -323,11 +324,25 @@ int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, if (sgid_tbl && sgid_tbl->active) { if (ctx->idx >= sgid_tbl->max) return -EINVAL; + gid_to_del = &sgid_tbl->tbl[ctx->idx]; + /* DEL_GID is called in WQ context(netdevice_event_work_handler) + * or via the ib_unregister_device path. In the former case QP1 + * may not be destroyed yet, in which case just return as FW + * needs that entry to be present and will fail it's deletion. + * We could get invoked again after QP1 is destroyed OR get an + * ADD_GID call with a different GID value for the same index + * where we issue MODIFY_GID cmd to update the GID entry -- TBD + */ + if (ctx->idx == 0 && + rdma_link_local_addr((struct in6_addr *)gid_to_del) && + ctx->refcnt == 1 && rdev->qp1_sqp) { + dev_dbg(rdev_to_dev(rdev), + "Trying to delete GID0 while QP1 is alive\n"); + return -EFAULT; + } ctx->refcnt--; if (!ctx->refcnt) { - rc = bnxt_qplib_del_sgid(sgid_tbl, - &sgid_tbl->tbl[ctx->idx], - true); + rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del, true); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to remove GID: %#x", rc); @@ -811,6 +826,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) kfree(rdev->sqp_ah); kfree(rdev->qp1_sqp); + rdev->qp1_sqp = NULL; + rdev->sqp_ah = NULL; } if (!IS_ERR_OR_NULL(qp->rumem)) -- cgit From 19fe43a54fb67b6cc8857e65c78e1dc8aa2e97a3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 6 Jul 2017 10:56:21 +0200 Subject: apparmor: Fix shadowed local variable in unpack_trans_table() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit with W=2: security/apparmor/policy_unpack.c: In function ‘unpack_trans_table’: security/apparmor/policy_unpack.c:469: warning: declaration of ‘pos’ shadows a previous local security/apparmor/policy_unpack.c:451: warning: shadowed declaration is here Rename the old "pos" to "saved_pos" to fix this. Fixes: 5379a3312024a8be ("apparmor: support v7 transition format compatible with label_parse") Signed-off-by: Geert Uytterhoeven Reviewed-by: Serge Hallyn Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index c600f4dd1783..2d5a1a007b06 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -448,7 +448,7 @@ fail: */ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) { - void *pos = e->pos; + void *saved_pos = e->pos; /* exec table is optional */ if (unpack_nameX(e, AA_STRUCT, "xtable")) { @@ -511,7 +511,7 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) fail: aa_free_domain_entries(&profile->file.trans); - e->pos = pos; + e->pos = saved_pos; return 0; } -- cgit From 86aea56f14929ff1c05eca1776e9068e907429d5 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Sat, 8 Jul 2017 20:50:21 +0100 Subject: apparmor: Fix logical error in verify_header() verify_header() is currently checking whether interface version is less than 5 *and* greater than 7, which always evaluates to false. Instead it should check whether it is less than 5 *or* greater than 7. Signed-off-by: Christos Gkekas Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 2d5a1a007b06..bda0dce3b582 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -832,7 +832,7 @@ static int verify_header(struct aa_ext *e, int required, const char **ns) * if not specified use previous version * Mask off everything that is not kernel abi version */ - if (VERSION_LT(e->version, v5) && VERSION_GT(e->version, v7)) { + if (VERSION_LT(e->version, v5) || VERSION_GT(e->version, v7)) { audit_iface(NULL, NULL, NULL, "unsupported interface version", e, error); return error; -- cgit From 5d314a81eca29b01939930c1c596dfa44937e970 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Jul 2017 10:39:20 +0300 Subject: apparmor: Fix an error code in aafs_create() We accidentally forgot to set the error code on this path. It means we return NULL instead of an error pointer. I looked through a bunch of callers and I don't think it really causes a big issue, but the documentation says we're supposed to return error pointers here. Signed-off-by: Dan Carpenter Acked-by: Serge Hallyn Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 853c2ec8e0c9..2caeb748070c 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -248,8 +248,10 @@ static struct dentry *aafs_create(const char *name, umode_t mode, inode_lock(dir); dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); goto fail_lock; + } if (d_really_is_positive(dentry)) { error = -EEXIST; -- cgit From c5561700c9cb951ec3a33a0914c840423b09d7c9 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 31 Jul 2017 23:44:37 -0700 Subject: apparmor: Redundant condition: prev_ns. in [label.c:1498] Reported-by: David Binderman Signed-off-by: John Johansen --- security/apparmor/label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index e052eaba1cf6..e324f4df3e34 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1495,7 +1495,7 @@ static int aa_profile_snxprint(char *str, size_t size, struct aa_ns *view, view = profiles_ns(profile); if (view != profile->ns && - (!prev_ns || (prev_ns && *prev_ns != profile->ns))) { + (!prev_ns || (*prev_ns != profile->ns))) { if (prev_ns) *prev_ns = profile->ns; ns_name = aa_ns_name(view, profile->ns, -- cgit From cd1dbf76b23d5ab2cba5e657fe20b1e236a408cc Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 22:56:22 -0700 Subject: apparmor: add the ability to mediate signals Add signal mediation where the signal can be mediated based on the signal, direction, or the label or the peer/target. The signal perms are verified on a cross check to ensure policy consistency in the case of incremental policy load/replacement. The optimization of skipping the cross check when policy is guaranteed to be consistent (single compile unit) remains to be done. policy rules have the form of SIGNAL_RULE = [ QUALIFIERS ] 'signal' [ SIGNAL ACCESS PERMISSIONS ] [ SIGNAL SET ] [ SIGNAL PEER ] SIGNAL ACCESS PERMISSIONS = SIGNAL ACCESS | SIGNAL ACCESS LIST SIGNAL ACCESS LIST = '(' Comma or space separated list of SIGNAL ACCESS ')' SIGNAL ACCESS = ( 'r' | 'w' | 'rw' | 'read' | 'write' | 'send' | 'receive' ) SIGNAL SET = 'set' '=' '(' SIGNAL LIST ')' SIGNAL LIST = Comma or space separated list of SIGNALS SIGNALS = ( 'hup' | 'int' | 'quit' | 'ill' | 'trap' | 'abrt' | 'bus' | 'fpe' | 'kill' | 'usr1' | 'segv' | 'usr2' | 'pipe' | 'alrm' | 'term' | 'stkflt' | 'chld' | 'cont' | 'stop' | 'stp' | 'ttin' | 'ttou' | 'urg' | 'xcpu' | 'xfsz' | 'vtalrm' | 'prof' | 'winch' | 'io' | 'pwr' | 'sys' | 'emt' | 'exists' | 'rtmin+0' ... 'rtmin+32' ) SIGNAL PEER = 'peer' '=' AARE eg. signal, # allow all signals signal send set=(hup, kill) peer=foo, Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/apparmorfs.c | 7 +++ security/apparmor/include/apparmor.h | 1 + security/apparmor/include/audit.h | 2 + security/apparmor/include/ipc.h | 6 +++ security/apparmor/include/sig_names.h | 95 +++++++++++++++++++++++++++++++++ security/apparmor/ipc.c | 99 +++++++++++++++++++++++++++++++++++ security/apparmor/lsm.c | 21 ++++++++ 7 files changed, 231 insertions(+) create mode 100644 security/apparmor/include/sig_names.h diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2caeb748070c..a5f9e1aa51f7 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -32,6 +32,7 @@ #include "include/audit.h" #include "include/context.h" #include "include/crypto.h" +#include "include/ipc.h" #include "include/policy_ns.h" #include "include/label.h" #include "include/policy.h" @@ -2129,6 +2130,11 @@ static struct aa_sfs_entry aa_sfs_entry_ptrace[] = { { } }; +static struct aa_sfs_entry aa_sfs_entry_signal[] = { + AA_SFS_FILE_STRING("mask", AA_SFS_SIG_MASK), + { } +}; + static struct aa_sfs_entry aa_sfs_entry_domain[] = { AA_SFS_FILE_BOOLEAN("change_hat", 1), AA_SFS_FILE_BOOLEAN("change_hatv", 1), @@ -2179,6 +2185,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("rlimit", aa_sfs_entry_rlimit), AA_SFS_DIR("caps", aa_sfs_entry_caps), AA_SFS_DIR("ptrace", aa_sfs_entry_ptrace), + AA_SFS_DIR("signal", aa_sfs_entry_signal), AA_SFS_DIR("query", aa_sfs_entry_query), { } }; diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index aaf893f4e4f5..962a20a75e01 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -28,6 +28,7 @@ #define AA_CLASS_RLIMITS 5 #define AA_CLASS_DOMAIN 6 #define AA_CLASS_PTRACE 9 +#define AA_CLASS_SIGNAL 10 #define AA_CLASS_LABEL 16 #define AA_CLASS_LAST AA_CLASS_LABEL diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index c68839a44351..d9a156ae11b9 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -86,6 +86,7 @@ enum audit_type { #define OP_SHUTDOWN "socket_shutdown" #define OP_PTRACE "ptrace" +#define OP_SIGNAL "signal" #define OP_EXEC "exec" @@ -126,6 +127,7 @@ struct apparmor_audit_data { long pos; const char *ns; } iface; + int signal; struct { int rlim; unsigned long max; diff --git a/security/apparmor/include/ipc.h b/security/apparmor/include/ipc.h index 656fdb81c8a0..5ffc218d1e74 100644 --- a/security/apparmor/include/ipc.h +++ b/security/apparmor/include/ipc.h @@ -27,8 +27,14 @@ struct aa_profile; #define AA_PTRACE_PERM_MASK (AA_PTRACE_READ | AA_PTRACE_TRACE | \ AA_MAY_BE_READ | AA_MAY_BE_TRACED) +#define AA_SIGNAL_PERM_MASK (MAY_READ | MAY_WRITE) + +#define AA_SFS_SIG_MASK "hup int quit ill trap abrt bus fpe kill usr1 " \ + "segv usr2 pipe alrm term stkflt chld cont stop stp ttin ttou urg " \ + "xcpu xfsz vtalrm prof winch io pwr sys emt lost" int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, u32 request); +int aa_may_signal(struct aa_label *sender, struct aa_label *target, int sig); #endif /* __AA_IPC_H */ diff --git a/security/apparmor/include/sig_names.h b/security/apparmor/include/sig_names.h new file mode 100644 index 000000000000..0d4395f231ca --- /dev/null +++ b/security/apparmor/include/sig_names.h @@ -0,0 +1,95 @@ +#include + +#define SIGUNKNOWN 0 +#define MAXMAPPED_SIG 35 +/* provide a mapping of arch signal to internal signal # for mediation + * those that are always an alias SIGCLD for SIGCLHD and SIGPOLL for SIGIO + * map to the same entry those that may/or may not get a separate entry + */ +static const int sig_map[MAXMAPPED_SIG] = { + [0] = MAXMAPPED_SIG, /* existence test */ + [SIGHUP] = 1, + [SIGINT] = 2, + [SIGQUIT] = 3, + [SIGILL] = 4, + [SIGTRAP] = 5, /* -, 5, - */ + [SIGABRT] = 6, /* SIGIOT: -, 6, - */ + [SIGBUS] = 7, /* 10, 7, 10 */ + [SIGFPE] = 8, + [SIGKILL] = 9, + [SIGUSR1] = 10, /* 30, 10, 16 */ + [SIGSEGV] = 11, + [SIGUSR2] = 12, /* 31, 12, 17 */ + [SIGPIPE] = 13, + [SIGALRM] = 14, + [SIGTERM] = 15, + [SIGSTKFLT] = 16, /* -, 16, - */ + [SIGCHLD] = 17, /* 20, 17, 18. SIGCHLD -, -, 18 */ + [SIGCONT] = 18, /* 19, 18, 25 */ + [SIGSTOP] = 19, /* 17, 19, 23 */ + [SIGTSTP] = 20, /* 18, 20, 24 */ + [SIGTTIN] = 21, /* 21, 21, 26 */ + [SIGTTOU] = 22, /* 22, 22, 27 */ + [SIGURG] = 23, /* 16, 23, 21 */ + [SIGXCPU] = 24, /* 24, 24, 30 */ + [SIGXFSZ] = 25, /* 25, 25, 31 */ + [SIGVTALRM] = 26, /* 26, 26, 28 */ + [SIGPROF] = 27, /* 27, 27, 29 */ + [SIGWINCH] = 28, /* 28, 28, 20 */ + [SIGIO] = 29, /* SIGPOLL: 23, 29, 22 */ + [SIGPWR] = 30, /* 29, 30, 19. SIGINFO 29, -, - */ +#ifdef SIGSYS + [SIGSYS] = 31, /* 12, 31, 12. often SIG LOST/UNUSED */ +#endif +#ifdef SIGEMT + [SIGEMT] = 32, /* 7, - , 7 */ +#endif +#if defined(SIGLOST) && SIGPWR != SIGLOST /* sparc */ + [SIGLOST] = 33, /* unused on Linux */ +#endif +#if defined(SIGLOST) && defined(SIGSYS) && SIGLOST != SIGSYS + [SIGUNUSED] = 34, /* -, 31, - */ +#endif +}; + +/* this table is ordered post sig_map[sig] mapping */ +static const char *const sig_names[MAXMAPPED_SIG + 1] = { + "unknown", + "hup", + "int", + "quit", + "ill", + "trap", + "abrt", + "bus", + "fpe", + "kill", + "usr1", + "segv", + "usr2", + "pipe", + "alrm", + "term", + "stkflt", + "chld", + "cont", + "stop", + "stp", + "ttin", + "ttou", + "urg", + "xcpu", + "xfsz", + "vtalrm", + "prof", + "winch", + "io", + "pwr", + "sys", + "emt", + "lost", + "unused", + + "exists", /* always last existence test mapped to MAXMAPPED_SIG */ +}; + diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 11e66b5bbc42..66fb9ede9447 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -20,6 +20,7 @@ #include "include/context.h" #include "include/policy.h" #include "include/ipc.h" +#include "include/sig_names.h" /** * audit_ptrace_mask - convert mask to permission string @@ -121,3 +122,101 @@ int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, } +static inline int map_signal_num(int sig) +{ + if (sig > SIGRTMAX) + return SIGUNKNOWN; + else if (sig >= SIGRTMIN) + return sig - SIGRTMIN + 128; /* rt sigs mapped to 128 */ + else if (sig <= MAXMAPPED_SIG) + return sig_map[sig]; + return SIGUNKNOWN; +} + +/** + * audit_file_mask - convert mask to permission string + * @buffer: buffer to write string to (NOT NULL) + * @mask: permission mask to convert + */ +static void audit_signal_mask(struct audit_buffer *ab, u32 mask) +{ + if (mask & MAY_READ) + audit_log_string(ab, "receive"); + if (mask & MAY_WRITE) + audit_log_string(ab, "send"); +} + +/** + * audit_cb - call back for signal specific audit fields + * @ab: audit_buffer (NOT NULL) + * @va: audit struct to audit values of (NOT NULL) + */ +static void audit_signal_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + if (aad(sa)->request & AA_SIGNAL_PERM_MASK) { + audit_log_format(ab, " requested_mask="); + audit_signal_mask(ab, aad(sa)->request); + if (aad(sa)->denied & AA_SIGNAL_PERM_MASK) { + audit_log_format(ab, " denied_mask="); + audit_signal_mask(ab, aad(sa)->denied); + } + } + if (aad(sa)->signal <= MAXMAPPED_SIG) + audit_log_format(ab, " signal=%s", sig_names[aad(sa)->signal]); + else + audit_log_format(ab, " signal=rtmin+%d", + aad(sa)->signal - 128); + audit_log_format(ab, " peer="); + aa_label_xaudit(ab, labels_ns(aad(sa)->label), aad(sa)->peer, + FLAGS_NONE, GFP_ATOMIC); +} + +/* TODO: update to handle compound name&name2, conditionals */ +static void profile_match_signal(struct aa_profile *profile, const char *label, + int signal, struct aa_perms *perms) +{ + unsigned int state; + + /* TODO: secondary cache check */ + state = aa_dfa_next(profile->policy.dfa, + profile->policy.start[AA_CLASS_SIGNAL], + signal); + state = aa_dfa_match(profile->policy.dfa, state, label); + aa_compute_perms(profile->policy.dfa, state, perms); +} + +static int profile_signal_perm(struct aa_profile *profile, + struct aa_profile *peer, u32 request, + struct common_audit_data *sa) +{ + struct aa_perms perms; + + if (profile_unconfined(profile) || + !PROFILE_MEDIATES(profile, AA_CLASS_SIGNAL)) + return 0; + + aad(sa)->peer = &peer->label; + profile_match_signal(profile, peer->base.hname, aad(sa)->signal, + &perms); + aa_apply_modes_to_perms(profile, &perms); + return aa_check_perms(profile, &perms, request, sa, audit_signal_cb); +} + +static int aa_signal_cross_perm(struct aa_profile *sender, + struct aa_profile *target, + struct common_audit_data *sa) +{ + return xcheck(profile_signal_perm(sender, target, MAY_WRITE, sa), + profile_signal_perm(target, sender, MAY_READ, sa)); +} + +int aa_may_signal(struct aa_label *sender, struct aa_label *target, int sig) +{ + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_SIGNAL); + + aad(&sa)->signal = map_signal_num(sig); + return xcheck_labels_profiles(sender, target, aa_signal_cross_perm, + &sa); +} diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 867bcd154c7e..af22f3dfbcce 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -656,6 +656,26 @@ static int apparmor_task_setrlimit(struct task_struct *task, return error; } +static int apparmor_task_kill(struct task_struct *target, struct siginfo *info, + int sig, u32 secid) +{ + struct aa_label *cl, *tl; + int error; + + if (secid) + /* TODO: after secid to label mapping is done. + * Dealing with USB IO specific behavior + */ + return 0; + cl = __begin_current_label_crit_section(); + tl = aa_get_task_label(target); + error = aa_may_signal(cl, tl, sig); + aa_put_label(tl); + __end_current_label_crit_section(cl); + + return error; +} + static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -697,6 +717,7 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(bprm_secureexec, apparmor_bprm_secureexec), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), + LSM_HOOK_INIT(task_kill, apparmor_task_kill), }; /* -- cgit From 2ea3ffb7782a84da33a8382f13ebd016da50079b Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:04:47 -0700 Subject: apparmor: add mount mediation Add basic mount mediation. That allows controlling based on basic mount parameters. It does not include special mount parameters for apparmor, super block labeling, or any triggers for apparmor namespace parameter modifications on pivot root. default userspace policy rules have the form of MOUNT RULE = ( MOUNT | REMOUNT | UMOUNT ) MOUNT = [ QUALIFIERS ] 'mount' [ MOUNT CONDITIONS ] [ SOURCE FILEGLOB ] [ '->' MOUNTPOINT FILEGLOB ] REMOUNT = [ QUALIFIERS ] 'remount' [ MOUNT CONDITIONS ] MOUNTPOINT FILEGLOB UMOUNT = [ QUALIFIERS ] 'umount' [ MOUNT CONDITIONS ] MOUNTPOINT FILEGLOB MOUNT CONDITIONS = [ ( 'fstype' | 'vfstype' ) ( '=' | 'in' ) MOUNT FSTYPE EXPRESSION ] [ 'options' ( '=' | 'in' ) MOUNT FLAGS EXPRESSION ] MOUNT FSTYPE EXPRESSION = ( MOUNT FSTYPE LIST | MOUNT EXPRESSION ) MOUNT FSTYPE LIST = Comma separated list of valid filesystem and virtual filesystem types (eg ext4, debugfs, etc) MOUNT FLAGS EXPRESSION = ( MOUNT FLAGS LIST | MOUNT EXPRESSION ) MOUNT FLAGS LIST = Comma separated list of MOUNT FLAGS. MOUNT FLAGS = ( 'ro' | 'rw' | 'nosuid' | 'suid' | 'nodev' | 'dev' | 'noexec' | 'exec' | 'sync' | 'async' | 'remount' | 'mand' | 'nomand' | 'dirsync' | 'noatime' | 'atime' | 'nodiratime' | 'diratime' | 'bind' | 'rbind' | 'move' | 'verbose' | 'silent' | 'loud' | 'acl' | 'noacl' | 'unbindable' | 'runbindable' | 'private' | 'rprivate' | 'slave' | 'rslave' | 'shared' | 'rshared' | 'relatime' | 'norelatime' | 'iversion' | 'noiversion' | 'strictatime' | 'nouser' | 'user' ) MOUNT EXPRESSION = ( ALPHANUMERIC | AARE ) ... PIVOT ROOT RULE = [ QUALIFIERS ] pivot_root [ oldroot=OLD PUT FILEGLOB ] [ NEW ROOT FILEGLOB ] SOURCE FILEGLOB = FILEGLOB MOUNTPOINT FILEGLOB = FILEGLOB eg. mount, mount /dev/foo, mount options=ro /dev/foo -> /mnt/, mount options in (ro,atime) /dev/foo -> /mnt/, mount options=ro options=atime, Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/Makefile | 2 +- security/apparmor/apparmorfs.c | 8 +- security/apparmor/domain.c | 4 +- security/apparmor/include/apparmor.h | 1 + security/apparmor/include/audit.h | 11 + security/apparmor/include/domain.h | 5 + security/apparmor/include/mount.h | 54 +++ security/apparmor/lsm.c | 64 ++++ security/apparmor/mount.c | 696 +++++++++++++++++++++++++++++++++++ 9 files changed, 841 insertions(+), 4 deletions(-) create mode 100644 security/apparmor/include/mount.h create mode 100644 security/apparmor/mount.c diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index a16b195274de..81a34426d024 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o secid.o file.o policy_ns.o label.o + resource.o secid.o file.o policy_ns.o label.o mount.o apparmor-$(CONFIG_SECURITY_APPARMOR_HASH) += crypto.o clean-files := capability_names.h rlim_names.h diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index a5f9e1aa51f7..8fa6c898c44b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2159,9 +2159,14 @@ static struct aa_sfs_entry aa_sfs_entry_policy[] = { { } }; +static struct aa_sfs_entry aa_sfs_entry_mount[] = { + AA_SFS_FILE_STRING("mask", "mount umount pivot_root"), + { } +}; + static struct aa_sfs_entry aa_sfs_entry_ns[] = { AA_SFS_FILE_BOOLEAN("profile", 1), - AA_SFS_FILE_BOOLEAN("pivot_root", 1), + AA_SFS_FILE_BOOLEAN("pivot_root", 0), { } }; @@ -2180,6 +2185,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("policy", aa_sfs_entry_policy), AA_SFS_DIR("domain", aa_sfs_entry_domain), AA_SFS_DIR("file", aa_sfs_entry_file), + AA_SFS_DIR("mount", aa_sfs_entry_mount), AA_SFS_DIR("namespaces", aa_sfs_entry_ns), AA_SFS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), AA_SFS_DIR("rlimit", aa_sfs_entry_rlimit), diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index d0594446ae3f..ffc8c75a6785 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -374,8 +374,8 @@ static const char *next_name(int xtype, const char *name) * * Returns: refcounted label, or NULL on failure (MAYBE NULL) */ -static struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, - const char **name) +struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, + const char **name) { struct aa_label *label = NULL; u32 xtype = xindex & AA_X_TYPE_MASK; diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 962a20a75e01..829082c35faa 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -27,6 +27,7 @@ #define AA_CLASS_NET 4 #define AA_CLASS_RLIMITS 5 #define AA_CLASS_DOMAIN 6 +#define AA_CLASS_MOUNT 7 #define AA_CLASS_PTRACE 9 #define AA_CLASS_SIGNAL 10 #define AA_CLASS_LABEL 16 diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index d9a156ae11b9..c3fe1c5ef3bc 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -71,6 +71,10 @@ enum audit_type { #define OP_FMPROT "file_mprotect" #define OP_INHERIT "file_inherit" +#define OP_PIVOTROOT "pivotroot" +#define OP_MOUNT "mount" +#define OP_UMOUNT "umount" + #define OP_CREATE "create" #define OP_POST_CREATE "post_create" #define OP_BIND "bind" @@ -132,6 +136,13 @@ struct apparmor_audit_data { int rlim; unsigned long max; } rlim; + struct { + const char *src_name; + const char *type; + const char *trans; + const char *data; + unsigned long flags; + } mnt; }; }; diff --git a/security/apparmor/include/domain.h b/security/apparmor/include/domain.h index bab5810b6e9a..db27403346c5 100644 --- a/security/apparmor/include/domain.h +++ b/security/apparmor/include/domain.h @@ -15,6 +15,8 @@ #include #include +#include "label.h" + #ifndef __AA_DOMAIN_H #define __AA_DOMAIN_H @@ -29,6 +31,9 @@ struct aa_domain { #define AA_CHANGE_ONEXEC 4 #define AA_CHANGE_STACK 8 +struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, + const char **name); + int apparmor_bprm_set_creds(struct linux_binprm *bprm); int apparmor_bprm_secureexec(struct linux_binprm *bprm); diff --git a/security/apparmor/include/mount.h b/security/apparmor/include/mount.h new file mode 100644 index 000000000000..25d6067fa6ef --- /dev/null +++ b/security/apparmor/include/mount.h @@ -0,0 +1,54 @@ +/* + * AppArmor security module + * + * This file contains AppArmor file mediation function definitions. + * + * Copyright 2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#ifndef __AA_MOUNT_H +#define __AA_MOUNT_H + +#include +#include + +#include "domain.h" +#include "policy.h" + +/* mount perms */ +#define AA_MAY_PIVOTROOT 0x01 +#define AA_MAY_MOUNT 0x02 +#define AA_MAY_UMOUNT 0x04 +#define AA_AUDIT_DATA 0x40 +#define AA_MNT_CONT_MATCH 0x40 + +#define AA_MS_IGNORE_MASK (MS_KERNMOUNT | MS_NOSEC | MS_ACTIVE | MS_BORN) + +int aa_remount(struct aa_label *label, const struct path *path, + unsigned long flags, void *data); + +int aa_bind_mount(struct aa_label *label, const struct path *path, + const char *old_name, unsigned long flags); + + +int aa_mount_change_type(struct aa_label *label, const struct path *path, + unsigned long flags); + +int aa_move_mount(struct aa_label *label, const struct path *path, + const char *old_name); + +int aa_new_mount(struct aa_label *label, const char *dev_name, + const struct path *path, const char *type, unsigned long flags, + void *data); + +int aa_umount(struct aa_label *label, struct vfsmount *mnt, int flags); + +int aa_pivotroot(struct aa_label *label, const struct path *old_path, + const struct path *new_path); + +#endif /* __AA_MOUNT_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index af22f3dfbcce..4ad0b3a45142 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -38,6 +38,7 @@ #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" +#include "include/mount.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; @@ -511,6 +512,65 @@ static int apparmor_file_mprotect(struct vm_area_struct *vma, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0); } +static int apparmor_sb_mount(const char *dev_name, const struct path *path, + const char *type, unsigned long flags, void *data) +{ + struct aa_label *label; + int error = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + flags &= ~AA_MS_IGNORE_MASK; + + label = __begin_current_label_crit_section(); + if (!unconfined(label)) { + if (flags & MS_REMOUNT) + error = aa_remount(label, path, flags, data); + else if (flags & MS_BIND) + error = aa_bind_mount(label, path, dev_name, flags); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | + MS_UNBINDABLE)) + error = aa_mount_change_type(label, path, flags); + else if (flags & MS_MOVE) + error = aa_move_mount(label, path, dev_name); + else + error = aa_new_mount(label, dev_name, path, type, + flags, data); + } + __end_current_label_crit_section(label); + + return error; +} + +static int apparmor_sb_umount(struct vfsmount *mnt, int flags) +{ + struct aa_label *label; + int error = 0; + + label = __begin_current_label_crit_section(); + if (!unconfined(label)) + error = aa_umount(label, mnt, flags); + __end_current_label_crit_section(label); + + return error; +} + +static int apparmor_sb_pivotroot(const struct path *old_path, + const struct path *new_path) +{ + struct aa_label *label; + int error = 0; + + label = aa_get_current_label(); + if (!unconfined(label)) + error = aa_pivotroot(label, old_path, new_path); + aa_put_label(label); + + return error; +} + static int apparmor_getprocattr(struct task_struct *task, char *name, char **value) { @@ -682,6 +742,10 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), + LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), + LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), + LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), + LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c new file mode 100644 index 000000000000..82a64b58041d --- /dev/null +++ b/security/apparmor/mount.c @@ -0,0 +1,696 @@ +/* + * AppArmor security module + * + * This file contains AppArmor mediation of files + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include + +#include "include/apparmor.h" +#include "include/audit.h" +#include "include/context.h" +#include "include/domain.h" +#include "include/file.h" +#include "include/match.h" +#include "include/mount.h" +#include "include/path.h" +#include "include/policy.h" + + +static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags) +{ + if (flags & MS_RDONLY) + audit_log_format(ab, "ro"); + else + audit_log_format(ab, "rw"); + if (flags & MS_NOSUID) + audit_log_format(ab, ", nosuid"); + if (flags & MS_NODEV) + audit_log_format(ab, ", nodev"); + if (flags & MS_NOEXEC) + audit_log_format(ab, ", noexec"); + if (flags & MS_SYNCHRONOUS) + audit_log_format(ab, ", sync"); + if (flags & MS_REMOUNT) + audit_log_format(ab, ", remount"); + if (flags & MS_MANDLOCK) + audit_log_format(ab, ", mand"); + if (flags & MS_DIRSYNC) + audit_log_format(ab, ", dirsync"); + if (flags & MS_NOATIME) + audit_log_format(ab, ", noatime"); + if (flags & MS_NODIRATIME) + audit_log_format(ab, ", nodiratime"); + if (flags & MS_BIND) + audit_log_format(ab, flags & MS_REC ? ", rbind" : ", bind"); + if (flags & MS_MOVE) + audit_log_format(ab, ", move"); + if (flags & MS_SILENT) + audit_log_format(ab, ", silent"); + if (flags & MS_POSIXACL) + audit_log_format(ab, ", acl"); + if (flags & MS_UNBINDABLE) + audit_log_format(ab, flags & MS_REC ? ", runbindable" : + ", unbindable"); + if (flags & MS_PRIVATE) + audit_log_format(ab, flags & MS_REC ? ", rprivate" : + ", private"); + if (flags & MS_SLAVE) + audit_log_format(ab, flags & MS_REC ? ", rslave" : + ", slave"); + if (flags & MS_SHARED) + audit_log_format(ab, flags & MS_REC ? ", rshared" : + ", shared"); + if (flags & MS_RELATIME) + audit_log_format(ab, ", relatime"); + if (flags & MS_I_VERSION) + audit_log_format(ab, ", iversion"); + if (flags & MS_STRICTATIME) + audit_log_format(ab, ", strictatime"); + if (flags & MS_NOUSER) + audit_log_format(ab, ", nouser"); +} + +/** + * audit_cb - call back for mount specific audit fields + * @ab: audit_buffer (NOT NULL) + * @va: audit struct to audit values of (NOT NULL) + */ +static void audit_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + if (aad(sa)->mnt.type) { + audit_log_format(ab, " fstype="); + audit_log_untrustedstring(ab, aad(sa)->mnt.type); + } + if (aad(sa)->mnt.src_name) { + audit_log_format(ab, " srcname="); + audit_log_untrustedstring(ab, aad(sa)->mnt.src_name); + } + if (aad(sa)->mnt.trans) { + audit_log_format(ab, " trans="); + audit_log_untrustedstring(ab, aad(sa)->mnt.trans); + } + if (aad(sa)->mnt.flags) { + audit_log_format(ab, " flags=\""); + audit_mnt_flags(ab, aad(sa)->mnt.flags); + audit_log_format(ab, "\""); + } + if (aad(sa)->mnt.data) { + audit_log_format(ab, " options="); + audit_log_untrustedstring(ab, aad(sa)->mnt.data); + } +} + +/** + * audit_mount - handle the auditing of mount operations + * @profile: the profile being enforced (NOT NULL) + * @op: operation being mediated (NOT NULL) + * @name: name of object being mediated (MAYBE NULL) + * @src_name: src_name of object being mediated (MAYBE_NULL) + * @type: type of filesystem (MAYBE_NULL) + * @trans: name of trans (MAYBE NULL) + * @flags: filesystem idependent mount flags + * @data: filesystem mount flags + * @request: permissions requested + * @perms: the permissions computed for the request (NOT NULL) + * @info: extra information message (MAYBE NULL) + * @error: 0 if operation allowed else failure error code + * + * Returns: %0 or error on failure + */ +static int audit_mount(struct aa_profile *profile, const char *op, + const char *name, const char *src_name, + const char *type, const char *trans, + unsigned long flags, const void *data, u32 request, + struct aa_perms *perms, const char *info, int error) +{ + int audit_type = AUDIT_APPARMOR_AUTO; + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, op); + + if (likely(!error)) { + u32 mask = perms->audit; + + if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) + mask = 0xffff; + + /* mask off perms that are not being force audited */ + request &= mask; + + if (likely(!request)) + return 0; + audit_type = AUDIT_APPARMOR_AUDIT; + } else { + /* only report permissions that were denied */ + request = request & ~perms->allow; + + if (request & perms->kill) + audit_type = AUDIT_APPARMOR_KILL; + + /* quiet known rejects, assumes quiet and kill do not overlap */ + if ((request & perms->quiet) && + AUDIT_MODE(profile) != AUDIT_NOQUIET && + AUDIT_MODE(profile) != AUDIT_ALL) + request &= ~perms->quiet; + + if (!request) + return error; + } + + aad(&sa)->name = name; + aad(&sa)->mnt.src_name = src_name; + aad(&sa)->mnt.type = type; + aad(&sa)->mnt.trans = trans; + aad(&sa)->mnt.flags = flags; + if (data && (perms->audit & AA_AUDIT_DATA)) + aad(&sa)->mnt.data = data; + aad(&sa)->info = info; + aad(&sa)->error = error; + + return aa_audit(audit_type, profile, &sa, audit_cb); +} + +/** + * match_mnt_flags - Do an ordered match on mount flags + * @dfa: dfa to match against + * @state: state to start in + * @flags: mount flags to match against + * + * Mount flags are encoded as an ordered match. This is done instead of + * checking against a simple bitmask, to allow for logical operations + * on the flags. + * + * Returns: next state after flags match + */ +static unsigned int match_mnt_flags(struct aa_dfa *dfa, unsigned int state, + unsigned long flags) +{ + unsigned int i; + + for (i = 0; i <= 31 ; ++i) { + if ((1 << i) & flags) + state = aa_dfa_next(dfa, state, i + 1); + } + + return state; +} + +/** + * compute_mnt_perms - compute mount permission associated with @state + * @dfa: dfa to match against (NOT NULL) + * @state: state match finished in + * + * Returns: mount permissions + */ +static struct aa_perms compute_mnt_perms(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms; + + perms.kill = 0; + perms.allow = dfa_user_allow(dfa, state); + perms.audit = dfa_user_audit(dfa, state); + perms.quiet = dfa_user_quiet(dfa, state); + perms.xindex = dfa_user_xindex(dfa, state); + + return perms; +} + +static const char * const mnt_info_table[] = { + "match succeeded", + "failed mntpnt match", + "failed srcname match", + "failed type match", + "failed flags match", + "failed data match" +}; + +/* + * Returns 0 on success else element that match failed in, this is the + * index into the mnt_info_table above + */ +static int do_match_mnt(struct aa_dfa *dfa, unsigned int start, + const char *mntpnt, const char *devname, + const char *type, unsigned long flags, + void *data, bool binary, struct aa_perms *perms) +{ + unsigned int state; + + AA_BUG(!dfa); + AA_BUG(!perms); + + state = aa_dfa_match(dfa, start, mntpnt); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 1; + + if (devname) + state = aa_dfa_match(dfa, state, devname); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 2; + + if (type) + state = aa_dfa_match(dfa, state, type); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 3; + + state = match_mnt_flags(dfa, state, flags); + if (!state) + return 4; + *perms = compute_mnt_perms(dfa, state); + if (perms->allow & AA_MAY_MOUNT) + return 0; + + /* only match data if not binary and the DFA flags data is expected */ + if (data && !binary && (perms->allow & AA_MNT_CONT_MATCH)) { + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 4; + + state = aa_dfa_match(dfa, state, data); + if (!state) + return 5; + *perms = compute_mnt_perms(dfa, state); + if (perms->allow & AA_MAY_MOUNT) + return 0; + } + + /* failed at end of flags match */ + return 4; +} + + +static int path_flags(struct aa_profile *profile, const struct path *path) +{ + AA_BUG(!profile); + AA_BUG(!path); + + return profile->path_flags | + (S_ISDIR(path->dentry->d_inode->i_mode) ? PATH_IS_DIR : 0); +} + +/** + * match_mnt_path_str - handle path matching for mount + * @profile: the confining profile + * @mntpath: for the mntpnt (NOT NULL) + * @buffer: buffer to be used to lookup mntpath + * @devnme: string for the devname/src_name (MAY BE NULL OR ERRPTR) + * @type: string for the dev type (MAYBE NULL) + * @flags: mount flags to match + * @data: fs mount data (MAYBE NULL) + * @binary: whether @data is binary + * @devinfo: error str if (IS_ERR(@devname)) + * + * Returns: 0 on success else error + */ +static int match_mnt_path_str(struct aa_profile *profile, + const struct path *mntpath, char *buffer, + const char *devname, const char *type, + unsigned long flags, void *data, bool binary, + const char *devinfo) +{ + struct aa_perms perms = { }; + const char *mntpnt = NULL, *info = NULL; + int pos, error; + + AA_BUG(!profile); + AA_BUG(!mntpath); + AA_BUG(!buffer); + + error = aa_path_name(mntpath, path_flags(profile, mntpath), buffer, + &mntpnt, &info, profile->disconnected); + if (error) + goto audit; + if (IS_ERR(devname)) { + error = PTR_ERR(devname); + devname = NULL; + info = devinfo; + goto audit; + } + + error = -EACCES; + pos = do_match_mnt(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + mntpnt, devname, type, flags, data, binary, &perms); + if (pos) { + info = mnt_info_table[pos]; + goto audit; + } + error = 0; + +audit: + return audit_mount(profile, OP_MOUNT, mntpnt, devname, type, NULL, + flags, data, AA_MAY_MOUNT, &perms, info, error); +} + +/** + * match_mnt - handle path matching for mount + * @profile: the confining profile + * @mntpath: for the mntpnt (NOT NULL) + * @buffer: buffer to be used to lookup mntpath + * @devpath: path devname/src_name (MAYBE NULL) + * @devbuffer: buffer to be used to lookup devname/src_name + * @type: string for the dev type (MAYBE NULL) + * @flags: mount flags to match + * @data: fs mount data (MAYBE NULL) + * @binary: whether @data is binary + * + * Returns: 0 on success else error + */ +static int match_mnt(struct aa_profile *profile, const struct path *path, + char *buffer, struct path *devpath, char *devbuffer, + const char *type, unsigned long flags, void *data, + bool binary) +{ + const char *devname = NULL, *info = NULL; + int error = -EACCES; + + AA_BUG(!profile); + AA_BUG(devpath && !devbuffer); + + if (devpath) { + error = aa_path_name(devpath, path_flags(profile, devpath), + devbuffer, &devname, &info, + profile->disconnected); + if (error) + devname = ERR_PTR(error); + } + + return match_mnt_path_str(profile, path, buffer, devname, type, flags, + data, binary, info); +} + +int aa_remount(struct aa_label *label, const struct path *path, + unsigned long flags, void *data) +{ + struct aa_profile *profile; + char *buffer = NULL; + bool binary; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + binary = path->dentry->d_sb->s_type->fs_flags & FS_BINARY_MOUNTDATA; + + get_buffers(buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, NULL, NULL, NULL, + flags, data, binary)); + put_buffers(buffer); + + return error; +} + +int aa_bind_mount(struct aa_label *label, const struct path *path, + const char *dev_name, unsigned long flags) +{ + struct aa_profile *profile; + char *buffer = NULL, *old_buffer = NULL; + struct path old_path; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + if (!dev_name || !*dev_name) + return -EINVAL; + + flags &= MS_REC | MS_BIND; + + error = kern_path(dev_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (error) + return error; + + get_buffers(buffer, old_buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, &old_path, old_buffer, + NULL, flags, NULL, false)); + put_buffers(buffer, old_buffer); + path_put(&old_path); + + return error; +} + +int aa_mount_change_type(struct aa_label *label, const struct path *path, + unsigned long flags) +{ + struct aa_profile *profile; + char *buffer = NULL; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + /* These are the flags allowed by do_change_type() */ + flags &= (MS_REC | MS_SILENT | MS_SHARED | MS_PRIVATE | MS_SLAVE | + MS_UNBINDABLE); + + get_buffers(buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, NULL, NULL, NULL, + flags, NULL, false)); + put_buffers(buffer); + + return error; +} + +int aa_move_mount(struct aa_label *label, const struct path *path, + const char *orig_name) +{ + struct aa_profile *profile; + char *buffer = NULL, *old_buffer = NULL; + struct path old_path; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + if (!orig_name || !*orig_name) + return -EINVAL; + + error = kern_path(orig_name, LOOKUP_FOLLOW, &old_path); + if (error) + return error; + + get_buffers(buffer, old_buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, &old_path, old_buffer, + NULL, MS_MOVE, NULL, false)); + put_buffers(buffer, old_buffer); + path_put(&old_path); + + return error; +} + +int aa_new_mount(struct aa_label *label, const char *dev_name, + const struct path *path, const char *type, unsigned long flags, + void *data) +{ + struct aa_profile *profile; + char *buffer = NULL, *dev_buffer = NULL; + bool binary = true; + int error; + int requires_dev = 0; + struct path tmp_path, *dev_path = NULL; + + AA_BUG(!label); + AA_BUG(!path); + + if (type) { + struct file_system_type *fstype; + + fstype = get_fs_type(type); + if (!fstype) + return -ENODEV; + binary = fstype->fs_flags & FS_BINARY_MOUNTDATA; + requires_dev = fstype->fs_flags & FS_REQUIRES_DEV; + put_filesystem(fstype); + + if (requires_dev) { + if (!dev_name || !*dev_name) + return -ENOENT; + + error = kern_path(dev_name, LOOKUP_FOLLOW, &tmp_path); + if (error) + return error; + dev_path = &tmp_path; + } + } + + get_buffers(buffer, dev_buffer); + if (dev_path) { + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, dev_path, dev_buffer, + type, flags, data, binary)); + } else { + error = fn_for_each_confined(label, profile, + match_mnt_path_str(profile, path, buffer, dev_name, + type, flags, data, binary, NULL)); + } + put_buffers(buffer, dev_buffer); + if (dev_path) + path_put(dev_path); + + return error; +} + +static int profile_umount(struct aa_profile *profile, struct path *path, + char *buffer) +{ + struct aa_perms perms = { }; + const char *name = NULL, *info = NULL; + unsigned int state; + int error; + + AA_BUG(!profile); + AA_BUG(!path); + + error = aa_path_name(path, path_flags(profile, path), buffer, &name, + &info, profile->disconnected); + if (error) + goto audit; + + state = aa_dfa_match(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + name); + perms = compute_mnt_perms(profile->policy.dfa, state); + if (AA_MAY_UMOUNT & ~perms.allow) + error = -EACCES; + +audit: + return audit_mount(profile, OP_UMOUNT, name, NULL, NULL, NULL, 0, NULL, + AA_MAY_UMOUNT, &perms, info, error); +} + +int aa_umount(struct aa_label *label, struct vfsmount *mnt, int flags) +{ + struct aa_profile *profile; + char *buffer = NULL; + int error; + struct path path = { .mnt = mnt, .dentry = mnt->mnt_root }; + + AA_BUG(!label); + AA_BUG(!mnt); + + get_buffers(buffer); + error = fn_for_each_confined(label, profile, + profile_umount(profile, &path, buffer)); + put_buffers(buffer); + + return error; +} + +/* helper fn for transition on pivotroot + * + * Returns: label for transition or ERR_PTR. Does not return NULL + */ +static struct aa_label *build_pivotroot(struct aa_profile *profile, + const struct path *new_path, + char *new_buffer, + const struct path *old_path, + char *old_buffer) +{ + const char *old_name, *new_name = NULL, *info = NULL; + const char *trans_name = NULL; + struct aa_perms perms = { }; + unsigned int state; + int error; + + AA_BUG(!profile); + AA_BUG(!new_path); + AA_BUG(!old_path); + + if (profile_unconfined(profile)) + return aa_get_newest_label(&profile->label); + + error = aa_path_name(old_path, path_flags(profile, old_path), + old_buffer, &old_name, &info, + profile->disconnected); + if (error) + goto audit; + error = aa_path_name(new_path, path_flags(profile, new_path), + new_buffer, &new_name, &info, + profile->disconnected); + if (error) + goto audit; + + error = -EACCES; + state = aa_dfa_match(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + new_name); + state = aa_dfa_null_transition(profile->policy.dfa, state); + state = aa_dfa_match(profile->policy.dfa, state, old_name); + perms = compute_mnt_perms(profile->policy.dfa, state); + + if (AA_MAY_PIVOTROOT & perms.allow) + error = 0; + +audit: + error = audit_mount(profile, OP_PIVOTROOT, new_name, old_name, + NULL, trans_name, 0, NULL, AA_MAY_PIVOTROOT, + &perms, info, error); + if (error) + return ERR_PTR(error); + + return aa_get_newest_label(&profile->label); +} + +int aa_pivotroot(struct aa_label *label, const struct path *old_path, + const struct path *new_path) +{ + struct aa_profile *profile; + struct aa_label *target = NULL; + char *old_buffer = NULL, *new_buffer = NULL, *info = NULL; + int error; + + AA_BUG(!label); + AA_BUG(!old_path); + AA_BUG(!new_path); + + get_buffers(old_buffer, new_buffer); + target = fn_label_build(label, profile, GFP_ATOMIC, + build_pivotroot(profile, new_path, new_buffer, + old_path, old_buffer)); + if (!target) { + info = "label build failed"; + error = -ENOMEM; + goto fail; + } else if (!IS_ERR(target)) { + error = aa_replace_current_label(target); + if (error) { + /* TODO: audit target */ + aa_put_label(target); + goto out; + } + } else + /* already audited error */ + error = PTR_ERR(target); +out: + put_buffers(old_buffer, new_buffer); + + return error; + +fail: + /* TODO: add back in auditing of new_name and old_name */ + error = fn_for_each(label, profile, + audit_mount(profile, OP_PIVOTROOT, NULL /*new_name */, + NULL /* old_name */, + NULL, NULL, + 0, NULL, AA_MAY_PIVOTROOT, &nullperms, info, + error)); + goto out; +} -- cgit From f872af75d325cc449b6621a0d30a4f2ba77dd092 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 6 Aug 2017 05:36:40 -0700 Subject: apparmor: cleanup conditional check for label in label_print Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/label.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index e324f4df3e34..38be7a89cc31 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1450,9 +1450,11 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) * cached label name is present and visible * @label->hname only exists if label is namespace hierachical */ -static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label) +static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, + int flags) { - if (label->hname && labels_ns(label) == ns) + if (label->hname && (!ns || labels_ns(label) == ns) && + !(flags & ~FLAG_SHOW_MODE)) return true; return false; @@ -1710,10 +1712,8 @@ void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, AA_BUG(!ab); AA_BUG(!label); - if (!ns) - ns = labels_ns(label); - - if (!use_label_hname(ns, label) || display_mode(ns, label, flags)) { + if (!use_label_hname(ns, label, flags) || + display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len == -1) { AA_DEBUG("label print error"); @@ -1738,10 +1738,7 @@ void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, AA_BUG(!f); AA_BUG(!label); - if (!ns) - ns = labels_ns(label); - - if (!use_label_hname(ns, label)) { + if (!use_label_hname(ns, label, flags)) { char *str; int len; @@ -1764,10 +1761,7 @@ void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, { AA_BUG(!label); - if (!ns) - ns = labels_ns(label); - - if (!use_label_hname(ns, label)) { + if (!use_label_hname(ns, label, flags)) { char *str; int len; -- cgit From 26b7899510ae243e392960704ebdba52d05fbb13 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 6 Aug 2017 05:39:08 -0700 Subject: apparmor: add support for absolute root view based labels With apparmor policy virtualization based on policy namespace View's we don't generally want/need absolute root based views, however there are cases like debugging and some secid based conversions where using a root based view is important. Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/include/label.h | 1 + security/apparmor/label.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/security/apparmor/include/label.h b/security/apparmor/include/label.h index 9a283b722755..af22dcbbcb8a 100644 --- a/security/apparmor/include/label.h +++ b/security/apparmor/include/label.h @@ -310,6 +310,7 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp); #define FLAG_SHOW_MODE 1 #define FLAG_VIEW_SUBNS 2 #define FLAG_HIDDEN_UNCONFINED 4 +#define FLAG_ABS_ROOT 8 int aa_label_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_label *label, int flags); int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 38be7a89cc31..52b4ef14840d 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1607,8 +1607,13 @@ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, AA_BUG(!str && size != 0); AA_BUG(!label); - if (!ns) + if (flags & FLAG_ABS_ROOT) { + ns = root_ns; + len = snprintf(str, size, "="); + update_for_len(total, len, size, str); + } else if (!ns) { ns = labels_ns(label); + } label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { @@ -1868,6 +1873,9 @@ struct aa_label *aa_label_parse(struct aa_label *base, const char *str, if (*str == '&') str++; } + if (*str == '=') + base = &root_ns->unconfined->label; + error = vec_setup(profile, vec, len, gfp); if (error) return ERR_PTR(error); -- cgit From 2410aa96d6b4930ed25fd02c3d173f14b962e0f4 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:37:18 -0700 Subject: apparmor: make policy_unpack able to audit different info messages Switch unpack auditing to using the generic name field in the audit struct and make it so we can start adding new info messages about why an unpack failed. Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/include/audit.h | 4 +-- security/apparmor/policy_unpack.c | 52 ++++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index c3fe1c5ef3bc..620e81169659 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -127,9 +127,9 @@ struct apparmor_audit_data { } fs; }; struct { - const char *name; - long pos; + struct aa_profile *profile; const char *ns; + long pos; } iface; int signal; struct { diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index bda0dce3b582..4ede87c30f8b 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -85,9 +85,9 @@ static void audit_cb(struct audit_buffer *ab, void *va) audit_log_format(ab, " ns="); audit_log_untrustedstring(ab, aad(sa)->iface.ns); } - if (aad(sa)->iface.name) { + if (aad(sa)->name) { audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, aad(sa)->iface.name); + audit_log_untrustedstring(ab, aad(sa)->name); } if (aad(sa)->iface.pos) audit_log_format(ab, " offset=%ld", aad(sa)->iface.pos); @@ -114,9 +114,9 @@ static int audit_iface(struct aa_profile *new, const char *ns_name, aad(&sa)->iface.pos = e->pos - e->start; aad(&sa)->iface.ns = ns_name; if (new) - aad(&sa)->iface.name = new->base.hname; + aad(&sa)->name = new->base.hname; else - aad(&sa)->iface.name = name; + aad(&sa)->name = name; aad(&sa)->info = info; aad(&sa)->error = error; @@ -583,6 +583,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) { struct aa_profile *profile = NULL; const char *tmpname, *tmpns = NULL, *name = NULL; + const char *info = "failed to unpack profile"; size_t ns_len; struct rhashtable_params params = { 0 }; char *key = NULL; @@ -604,8 +605,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) tmpname = aa_splitn_fqname(name, strlen(name), &tmpns, &ns_len); if (tmpns) { *ns_name = kstrndup(tmpns, ns_len, GFP_KERNEL); - if (!*ns_name) + if (!*ns_name) { + info = "out of memory"; goto fail; + } name = tmpname; } @@ -624,12 +627,15 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (IS_ERR(profile->xmatch)) { error = PTR_ERR(profile->xmatch); profile->xmatch = NULL; + info = "bad xmatch"; goto fail; } /* xmatch_len is not optional if xmatch is set */ if (profile->xmatch) { - if (!unpack_u32(e, &tmp, NULL)) + if (!unpack_u32(e, &tmp, NULL)) { + info = "missing xmatch len"; goto fail; + } profile->xmatch_len = tmp; } @@ -637,8 +643,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) (void) unpack_str(e, &profile->disconnected, "disconnected"); /* per profile debug flags (complain, audit) */ - if (!unpack_nameX(e, AA_STRUCT, "flags")) + if (!unpack_nameX(e, AA_STRUCT, "flags")) { + info = "profile missing flags"; goto fail; + } + info = "failed to unpack profile flags"; if (!unpack_u32(e, &tmp, NULL)) goto fail; if (tmp & PACKED_FLAG_HAT) @@ -667,6 +676,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) /* set a default value if path_flags field is not present */ profile->path_flags = PATH_MEDIATE_DELETED; + info = "failed to unpack profile capabilities"; if (!unpack_u32(e, &(profile->caps.allow.cap[0]), NULL)) goto fail; if (!unpack_u32(e, &(profile->caps.audit.cap[0]), NULL)) @@ -676,6 +686,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (!unpack_u32(e, &tmpcap.cap[0], NULL)) goto fail; + info = "failed to unpack upper profile capabilities"; if (unpack_nameX(e, AA_STRUCT, "caps64")) { /* optional upper half of 64 bit caps */ if (!unpack_u32(e, &(profile->caps.allow.cap[1]), NULL)) @@ -690,6 +701,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + info = "failed to unpack extended profile capabilities"; if (unpack_nameX(e, AA_STRUCT, "capsx")) { /* optional extended caps mediation mask */ if (!unpack_u32(e, &(profile->caps.extended.cap[0]), NULL)) @@ -700,11 +712,14 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } - if (!unpack_rlimits(e, profile)) + if (!unpack_rlimits(e, profile)) { + info = "failed to unpack profile rlimits"; goto fail; + } if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ + info = "failed to unpack policydb"; profile->policy.dfa = unpack_dfa(e); if (IS_ERR(profile->policy.dfa)) { error = PTR_ERR(profile->policy.dfa); @@ -734,6 +749,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (IS_ERR(profile->file.dfa)) { error = PTR_ERR(profile->file.dfa); profile->file.dfa = NULL; + info = "failed to unpack profile file rules"; goto fail; } else if (profile->file.dfa) { if (!unpack_u32(e, &profile->file.start, "dfa_start")) @@ -746,10 +762,13 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else profile->file.dfa = aa_get_dfa(nulldfa); - if (!unpack_trans_table(e, profile)) + if (!unpack_trans_table(e, profile)) { + info = "failed to unpack profile transition table"; goto fail; + } if (unpack_nameX(e, AA_STRUCT, "data")) { + info = "out of memory"; profile->data = kzalloc(sizeof(*profile->data), GFP_KERNEL); if (!profile->data) goto fail; @@ -761,8 +780,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) params.hashfn = strhash; params.obj_cmpfn = datacmp; - if (rhashtable_init(profile->data, ¶ms)) + if (rhashtable_init(profile->data, ¶ms)) { + info = "failed to init key, value hash table"; goto fail; + } while (unpack_strdup(e, &key, NULL)) { data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -784,12 +805,16 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile->data->p); } - if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) { + info = "failed to unpack end of key, value data table"; goto fail; + } } - if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) { + info = "failed to unpack end of profile"; goto fail; + } return profile; @@ -798,8 +823,7 @@ fail: name = NULL; else if (!name) name = "unknown"; - audit_iface(profile, NULL, name, "failed to unpack profile", e, - error); + audit_iface(profile, NULL, name, info, e, error); aa_free_profile(profile); return ERR_PTR(error); -- cgit From cbf2d0e1a9e4876046a628e0e036a7545a3a4c40 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:41:13 -0700 Subject: apparmor: add more debug asserts to apparmorfs Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/apparmorfs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 8fa6c898c44b..7acea14c850b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1446,6 +1446,10 @@ void __aafs_profile_migrate_dents(struct aa_profile *old, { int i; + AA_BUG(!old); + AA_BUG(!new); + AA_BUG(!mutex_is_locked(&profiles_ns(old)->lock)); + for (i = 0; i < AAFS_PROF_SIZEOF; i++) { new->dents[i] = old->dents[i]; if (new->dents[i]) @@ -1509,6 +1513,9 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) struct dentry *dent = NULL, *dir; int error; + AA_BUG(!profile); + AA_BUG(!mutex_is_locked(&profiles_ns(profile)->lock)); + if (!parent) { struct aa_profile *p; p = aa_deref_parent(profile); @@ -1734,6 +1741,7 @@ void __aafs_ns_rmdir(struct aa_ns *ns) if (!ns) return; + AA_BUG(!mutex_is_locked(&ns->lock)); list_for_each_entry(child, &ns->base.profiles, base.list) __aafs_profile_rmdir(child); @@ -1906,6 +1914,10 @@ static struct aa_ns *__next_ns(struct aa_ns *root, struct aa_ns *ns) { struct aa_ns *parent, *next; + AA_BUG(!root); + AA_BUG(!ns); + AA_BUG(ns != root && !mutex_is_locked(&ns->parent->lock)); + /* is next namespace a child */ if (!list_empty(&ns->sub_ns)) { next = list_first_entry(&ns->sub_ns, typeof(*ns), base.list); @@ -1940,6 +1952,9 @@ static struct aa_ns *__next_ns(struct aa_ns *root, struct aa_ns *ns) static struct aa_profile *__first_profile(struct aa_ns *root, struct aa_ns *ns) { + AA_BUG(!root); + AA_BUG(ns && !mutex_is_locked(&ns->lock)); + for (; ns; ns = __next_ns(root, ns)) { if (!list_empty(&ns->base.profiles)) return list_first_entry(&ns->base.profiles, @@ -1962,6 +1977,8 @@ static struct aa_profile *__next_profile(struct aa_profile *p) struct aa_profile *parent; struct aa_ns *ns = p->ns; + AA_BUG(!mutex_is_locked(&profiles_ns(p)->lock)); + /* is next profile a child */ if (!list_empty(&p->base.profiles)) return list_first_entry(&p->base.profiles, typeof(*p), -- cgit From 651e28c5537abb39076d3949fb7618536f1d242e Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:18:33 -0700 Subject: apparmor: add base infastructure for socket mediation Provide a basic mediation of sockets. This is not a full net mediation but just whether a spcific family of socket can be used by an application, along with setting up some basic infrastructure for network mediation to follow. the user space rule hav the basic form of NETWORK RULE = [ QUALIFIERS ] 'network' [ DOMAIN ] [ TYPE | PROTOCOL ] DOMAIN = ( 'inet' | 'ax25' | 'ipx' | 'appletalk' | 'netrom' | 'bridge' | 'atmpvc' | 'x25' | 'inet6' | 'rose' | 'netbeui' | 'security' | 'key' | 'packet' | 'ash' | 'econet' | 'atmsvc' | 'sna' | 'irda' | 'pppox' | 'wanpipe' | 'bluetooth' | 'netlink' | 'unix' | 'rds' | 'llc' | 'can' | 'tipc' | 'iucv' | 'rxrpc' | 'isdn' | 'phonet' | 'ieee802154' | 'caif' | 'alg' | 'nfc' | 'vsock' | 'mpls' | 'ib' | 'kcm' ) ',' TYPE = ( 'stream' | 'dgram' | 'seqpacket' | 'rdm' | 'raw' | 'packet' ) PROTOCOL = ( 'tcp' | 'udp' | 'icmp' ) eg. network, network inet, Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/.gitignore | 1 + security/apparmor/Makefile | 43 ++++- security/apparmor/apparmorfs.c | 1 + security/apparmor/file.c | 30 +++ security/apparmor/include/audit.h | 26 ++- security/apparmor/include/net.h | 114 +++++++++++ security/apparmor/include/perms.h | 5 +- security/apparmor/include/policy.h | 13 ++ security/apparmor/lib.c | 5 +- security/apparmor/lsm.c | 387 +++++++++++++++++++++++++++++++++++++ security/apparmor/net.c | 184 ++++++++++++++++++ security/apparmor/policy_unpack.c | 47 ++++- 12 files changed, 840 insertions(+), 16 deletions(-) create mode 100644 security/apparmor/include/net.h create mode 100644 security/apparmor/net.c diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore index 9cdec70d72b8..d5b291e94264 100644 --- a/security/apparmor/.gitignore +++ b/security/apparmor/.gitignore @@ -1,5 +1,6 @@ # # Generated include files # +net_names.h capability_names.h rlim_names.h diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index 81a34426d024..dafdd387d42b 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -4,11 +4,44 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o secid.o file.o policy_ns.o label.o mount.o + resource.o secid.o file.o policy_ns.o label.o mount.o net.o apparmor-$(CONFIG_SECURITY_APPARMOR_HASH) += crypto.o -clean-files := capability_names.h rlim_names.h +clean-files := capability_names.h rlim_names.h net_names.h +# Build a lower case string table of address family names +# Transform lines from +# #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ +# #define AF_INET 2 /* Internet IP Protocol */ +# to +# [1] = "local", +# [2] = "inet", +# +# and build the securityfs entries for the mapping. +# Transforms lines from +# #define AF_INET 2 /* Internet IP Protocol */ +# to +# #define AA_SFS_AF_MASK "local inet" +quiet_cmd_make-af = GEN $@ +cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ + sed $< >>$@ -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e "/AF_ROUTE/d" -e \ + 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ + echo "};" >> $@ ;\ + printf '%s' '\#define AA_SFS_AF_MASK "' >> $@ ;\ + sed -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e "/AF_ROUTE/d" -e \ + 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/\L\1/p'\ + $< | tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@ + +# Build a lower case string table of sock type names +# Transform lines from +# SOCK_STREAM = 1, +# to +# [1] = "stream", +quiet_cmd_make-sock = GEN $@ +cmd_make-sock = echo "static const char *sock_type_names[] = {" >> $@ ;\ + sed $^ >>$@ -r -n \ + -e 's/^\tSOCK_([A-Z0-9_]+)[\t]+=[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ + echo "};" >> $@ # Build a lower case string table of capability names # Transforms lines from @@ -61,6 +94,7 @@ cmd_make-rlim = echo "static const char *const rlim_names[RLIM_NLIMITS] = {" \ tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@ $(obj)/capability.o : $(obj)/capability_names.h +$(obj)/net.o : $(obj)/net_names.h $(obj)/resource.o : $(obj)/rlim_names.h $(obj)/capability_names.h : $(srctree)/include/uapi/linux/capability.h \ $(src)/Makefile @@ -68,3 +102,8 @@ $(obj)/capability_names.h : $(srctree)/include/uapi/linux/capability.h \ $(obj)/rlim_names.h : $(srctree)/include/uapi/asm-generic/resource.h \ $(src)/Makefile $(call cmd,make-rlim) +$(obj)/net_names.h : $(srctree)/include/linux/socket.h \ + $(srctree)/include/linux/net.h \ + $(src)/Makefile + $(call cmd,make-af) + $(call cmd,make-sock) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7acea14c850b..125dad5c3fde 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2202,6 +2202,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("policy", aa_sfs_entry_policy), AA_SFS_DIR("domain", aa_sfs_entry_domain), AA_SFS_DIR("file", aa_sfs_entry_file), + AA_SFS_DIR("network", aa_sfs_entry_network), AA_SFS_DIR("mount", aa_sfs_entry_mount), AA_SFS_DIR("namespaces", aa_sfs_entry_ns), AA_SFS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 3382518b87fa..db80221891c6 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -21,6 +21,7 @@ #include "include/context.h" #include "include/file.h" #include "include/match.h" +#include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/label.h" @@ -566,6 +567,32 @@ static int __file_path_perm(const char *op, struct aa_label *label, return error; } +static int __file_sock_perm(const char *op, struct aa_label *label, + struct aa_label *flabel, struct file *file, + u32 request, u32 denied) +{ + struct socket *sock = (struct socket *) file->private_data; + int error; + + AA_BUG(!sock); + + /* revalidation due to label out of date. No revocation at this time */ + if (!denied && aa_label_is_subset(flabel, label)) + return 0; + + /* TODO: improve to skip profiles cached in flabel */ + error = aa_sock_file_perm(label, op, request, sock); + if (denied) { + /* TODO: improve to skip profiles checked above */ + /* check every profile in file label to is cached */ + last_error(error, aa_sock_file_perm(flabel, op, request, sock)); + } + if (!error) + update_file_ctx(file_ctx(file), label, request); + + return error; +} + /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked @@ -610,6 +637,9 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, error = __file_path_perm(op, label, flabel, file, request, denied); + else if (S_ISSOCK(file_inode(file)->i_mode)) + error = __file_sock_perm(op, label, flabel, file, request, + denied); done: rcu_read_unlock(); diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index 620e81169659..ff4316e1068d 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -121,21 +121,29 @@ struct apparmor_audit_data { /* these entries require a custom callback fn */ struct { struct aa_label *peer; - struct { - const char *target; - kuid_t ouid; - } fs; + union { + struct { + kuid_t ouid; + const char *target; + } fs; + struct { + int type, protocol; + struct sock *peer_sk; + void *addr; + int addrlen; + } net; + int signal; + struct { + int rlim; + unsigned long max; + } rlim; + }; }; struct { struct aa_profile *profile; const char *ns; long pos; } iface; - int signal; - struct { - int rlim; - unsigned long max; - } rlim; struct { const char *src_name; const char *type; diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h new file mode 100644 index 000000000000..140c8efcf364 --- /dev/null +++ b/security/apparmor/include/net.h @@ -0,0 +1,114 @@ +/* + * AppArmor security module + * + * This file contains AppArmor network mediation definitions. + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#ifndef __AA_NET_H +#define __AA_NET_H + +#include +#include + +#include "apparmorfs.h" +#include "label.h" +#include "perms.h" +#include "policy.h" + +#define AA_MAY_SEND AA_MAY_WRITE +#define AA_MAY_RECEIVE AA_MAY_READ + +#define AA_MAY_SHUTDOWN AA_MAY_DELETE + +#define AA_MAY_CONNECT AA_MAY_OPEN +#define AA_MAY_ACCEPT 0x00100000 + +#define AA_MAY_BIND 0x00200000 +#define AA_MAY_LISTEN 0x00400000 + +#define AA_MAY_SETOPT 0x01000000 +#define AA_MAY_GETOPT 0x02000000 + +#define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ + AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN | \ + AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \ + AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT) + +#define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ + AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\ + AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD | \ + AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK | \ + AA_MAY_MPROT) + +#define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \ + AA_MAY_ACCEPT) +struct aa_sk_ctx { + struct aa_label *label; + struct aa_label *peer; + struct path path; +}; + +#define SK_CTX(X) ((X)->sk_security) +#define SOCK_ctx(X) SOCK_INODE(X)->i_security +#define DEFINE_AUDIT_NET(NAME, OP, SK, F, T, P) \ + struct lsm_network_audit NAME ## _net = { .sk = (SK), \ + .family = (F)}; \ + DEFINE_AUDIT_DATA(NAME, \ + ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \ + LSM_AUDIT_DATA_NONE, \ + OP); \ + NAME.u.net = &(NAME ## _net); \ + aad(&NAME)->net.type = (T); \ + aad(&NAME)->net.protocol = (P) + +#define DEFINE_AUDIT_SK(NAME, OP, SK) \ + DEFINE_AUDIT_NET(NAME, OP, SK, (SK)->sk_family, (SK)->sk_type, \ + (SK)->sk_protocol) + +/* struct aa_net - network confinement data + * @allow: basic network families permissions + * @audit: which network permissions to force audit + * @quiet: which network permissions to quiet rejects + */ +struct aa_net { + u16 allow[AF_MAX]; + u16 audit[AF_MAX]; + u16 quiet[AF_MAX]; +}; + + +extern struct aa_sfs_entry aa_sfs_entry_network[]; + +void audit_net_cb(struct audit_buffer *ab, void *va); +int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, + u32 request, u16 family, int type); +int aa_af_perm(struct aa_label *label, const char *op, u32 request, u16 family, + int type, int protocol); +static inline int aa_profile_af_sk_perm(struct aa_profile *profile, + struct common_audit_data *sa, + u32 request, + struct sock *sk) +{ + return aa_profile_af_perm(profile, sa, request, sk->sk_family, + sk->sk_type); +} +int aa_sk_perm(const char *op, u32 request, struct sock *sk); + +int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, + struct socket *sock); + + +static inline void aa_free_net_rules(struct aa_net *new) +{ + /* NOP */ +} + +#endif /* __AA_NET_H */ diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 2b27bb79aec4..af04d5a7d73d 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -135,9 +135,10 @@ extern struct aa_perms allperms; void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask); -void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask); +void aa_audit_perm_names(struct audit_buffer *ab, const char * const *names, + u32 mask); void aa_audit_perm_mask(struct audit_buffer *ab, u32 mask, const char *chrs, - u32 chrsmask, const char **names, u32 namesmask); + u32 chrsmask, const char * const *names, u32 namesmask); void aa_apply_modes_to_perms(struct aa_profile *profile, struct aa_perms *perms); void aa_compute_perms(struct aa_dfa *dfa, unsigned int state, diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 17fe41a9cac3..4364088a0b9e 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -30,6 +30,7 @@ #include "file.h" #include "lib.h" #include "label.h" +#include "net.h" #include "perms.h" #include "resource.h" @@ -111,6 +112,7 @@ struct aa_data { * @policy: general match rules governing policy * @file: The set of rules governing basic file access and domain transitions * @caps: capabilities for the profile + * @net: network controls for the profile * @rlimits: rlimits for the profile * * @dents: dentries for the profiles file entries in apparmorfs @@ -148,6 +150,7 @@ struct aa_profile { struct aa_policydb policy; struct aa_file_rules file; struct aa_caps caps; + struct aa_net net; struct aa_rlimit rlimits; struct aa_loaddata *rawdata; @@ -220,6 +223,16 @@ static inline unsigned int PROFILE_MEDIATES_SAFE(struct aa_profile *profile, return 0; } +static inline unsigned int PROFILE_MEDIATES_AF(struct aa_profile *profile, + u16 AF) { + unsigned int state = PROFILE_MEDIATES(profile, AA_CLASS_NET); + u16 be_af = cpu_to_be16(AF); + + if (!state) + return 0; + return aa_dfa_match_len(profile->policy.dfa, state, (char *) &be_af, 2); +} + /** * aa_get_profile - increment refcount on profile @p * @p: profile (MAYBE NULL) diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 08ca26bcca77..8818621b5d95 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -211,7 +211,8 @@ void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask) *str = '\0'; } -void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask) +void aa_audit_perm_names(struct audit_buffer *ab, const char * const *names, + u32 mask) { const char *fmt = "%s"; unsigned int i, perm = 1; @@ -229,7 +230,7 @@ void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask) } void aa_audit_perm_mask(struct audit_buffer *ab, u32 mask, const char *chrs, - u32 chrsmask, const char **names, u32 namesmask) + u32 chrsmask, const char * const *names, u32 namesmask) { char str[33]; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 4ad0b3a45142..cc5ab23a2d84 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -33,6 +33,7 @@ #include "include/context.h" #include "include/file.h" #include "include/ipc.h" +#include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" @@ -736,6 +737,368 @@ static int apparmor_task_kill(struct task_struct *target, struct siginfo *info, return error; } +/** + * apparmor_sk_alloc_security - allocate and attach the sk_security field + */ +static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) +{ + struct aa_sk_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), flags); + if (!ctx) + return -ENOMEM; + + SK_CTX(sk) = ctx; + + return 0; +} + +/** + * apparmor_sk_free_security - free the sk_security field + */ +static void apparmor_sk_free_security(struct sock *sk) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + SK_CTX(sk) = NULL; + aa_put_label(ctx->label); + aa_put_label(ctx->peer); + path_put(&ctx->path); + kfree(ctx); +} + +/** + * apparmor_clone_security - clone the sk_security field + */ +static void apparmor_sk_clone_security(const struct sock *sk, + struct sock *newsk) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + struct aa_sk_ctx *new = SK_CTX(newsk); + + new->label = aa_get_label(ctx->label); + new->peer = aa_get_label(ctx->peer); + new->path = ctx->path; + path_get(&new->path); +} + +static int aa_sock_create_perm(struct aa_label *label, int family, int type, + int protocol) +{ + AA_BUG(!label); + AA_BUG(in_interrupt()); + + return aa_af_perm(label, OP_CREATE, AA_MAY_CREATE, family, type, + protocol); +} + + +/** + * apparmor_socket_create - check perms before creating a new socket + */ +static int apparmor_socket_create(int family, int type, int protocol, int kern) +{ + struct aa_label *label; + int error = 0; + + label = begin_current_label_crit_section(); + if (!(kern || unconfined(label))) + error = aa_sock_create_perm(label, family, type, protocol); + end_current_label_crit_section(label); + + return error; +} + +/** + * apparmor_socket_post_create - setup the per-socket security struct + * + * Note: + * - kernel sockets currently labeled unconfined but we may want to + * move to a special kernel label + * - socket may not have sk here if created with sock_create_lite or + * sock_alloc. These should be accept cases which will be handled in + * sock_graft. + */ +static int apparmor_socket_post_create(struct socket *sock, int family, + int type, int protocol, int kern) +{ + struct aa_label *label; + + if (kern) { + struct aa_ns *ns = aa_get_current_ns(); + + label = aa_get_label(ns_unconfined(ns)); + aa_put_ns(ns); + } else + label = aa_get_current_label(); + + if (sock->sk) { + struct aa_sk_ctx *ctx = SK_CTX(sock->sk); + + aa_put_label(ctx->label); + ctx->label = aa_get_label(label); + } + aa_put_label(label); + + return 0; +} + +/** + * apparmor_socket_bind - check perms before bind addr to socket + */ +static int apparmor_socket_bind(struct socket *sock, + struct sockaddr *address, int addrlen) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!address); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk); +} + +/** + * apparmor_socket_connect - check perms before connecting @sock to @address + */ +static int apparmor_socket_connect(struct socket *sock, + struct sockaddr *address, int addrlen) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!address); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk); +} + +/** + * apparmor_socket_list - check perms before allowing listen + */ +static int apparmor_socket_listen(struct socket *sock, int backlog) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk); +} + +/** + * apparmor_socket_accept - check perms before accepting a new connection. + * + * Note: while @newsock is created and has some information, the accept + * has not been done. + */ +static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!newsock); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk); +} + +static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, + struct msghdr *msg, int size) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!msg); + AA_BUG(in_interrupt()); + + return aa_sk_perm(op, request, sock->sk); +} + +/** + * apparmor_socket_sendmsg - check perms before sending msg to another socket + */ +static int apparmor_socket_sendmsg(struct socket *sock, + struct msghdr *msg, int size) +{ + return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); +} + +/** + * apparmor_socket_recvmsg - check perms before receiving a message + */ +static int apparmor_socket_recvmsg(struct socket *sock, + struct msghdr *msg, int size, int flags) +{ + return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); +} + +/* revaliation, get/set attr, shutdown */ +static int aa_sock_perm(const char *op, u32 request, struct socket *sock) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(in_interrupt()); + + return aa_sk_perm(op, request, sock->sk); +} + +/** + * apparmor_socket_getsockname - check perms before getting the local address + */ +static int apparmor_socket_getsockname(struct socket *sock) +{ + return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); +} + +/** + * apparmor_socket_getpeername - check perms before getting remote address + */ +static int apparmor_socket_getpeername(struct socket *sock) +{ + return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); +} + +/* revaliation, get/set attr, opt */ +static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, + int level, int optname) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(in_interrupt()); + + return aa_sk_perm(op, request, sock->sk); +} + +/** + * apparmor_getsockopt - check perms before getting socket options + */ +static int apparmor_socket_getsockopt(struct socket *sock, int level, + int optname) +{ + return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, + level, optname); +} + +/** + * apparmor_setsockopt - check perms before setting socket options + */ +static int apparmor_socket_setsockopt(struct socket *sock, int level, + int optname) +{ + return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, + level, optname); +} + +/** + * apparmor_socket_shutdown - check perms before shutting down @sock conn + */ +static int apparmor_socket_shutdown(struct socket *sock, int how) +{ + return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); +} + +/** + * apparmor_socket_sock_recv_skb - check perms before associating skb to sk + * + * Note: can not sleep may be called with locks held + * + * dont want protocol specific in __skb_recv_datagram() + * to deny an incoming connection socket_sock_rcv_skb() + */ +static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + + +static struct aa_label *sk_peer_label(struct sock *sk) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (ctx->peer) + return ctx->peer; + + return ERR_PTR(-ENOPROTOOPT); +} + +/** + * apparmor_socket_getpeersec_stream - get security context of peer + * + * Note: for tcp only valid if using ipsec or cipso on lan + */ +static int apparmor_socket_getpeersec_stream(struct socket *sock, + char __user *optval, + int __user *optlen, + unsigned int len) +{ + char *name; + int slen, error = 0; + struct aa_label *label; + struct aa_label *peer; + + label = begin_current_label_crit_section(); + peer = sk_peer_label(sock->sk); + if (IS_ERR(peer)) { + error = PTR_ERR(peer); + goto done; + } + slen = aa_label_asxprint(&name, labels_ns(label), peer, + FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | + FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); + /* don't include terminating \0 in slen, it breaks some apps */ + if (slen < 0) { + error = -ENOMEM; + } else { + if (slen > len) { + error = -ERANGE; + } else if (copy_to_user(optval, name, slen)) { + error = -EFAULT; + goto out; + } + if (put_user(slen, optlen)) + error = -EFAULT; +out: + kfree(name); + + } + +done: + end_current_label_crit_section(label); + + return error; +} + +/** + * apparmor_socket_getpeersec_dgram - get security label of packet + * @sock: the peer socket + * @skb: packet data + * @secid: pointer to where to put the secid of the packet + * + * Sets the netlabel socket state on sk from parent + */ +static int apparmor_socket_getpeersec_dgram(struct socket *sock, + struct sk_buff *skb, u32 *secid) + +{ + /* TODO: requires secid support */ + return -ENOPROTOOPT; +} + +/** + * apparmor_sock_graft - Initialize newly created socket + * @sk: child sock + * @parent: parent socket + * + * Note: could set off of SOCK_CTX(parent) but need to track inode and we can + * just set sk security information off of current creating process label + * Labeling of sk for accept case - probably should be sock based + * instead of task, because of the case where an implicitly labeled + * socket is shared by different tasks. + */ +static void apparmor_sock_graft(struct sock *sk, struct socket *parent) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (!ctx->label) + ctx->label = aa_get_current_label(); +} + static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -770,6 +1133,30 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), + LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), + LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), + LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), + + LSM_HOOK_INIT(socket_create, apparmor_socket_create), + LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), + LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), + LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), + LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), + LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), + LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), + LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), + LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), + LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), + LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), + LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), + LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), + LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), + LSM_HOOK_INIT(socket_getpeersec_stream, + apparmor_socket_getpeersec_stream), + LSM_HOOK_INIT(socket_getpeersec_dgram, + apparmor_socket_getpeersec_dgram), + LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), + LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), diff --git a/security/apparmor/net.c b/security/apparmor/net.c new file mode 100644 index 000000000000..33d54435f8d6 --- /dev/null +++ b/security/apparmor/net.c @@ -0,0 +1,184 @@ +/* + * AppArmor security module + * + * This file contains AppArmor network mediation + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include "include/apparmor.h" +#include "include/audit.h" +#include "include/context.h" +#include "include/label.h" +#include "include/net.h" +#include "include/policy.h" + +#include "net_names.h" + + +struct aa_sfs_entry aa_sfs_entry_network[] = { + AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), + { } +}; + +static const char * const net_mask_names[] = { + "unknown", + "send", + "receive", + "unknown", + + "create", + "shutdown", + "connect", + "unknown", + + "setattr", + "getattr", + "setcred", + "getcred", + + "chmod", + "chown", + "chgrp", + "lock", + + "mmap", + "mprot", + "unknown", + "unknown", + + "accept", + "bind", + "listen", + "unknown", + + "setopt", + "getopt", + "unknown", + "unknown", + + "unknown", + "unknown", + "unknown", + "unknown", +}; + + +/* audit callback for net specific fields */ +void audit_net_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + audit_log_format(ab, " family="); + if (address_family_names[sa->u.net->family]) + audit_log_string(ab, address_family_names[sa->u.net->family]); + else + audit_log_format(ab, "\"unknown(%d)\"", sa->u.net->family); + audit_log_format(ab, " sock_type="); + if (sock_type_names[aad(sa)->net.type]) + audit_log_string(ab, sock_type_names[aad(sa)->net.type]); + else + audit_log_format(ab, "\"unknown(%d)\"", aad(sa)->net.type); + audit_log_format(ab, " protocol=%d", aad(sa)->net.protocol); + + if (aad(sa)->request & NET_PERMS_MASK) { + audit_log_format(ab, " requested_mask="); + aa_audit_perm_mask(ab, aad(sa)->request, NULL, 0, + net_mask_names, NET_PERMS_MASK); + + if (aad(sa)->denied & NET_PERMS_MASK) { + audit_log_format(ab, " denied_mask="); + aa_audit_perm_mask(ab, aad(sa)->denied, NULL, 0, + net_mask_names, NET_PERMS_MASK); + } + } + if (aad(sa)->peer) { + audit_log_format(ab, " peer="); + aa_label_xaudit(ab, labels_ns(aad(sa)->label), aad(sa)->peer, + FLAGS_NONE, GFP_ATOMIC); + } +} + + +/* Generic af perm */ +int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, + u32 request, u16 family, int type) +{ + struct aa_perms perms = { }; + + AA_BUG(family >= AF_MAX); + AA_BUG(type < 0 || type >= SOCK_MAX); + + if (profile_unconfined(profile)) + return 0; + + perms.allow = (profile->net.allow[family] & (1 << type)) ? + ALL_PERMS_MASK : 0; + perms.audit = (profile->net.audit[family] & (1 << type)) ? + ALL_PERMS_MASK : 0; + perms.quiet = (profile->net.quiet[family] & (1 << type)) ? + ALL_PERMS_MASK : 0; + aa_apply_modes_to_perms(profile, &perms); + + return aa_check_perms(profile, &perms, request, sa, audit_net_cb); +} + +int aa_af_perm(struct aa_label *label, const char *op, u32 request, u16 family, + int type, int protocol) +{ + struct aa_profile *profile; + DEFINE_AUDIT_NET(sa, op, NULL, family, type, protocol); + + return fn_for_each_confined(label, profile, + aa_profile_af_perm(profile, &sa, request, family, + type)); +} + +static int aa_label_sk_perm(struct aa_label *label, const char *op, u32 request, + struct sock *sk) +{ + struct aa_profile *profile; + DEFINE_AUDIT_SK(sa, op, sk); + + AA_BUG(!label); + AA_BUG(!sk); + + if (unconfined(label)) + return 0; + + return fn_for_each_confined(label, profile, + aa_profile_af_sk_perm(profile, &sa, request, sk)); +} + +int aa_sk_perm(const char *op, u32 request, struct sock *sk) +{ + struct aa_label *label; + int error; + + AA_BUG(!sk); + AA_BUG(in_interrupt()); + + /* TODO: switch to begin_current_label ???? */ + label = begin_current_label_crit_section(); + error = aa_label_sk_perm(label, op, request, sk); + end_current_label_crit_section(label); + + return error; +} + + +int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, + struct socket *sock) +{ + AA_BUG(!label); + AA_BUG(!sock); + AA_BUG(!sock->sk); + + return aa_label_sk_perm(label, op, request, sock->sk); +} diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 4ede87c30f8b..5a2aec358322 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -275,6 +275,19 @@ fail: return 0; } +static bool unpack_u16(struct aa_ext *e, u16 *data, const char *name) +{ + if (unpack_nameX(e, AA_U16, name)) { + if (!inbounds(e, sizeof(u16))) + return 0; + if (data) + *data = le16_to_cpu(get_unaligned((__le16 *) e->pos)); + e->pos += sizeof(u16); + return 1; + } + return 0; +} + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) { if (unpack_nameX(e, AA_U32, name)) { @@ -584,7 +597,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) struct aa_profile *profile = NULL; const char *tmpname, *tmpns = NULL, *name = NULL; const char *info = "failed to unpack profile"; - size_t ns_len; + size_t size = 0, ns_len; struct rhashtable_params params = { 0 }; char *key = NULL; struct aa_data *data; @@ -717,6 +730,38 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + size = unpack_array(e, "net_allowed_af"); + if (size) { + + for (i = 0; i < size; i++) { + /* discard extraneous rules that this kernel will + * never request + */ + if (i >= AF_MAX) { + u16 tmp; + + if (!unpack_u16(e, &tmp, NULL) || + !unpack_u16(e, &tmp, NULL) || + !unpack_u16(e, &tmp, NULL)) + goto fail; + continue; + } + if (!unpack_u16(e, &profile->net.allow[i], NULL)) + goto fail; + if (!unpack_u16(e, &profile->net.audit[i], NULL)) + goto fail; + if (!unpack_u16(e, &profile->net.quiet[i], NULL)) + goto fail; + } + if (!unpack_nameX(e, AA_ARRAYEND, NULL)) + goto fail; + } + if (VERSION_LT(e->version, v7)) { + /* pre v7 policy always allowed these */ + profile->net.allow[AF_UNIX] = 0xffff; + profile->net.allow[AF_NETLINK] = 0xffff; + } + if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ info = "failed to unpack policydb"; -- cgit From d07881d2edb0ab783846730629ac2faeaafdf4f1 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 08:59:57 -0700 Subject: apparmor: move new_null_profile to after profile lookup fns() new_null_profile will need to use some of the profile lookup fns() so move instead of doing forward fn declarations. Signed-off-by: John Johansen --- security/apparmor/policy.c | 158 ++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 244ea4a4a8f0..a81a384a63b1 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -289,85 +289,6 @@ fail: return NULL; } -/** - * aa_new_null_profile - create or find a null-X learning profile - * @parent: profile that caused this profile to be created (NOT NULL) - * @hat: true if the null- learning profile is a hat - * @base: name to base the null profile off of - * @gfp: type of allocation - * - * Find/Create a null- complain mode profile used in learning mode. The - * name of the profile is unique and follows the format of parent//null-XXX. - * where XXX is based on the @name or if that fails or is not supplied - * a unique number - * - * null profiles are added to the profile list but the list does not - * hold a count on them so that they are automatically released when - * not in use. - * - * Returns: new refcounted profile else NULL on failure - */ -struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, - const char *base, gfp_t gfp) -{ - struct aa_profile *profile; - char *name; - - AA_BUG(!parent); - - if (base) { - name = kmalloc(strlen(parent->base.hname) + 8 + strlen(base), - gfp); - if (name) { - sprintf(name, "%s//null-%s", parent->base.hname, base); - goto name; - } - /* fall through to try shorter uniq */ - } - - name = kmalloc(strlen(parent->base.hname) + 2 + 7 + 8, gfp); - if (!name) - return NULL; - sprintf(name, "%s//null-%x", parent->base.hname, - atomic_inc_return(&parent->ns->uniq_null)); - -name: - /* lookup to see if this is a dup creation */ - profile = aa_find_child(parent, basename(name)); - if (profile) - goto out; - - profile = aa_alloc_profile(name, NULL, gfp); - if (!profile) - goto fail; - - profile->mode = APPARMOR_COMPLAIN; - profile->label.flags |= FLAG_NULL; - if (hat) - profile->label.flags |= FLAG_HAT; - profile->path_flags = parent->path_flags; - - /* released on free_profile */ - rcu_assign_pointer(profile->parent, aa_get_profile(parent)); - profile->ns = aa_get_ns(parent->ns); - profile->file.dfa = aa_get_dfa(nulldfa); - profile->policy.dfa = aa_get_dfa(nulldfa); - - mutex_lock(&profile->ns->lock); - __add_profile(&parent->base.profiles, profile); - mutex_unlock(&profile->ns->lock); - - /* refcount released by caller */ -out: - kfree(name); - - return profile; - -fail: - aa_free_profile(profile); - return NULL; -} - /* TODO: profile accounting - setup in remove */ /** @@ -558,6 +479,85 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, return profile; } +/** + * aa_new_null_profile - create or find a null-X learning profile + * @parent: profile that caused this profile to be created (NOT NULL) + * @hat: true if the null- learning profile is a hat + * @base: name to base the null profile off of + * @gfp: type of allocation + * + * Find/Create a null- complain mode profile used in learning mode. The + * name of the profile is unique and follows the format of parent//null-XXX. + * where XXX is based on the @name or if that fails or is not supplied + * a unique number + * + * null profiles are added to the profile list but the list does not + * hold a count on them so that they are automatically released when + * not in use. + * + * Returns: new refcounted profile else NULL on failure + */ +struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, + const char *base, gfp_t gfp) +{ + struct aa_profile *profile; + char *name; + + AA_BUG(!parent); + + if (base) { + name = kmalloc(strlen(parent->base.hname) + 8 + strlen(base), + gfp); + if (name) { + sprintf(name, "%s//null-%s", parent->base.hname, base); + goto name; + } + /* fall through to try shorter uniq */ + } + + name = kmalloc(strlen(parent->base.hname) + 2 + 7 + 8, gfp); + if (!name) + return NULL; + sprintf(name, "%s//null-%x", parent->base.hname, + atomic_inc_return(&parent->ns->uniq_null)); + +name: + /* lookup to see if this is a dup creation */ + profile = aa_find_child(parent, basename(name)); + if (profile) + goto out; + + profile = aa_alloc_profile(name, NULL, gfp); + if (!profile) + goto fail; + + profile->mode = APPARMOR_COMPLAIN; + profile->label.flags |= FLAG_NULL; + if (hat) + profile->label.flags |= FLAG_HAT; + profile->path_flags = parent->path_flags; + + /* released on free_profile */ + rcu_assign_pointer(profile->parent, aa_get_profile(parent)); + profile->ns = aa_get_ns(parent->ns); + profile->file.dfa = aa_get_dfa(nulldfa); + profile->policy.dfa = aa_get_dfa(nulldfa); + + mutex_lock(&profile->ns->lock); + __add_profile(&parent->base.profiles, profile); + mutex_unlock(&profile->ns->lock); + + /* refcount released by caller */ +out: + kfree(name); + + return profile; + +fail: + aa_free_profile(profile); + return NULL; +} + /** * replacement_allowed - test to see if replacement is allowed * @profile: profile to test if it can be replaced (MAYBE NULL) -- cgit From 290638a52a808d658bd04b746b3ca46886c157e0 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 05:40:49 -0700 Subject: apparmor: fix race condition in null profile creation There is a race when null- profile is being created between the initial lookup/creation of the profile and lock/addition of the profile. This could result in multiple version of a profile being added to the list which need to be removed/replaced. Since these are learning profile their is no affect on mediation. Signed-off-by: John Johansen --- security/apparmor/policy.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index a81a384a63b1..4243b0c3f0e4 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -500,7 +500,8 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, const char *base, gfp_t gfp) { - struct aa_profile *profile; + struct aa_profile *p, *profile; + const char *bname; char *name; AA_BUG(!parent); @@ -523,7 +524,8 @@ struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, name: /* lookup to see if this is a dup creation */ - profile = aa_find_child(parent, basename(name)); + bname = basename(name); + profile = aa_find_child(parent, bname); if (profile) goto out; @@ -544,7 +546,13 @@ name: profile->policy.dfa = aa_get_dfa(nulldfa); mutex_lock(&profile->ns->lock); - __add_profile(&parent->base.profiles, profile); + p = __find_child(&parent->base.profiles, bname); + if (p) { + aa_free_profile(profile); + profile = aa_get_profile(p); + } else { + __add_profile(&parent->base.profiles, profile); + } mutex_unlock(&profile->ns->lock); /* refcount released by caller */ -- cgit From 15372b97aa7593c6f5bc1afe69f42fd403c40685 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 05:48:06 -0700 Subject: apparmor: ensure unconfined profiles have dfas initialized Generally unconfined has early bailout tests and does not need the dfas initialized, however if an early bailout test is ever missed it will result in an oops. Be defensive and initialize the unconfined profile to have null dfas (no permission) so if an early bailout test is missed we fail closed (no perms granted) instead of oopsing. Signed-off-by: John Johansen --- security/apparmor/policy_ns.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index 351d3bab3a3d..62a3589c62ab 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -112,6 +112,8 @@ static struct aa_ns *alloc_ns(const char *prefix, const char *name) ns->unconfined->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; ns->unconfined->mode = APPARMOR_UNCONFINED; + ns->unconfined->file.dfa = aa_get_dfa(nulldfa); + ns->unconfined->policy.dfa = aa_get_dfa(nulldfa); /* ns and ns->unconfined share ns->unconfined refcount */ ns->unconfined->ns = ns; -- cgit From bc4d82fb946e7b471eab2a80e384227c4eb15652 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 09:33:48 -0700 Subject: apparmor: fix incorrect type assignment when freeing proxies sparse reports poisoning the proxy->label before freeing the struct is resulting in a sparse build warning. ../security/apparmor/label.c:52:30: warning: incorrect type in assignment (different address spaces) ../security/apparmor/label.c:52:30: expected struct aa_label [noderef] *label ../security/apparmor/label.c:52:30: got struct aa_label * fix with RCU_INIT_POINTER as this is one of those cases where rcu_assign_pointer() is not needed. Signed-off-by: John Johansen --- security/apparmor/label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 52b4ef14840d..c5b99b954580 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -49,7 +49,7 @@ static void free_proxy(struct aa_proxy *proxy) /* p->label will not updated any more as p is dead */ aa_put_label(rcu_dereference_protected(proxy->label, true)); memset(proxy, 0, sizeof(*proxy)); - proxy->label = (struct aa_label *) PROXY_POISON; + RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON); kfree(proxy); } } -- cgit From b1545dba092ba543eab1f7b5ed757a4988e267c8 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 23 Aug 2017 12:10:39 -0700 Subject: apparmor: fix build failure on sparc caused by undeclared signals In file included from security/apparmor/ipc.c:23:0: security/apparmor/include/sig_names.h:26:3: error: 'SIGSTKFLT' undeclared here (not in a function) [SIGSTKFLT] = 16, /* -, 16, - */ ^ security/apparmor/include/sig_names.h:26:3: error: array index in initializer not of integer type security/apparmor/include/sig_names.h:26:3: note: (near initialization for 'sig_map') security/apparmor/include/sig_names.h:51:3: error: 'SIGUNUSED' undeclared here (not in a function) [SIGUNUSED] = 34, /* -, 31, - */ ^ security/apparmor/include/sig_names.h:51:3: error: array index in initializer not of integer type security/apparmor/include/sig_names.h:51:3: note: (near initialization for 'sig_map') Reported-by: Stephen Rothwell Fixes: c6bf1adaecaa ("apparmor: add the ability to mediate signals") Signed-off-by: John Johansen --- security/apparmor/include/sig_names.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/security/apparmor/include/sig_names.h b/security/apparmor/include/sig_names.h index 0d4395f231ca..92e62fe95292 100644 --- a/security/apparmor/include/sig_names.h +++ b/security/apparmor/include/sig_names.h @@ -23,7 +23,9 @@ static const int sig_map[MAXMAPPED_SIG] = { [SIGPIPE] = 13, [SIGALRM] = 14, [SIGTERM] = 15, +#ifdef SIGSTKFLT [SIGSTKFLT] = 16, /* -, 16, - */ +#endif [SIGCHLD] = 17, /* 20, 17, 18. SIGCHLD -, -, 18 */ [SIGCONT] = 18, /* 19, 18, 25 */ [SIGSTOP] = 19, /* 17, 19, 23 */ @@ -47,7 +49,8 @@ static const int sig_map[MAXMAPPED_SIG] = { #if defined(SIGLOST) && SIGPWR != SIGLOST /* sparc */ [SIGLOST] = 33, /* unused on Linux */ #endif -#if defined(SIGLOST) && defined(SIGSYS) && SIGLOST != SIGSYS +#if defined(SIGUNUSED) && \ + defined(SIGLOST) && defined(SIGSYS) && SIGLOST != SIGSYS [SIGUNUSED] = 34, /* -, 31, - */ #endif }; -- cgit From bf81100f63db7ea243d17b9d5008ba3af2fdf6b2 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 31 Aug 2017 09:54:43 -0700 Subject: apparmor: fix apparmorfs DAC access permissions The DAC access permissions for several apparmorfs files are wrong. .access - needs to be writable by all tasks to perform queries the others in the set only provide a read fn so should be read only. With policy namespace virtualization all apparmor needs to control the permission and visibility checks directly which means DAC access has to be allowed for all user, group, and other. BugLink: http://bugs.launchpad.net/bugs/1713103 Fixes: c97204baf840b ("apparmor: rename apparmor file fns and data to indicate use") Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 125dad5c3fde..518d5928661b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2215,12 +2215,12 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { }; static struct aa_sfs_entry aa_sfs_entry_apparmor[] = { - AA_SFS_FILE_FOPS(".access", 0640, &aa_sfs_access), + AA_SFS_FILE_FOPS(".access", 0666, &aa_sfs_access), AA_SFS_FILE_FOPS(".stacked", 0444, &seq_ns_stacked_fops), AA_SFS_FILE_FOPS(".ns_stacked", 0444, &seq_ns_nsstacked_fops), - AA_SFS_FILE_FOPS(".ns_level", 0666, &seq_ns_level_fops), - AA_SFS_FILE_FOPS(".ns_name", 0640, &seq_ns_name_fops), - AA_SFS_FILE_FOPS("profiles", 0440, &aa_sfs_profiles_fops), + AA_SFS_FILE_FOPS(".ns_level", 0444, &seq_ns_level_fops), + AA_SFS_FILE_FOPS(".ns_name", 0444, &seq_ns_name_fops), + AA_SFS_FILE_FOPS("profiles", 0444, &aa_sfs_profiles_fops), AA_SFS_DIR("features", aa_sfs_entry_features), { } }; -- cgit From af21b01d1166248f282fc02d0f459c94de06615e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 22 Sep 2017 22:24:02 +0200 Subject: parisc: Reintroduce option to gzip-compress the kernel By adding the feature to build the kernel as self-extracting executeable, the possibility to simply compress the kernel with gzip was lost. This patch now reintroduces this possibilty again and leaves it up to the user to decide how the kernel should be built. The palo bootloader is able to natively load both formats. Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 12 ++++++++++++ arch/parisc/Makefile | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index ba7b7ddc3844..a57dedbfc7b7 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -257,6 +257,18 @@ config PARISC_PAGE_SIZE_64KB endchoice +config PARISC_SELF_EXTRACT + bool "Build kernel as self-extracting executable" + default y + help + Say Y if you want to build the parisc kernel as a kind of + self-extracting executable. + + If you say N here, the kernel will be compressed with gzip + which can be loaded by the palo bootloader directly too. + + If you don't know what to do here, say Y. + config SMP bool "Symmetric multi-processing support" ---help--- diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 58fae5d2449d..01946ebaff72 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -129,8 +129,13 @@ Image: vmlinux bzImage: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ +ifdef CONFIG_PARISC_SELF_EXTRACT vmlinuz: bzImage $(OBJCOPY) $(boot)/bzImage $@ +else +vmlinuz: vmlinux + @gzip -cf -9 $< > $@ +endif install: $(CONFIG_SHELL) $(src)/arch/parisc/install.sh \ -- cgit From 8c031ba63f8f2a9efc471cb45b2ff18271556544 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 22 Sep 2017 21:57:11 +0200 Subject: parisc: Unbreak bootloader due to gcc-7 optimizations gcc-7 optimizes the byte-wise accesses of get_unaligned_le32() into word-wise accesses if the 32-bit integer output_len is declared as external. This panics then the bootloader since we don't have the unaligned access fault trap handler installed during boot time. Avoid this optimization by declaring output_len as byte-aligned and thus unbreak the bootloader code. Additionally, compile the boot code optimized for size. Signed-off-by: Helge Deller --- arch/parisc/boot/compressed/Makefile | 2 +- arch/parisc/boot/compressed/misc.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index 5450a11c9d10..7d7e594bda36 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -15,7 +15,7 @@ targets += misc.o piggy.o sizes.h head.o real2.o firmware.o KBUILD_CFLAGS := -D__KERNEL__ -O2 -DBOOTLOADER KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs +KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os ifndef CONFIG_64BIT KBUILD_CFLAGS += -mfast-indirect-calls endif diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 13a4bf9ac4da..9345b44b86f0 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -24,7 +24,8 @@ /* Symbols defined by linker scripts */ extern char input_data[]; extern int input_len; -extern __le32 output_len; /* at unaligned address, little-endian */ +/* output_len is inserted by the linker possibly at an unaligned address */ +extern __le32 output_len __aligned(1); extern char _text, _end; extern char _bss, _ebss; extern char _startcode_end; -- cgit From c17c02040bf0d186cebd3e66ff349f955575bf38 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 22 Sep 2017 09:42:42 +0200 Subject: arch: remove unused *_segments() macros/functions Some architectures define the no-op macros/functions copy_segments, release_segments and forget_segments. These are used nowhere in the tree, so removed them. Signed-off-by: Tobias Klauser Acked-by: Vineet Gupta [for arch/arc] Signed-off-by: Linus Torvalds --- arch/arc/include/asm/processor.h | 3 --- arch/c6x/include/asm/processor.h | 3 --- arch/frv/include/asm/processor.h | 4 ---- arch/m32r/include/asm/processor.h | 8 -------- arch/metag/include/asm/processor.h | 3 --- arch/mn10300/kernel/process.c | 12 ------------ arch/sh/include/asm/processor_32.h | 4 ---- arch/sh/include/asm/processor_64.h | 4 ---- arch/um/include/asm/processor-generic.h | 5 ----- arch/xtensa/include/asm/processor.h | 5 ----- 10 files changed, 51 deletions(-) diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index d400a2161935..8ee41e988169 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -78,9 +78,6 @@ struct task_struct; #endif -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - #define KSTK_EIP(tsk) (task_pt_regs(tsk)->ret) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp) diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h index 7c87b5be53b5..8f7cce829f8e 100644 --- a/arch/c6x/include/asm/processor.h +++ b/arch/c6x/include/asm/processor.h @@ -92,9 +92,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - /* * saved kernel SP and DP of a blocked thread. */ diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index e4d08d74ed9f..021cce78b401 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -92,10 +92,6 @@ static inline void release_thread(struct task_struct *dead_task) extern asmlinkage void save_user_regs(struct user_context *target); extern asmlinkage void *restore_user_regs(const struct user_context *target, ...); -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) -#define forget_segments() do { } while (0) - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.frame0->pc) diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h index 657874eeeccc..c70fa9ac7169 100644 --- a/arch/m32r/include/asm/processor.h +++ b/arch/m32r/include/asm/processor.h @@ -118,14 +118,6 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Copy and release all segment info associated with a VM */ -extern void copy_segments(struct task_struct *p, struct mm_struct * mm); -extern void release_segments(struct mm_struct * mm); - -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.lr) #define KSTK_ESP(tsk) ((tsk)->thread.sp) diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h index ec6a49076980..8ae92d6abfd2 100644 --- a/arch/metag/include/asm/processor.h +++ b/arch/metag/include/asm/processor.h @@ -131,9 +131,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - /* * Return saved PC of a blocked thread. */ diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 89e8027e07fb..7c475fd99c46 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -59,10 +59,6 @@ void arch_cpu_idle(void) } #endif -void release_segments(struct mm_struct *mm) -{ -} - void machine_restart(char *cmd) { #ifdef CONFIG_KERNEL_DEBUGGER @@ -112,14 +108,6 @@ void release_thread(struct task_struct *dead_task) { } -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -void copy_segments(struct task_struct *p, struct mm_struct *new_mm) -{ -} - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 18e0377f72bb..88ce1e22237b 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -136,10 +136,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned lo /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while(0) -#define release_segments(mm) do { } while(0) - /* * FPU lazy state save handling. */ diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h index eedd4f625d07..777a16318aff 100644 --- a/arch/sh/include/asm/processor_64.h +++ b/arch/sh/include/asm/processor_64.h @@ -170,10 +170,6 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while (0) -#define release_segments(mm) do { } while (0) -#define forget_segments() do { } while (0) /* * FPU lazy state save handling. */ diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index f6d1a3f747a9..86942a492454 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -58,11 +58,6 @@ static inline void release_thread(struct task_struct *task) { } -static inline void mm_copy_segments(struct mm_struct *from_mm, - struct mm_struct *new_mm) -{ -} - #define init_stack (init_thread_union.stack) /* diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 30ee8c608853..5b0027d4ecc0 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -208,11 +208,6 @@ struct mm_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while(0) -#define release_segments(mm) do { } while(0) -#define forget_segments() do { } while (0) - extern unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) -- cgit From 6e70e26dc52be62c1f39f81b5f71fa5e643677aa Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 21 Sep 2017 21:32:29 -0500 Subject: SMB3: handle new statx fields We weren't returning the creation time or the two easily supported attributes (ENCRYPTED or COMPRESSED) for the getattr call to allow statx to return these fields. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg \ Acked-by: Jeff Layton CC: Stable Reviewed-by: Pavel Shilovsky --- fs/cifs/inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8693632235f..7c732cb44164 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -234,6 +234,8 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime); fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange); + /* old POSIX extensions don't get create time */ + fattr->cf_mode = le64_to_cpu(info->Permissions); /* @@ -2024,6 +2026,19 @@ int cifs_getattr(const struct path *path, struct kstat *stat, stat->blksize = CIFS_MAX_MSGSIZE; stat->ino = CIFS_I(inode)->uniqueid; + /* old CIFS Unix Extensions doesn't return create time */ + if (CIFS_I(inode)->createtime) { + stat->result_mask |= STATX_BTIME; + stat->btime = + cifs_NTtimeToUnix(cpu_to_le64(CIFS_I(inode)->createtime)); + } + + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED); + if (CIFS_I(inode)->cifsAttrs & FILE_ATTRIBUTE_COMPRESSED) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (CIFS_I(inode)->cifsAttrs & FILE_ATTRIBUTE_ENCRYPTED) + stat->attributes |= STATX_ATTR_ENCRYPTED; + /* * If on a multiuser mount without unix extensions or cifsacl being * enabled, and the admin hasn't overridden them, set the ownership -- cgit From 1013e760d10e614dc10b5624ce9fc41563ba2e65 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Sep 2017 01:40:27 -0500 Subject: SMB3: Don't ignore O_SYNC/O_DSYNC and O_DIRECT flags Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky --- fs/cifs/file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 822311919163..92fdf9c35de2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -224,6 +224,13 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; -- cgit From b9b95da92de9d498ece7e6a82e2d6dcfc76fd9d8 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Fri, 22 Sep 2017 14:28:46 +0200 Subject: MAINTAINERS: update git tree locations for ieee802154 subsystem Patches for ieee802154 will go through my new trees towards netdev from now on. The 6LoWPAN subsystem will stay as is (shared between ieee802154 and bluetooth) and go through the bluetooth tree as usual. Signed-off-by: Stefan Schmidt Signed-off-by: David S. Miller --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 955f034fd523..61ee134cf6a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6642,8 +6642,8 @@ M: Alexander Aring M: Stefan Schmidt L: linux-wpan@vger.kernel.org W: http://wpan.cakelab.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git S: Maintained F: net/ieee802154/ F: net/mac802154/ -- cgit From 581fe0ea61584d88072527ae9fb9dcb9d1f2783e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Sep 2017 19:42:37 -0400 Subject: net: orphan frags on stand-alone ptype in dev_queue_xmit_nit Zerocopy skbs frags are copied when the skb is looped to a local sock. Commit 1080e512d44d ("net: orphan frags on receive") introduced calls to skb_orphan_frags to deliver_skb and __netif_receive_skb for this. With msg_zerocopy, these skbs can also exist in the tx path and thus loop from dev_queue_xmit_nit. This already calls deliver_skb in its loop. But it does not orphan before a separate pt_prev->func(). Add the missing skb_orphan_frags_rx. Changes v1->v2: handle skb_orphan_frags_rx failure Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9a2254f9802f..588b473194a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,8 +1948,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); -- cgit From cbb2fb5c72f48d3029c144be0f0e61da1c7bccf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:06 -0400 Subject: net: set tb->fast_sk_family We need to set the tb->fast_sk_family properly so we can use the proper comparison function for all subsequent reuseport bind requests. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b9c64b40a83a..f87f4805e244 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -328,6 +328,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif -- cgit From 7a56673b58f2414679e926bba80309a037a4fd35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:07 -0400 Subject: net: use inet6_rcv_saddr to compare sockets In ipv6_rcv_saddr_equal() we need to use inet6_rcv_saddr(sk) for the ipv6 compare with the fast socket information to make sure we're doing the proper comparisons. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f87f4805e244..a1bf30438bc5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, -- cgit From fbed24bcc69d3e48c5402c371f19f5c7688871e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:08 -0400 Subject: inet: fix improper empty comparison When doing my reuseport rework I screwed up and changed a if (hlist_empty(&tb->owners)) to if (!hlist_empty(&tb->owners)) This is obviously bad as all of the reuseport/reuse logic was reversed, which caused weird problems like allowing an ipv4 bind conflict if we opened an ipv4 only socket on a port followed by an ipv6 only socket on the same port. Fixes: b9470c27607b ("inet: kill smallest_size and smallest_port") Reported-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a1bf30438bc5..c039c937ba90 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -321,7 +321,7 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; -- cgit From 0d0970eef3b03ef08b19da5bc3044410731cf38f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Sep 2017 16:24:32 -0500 Subject: objtool: Handle another GCC stack pointer adjustment bug The kbuild bot reported the following warning with GCC 4.4 and a randconfig: net/socket.o: warning: objtool: compat_sock_ioctl()+0x1083: stack state mismatch: cfa1=7+160 cfa2=-1+0 This is caused by another GCC non-optimization, where it backs up and restores the stack pointer for no apparent reason: 2f91: 48 89 e0 mov %rsp,%rax 2f94: 4c 89 e7 mov %r12,%rdi 2f97: 4c 89 f6 mov %r14,%rsi 2f9a: ba 20 00 00 00 mov $0x20,%edx 2f9f: 48 89 c4 mov %rax,%rsp This issue would have been happily ignored before the following commit: dd88a0a0c861 ("objtool: Handle GCC stack pointer adjustment bug") But now that objtool is paying attention to such stack pointer writes to/from a register, it needs to understand them properly. In this case that means recognizing that the "mov %rsp, %rax" instruction is potentially a backup of the stack pointer. Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: dd88a0a0c861 ("objtool: Handle GCC stack pointer adjustment bug") Link: http://lkml.kernel.org/r/8c7aa8e9a36fbbb6655d9d8e7cea58958c912da8.1505942196.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/arch/x86/decode.c | 6 +++--- tools/objtool/check.c | 43 +++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0e8c8ec4fd4e..0f22768c0d4d 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -208,14 +208,14 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, break; case 0x89: - if (rex == 0x48 && modrm == 0xe5) { + if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) { - /* mov %rsp, %rbp */ + /* mov %rsp, reg */ *type = INSN_STACK; op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_BP; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; break; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f744617c9946..a0c518ecf085 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1203,24 +1203,39 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) switch (op->src.type) { case OP_SRC_REG: - if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP) { + if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP && + cfa->base == CFI_SP && + regs[CFI_BP].base == CFI_CFA && + regs[CFI_BP].offset == -cfa->offset) { + + /* mov %rsp, %rbp */ + cfa->base = op->dest.reg; + state->bp_scratch = false; + } - if (cfa->base == CFI_SP && - regs[CFI_BP].base == CFI_CFA && - regs[CFI_BP].offset == -cfa->offset) { + else if (op->src.reg == CFI_SP && + op->dest.reg == CFI_BP && state->drap) { - /* mov %rsp, %rbp */ - cfa->base = op->dest.reg; - state->bp_scratch = false; - } + /* drap: mov %rsp, %rbp */ + regs[CFI_BP].base = CFI_BP; + regs[CFI_BP].offset = -state->stack_size; + state->bp_scratch = false; + } - else if (state->drap) { + else if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { - /* drap: mov %rsp, %rbp */ - regs[CFI_BP].base = CFI_BP; - regs[CFI_BP].offset = -state->stack_size; - state->bp_scratch = false; - } + /* + * mov %rsp, %reg + * + * This is needed for the rare case where GCC + * does: + * + * mov %rsp, %rax + * ... + * mov %rax, %rsp + */ + state->vals[op->dest.reg].base = CFI_CFA; + state->vals[op->dest.reg].offset = -state->stack_size; } else if (op->dest.reg == cfa->base) { -- cgit From f5caf621ee357279e759c0911daf6d55c7d36f03 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Sep 2017 16:24:33 -0500 Subject: x86/asm: Fix inline asm call constraints for Clang For inline asm statements which have a CALL instruction, we list the stack pointer as a constraint to convince GCC to ensure the frame pointer is set up first: static inline void foo() { register void *__sp asm(_ASM_SP); asm("call bar" : "+r" (__sp)) } Unfortunately, that pattern causes Clang to corrupt the stack pointer. The fix is easy: convert the stack pointer register variable to a global variable. It should be noted that the end result is different based on the GCC version. With GCC 6.4, this patch has exactly the same result as before: defconfig defconfig-nofp distro distro-nofp before 9820389 9491555 8816046 8516940 after 9820389 9491555 8816046 8516940 With GCC 7.2, however, GCC's behavior has changed. It now changes its behavior based on the conversion of the register variable to a global. That somehow convinces it to *always* set up the frame pointer before inserting *any* inline asm. (Therefore, listing the variable as an output constraint is a no-op and is no longer necessary.) It's a bit overkill, but the performance impact should be negligible. And in fact, there's a nice improvement with frame pointers disabled: defconfig defconfig-nofp distro distro-nofp before 9796316 9468236 9076191 8790305 after 9796957 9464267 9076381 8785949 So in summary, while listing the stack pointer as an output constraint is no longer necessary for newer versions of GCC, it's still needed for older versions. Suggested-by: Andrey Ryabinin Reported-by: Matthias Kaehlcke Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Linus Torvalds Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3db862e970c432ae823cf515c52b54fec8270e0e.1505942196.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 3 +-- arch/x86/include/asm/asm.h | 11 +++++++++++ arch/x86/include/asm/mshyperv.h | 10 ++++------ arch/x86/include/asm/paravirt_types.h | 14 +++++++------- arch/x86/include/asm/preempt.h | 15 +++++---------- arch/x86/include/asm/processor.h | 6 ++---- arch/x86/include/asm/rwsem.h | 4 ++-- arch/x86/include/asm/uaccess.h | 4 ++-- arch/x86/include/asm/xen/hypercall.h | 5 ++--- arch/x86/kvm/emulate.c | 3 +-- arch/x86/kvm/vmx.c | 3 +-- arch/x86/mm/fault.c | 3 +-- tools/objtool/Documentation/stack-validation.txt | 6 +++--- 13 files changed, 42 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1b020381ab38..c096624137ae 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) \ { \ - register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output, "+r" (__sp) \ + : output, ASM_CALL_CONSTRAINT \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ [new2] "i" (newfunc2), ## input); \ } diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 676ee5807d86..c1eadbaf1115 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -132,4 +132,15 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. + */ +register unsigned int __asm_call_sp asm("esp"); +#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) +#endif + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 63cc96f064dc..738503e1f80c 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -179,7 +179,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) u64 input_address = input ? virt_to_phys(input) : 0; u64 output_address = output ? virt_to_phys(output) : 0; u64 hv_status; - register void *__sp asm(_ASM_SP); #ifdef CONFIG_X86_64 if (!hv_hypercall_pg) @@ -187,7 +186,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) __asm__ __volatile__("mov %4, %%r8\n" "call *%5" - : "=a" (hv_status), "+r" (__sp), + : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) : "r" (output_address), "m" (hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); @@ -202,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) __asm__ __volatile__("call *%7" : "=A" (hv_status), - "+c" (input_address_lo), "+r" (__sp) + "+c" (input_address_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input_address_hi), "D"(output_address_hi), "S"(output_address_lo), @@ -224,12 +223,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) { u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; - register void *__sp asm(_ASM_SP); #ifdef CONFIG_X86_64 { __asm__ __volatile__("call *%4" - : "=a" (hv_status), "+r" (__sp), + : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) : "m" (hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); @@ -242,7 +240,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) __asm__ __volatile__ ("call *%5" : "=A"(hv_status), "+c"(input1_lo), - "+r"(__sp) + ASM_CALL_CONSTRAINT : "A" (control), "b" (input1_hi), "m" (hv_hypercall_pg) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 42873edd9f9d..280d94c36dad 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -459,8 +459,8 @@ int paravirt_disable_iospace(void); */ #ifdef CONFIG_X86_32 #define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \ - register void *__sp asm("esp") + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) @@ -480,8 +480,8 @@ int paravirt_disable_iospace(void); /* [re]ax isn't an arg, but the return val */ #define PVOP_VCALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx, __eax = __eax; \ - register void *__sp asm("rsp") + __edx = __edx, __ecx = __ecx, __eax = __eax; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) @@ -532,7 +532,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -542,7 +542,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -569,7 +569,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index ec1f3c651150..4f44505dbf87 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -100,19 +100,14 @@ static __always_inline bool should_resched(int preempt_offset) #ifdef CONFIG_PREEMPT extern asmlinkage void ___preempt_schedule(void); -# define __preempt_schedule() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \ -}) +# define __preempt_schedule() \ + asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); extern asmlinkage void ___preempt_schedule_notrace(void); -# define __preempt_schedule_notrace() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \ -}) +# define __preempt_schedule_notrace() \ + asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT) + extern asmlinkage void preempt_schedule_notrace(void); #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3fa26a61eabc..b390ff76e58f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -677,8 +677,6 @@ static inline void sync_core(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ - register void *__sp asm(_ASM_SP); - #ifdef CONFIG_X86_32 asm volatile ( "pushfl\n\t" @@ -686,7 +684,7 @@ static inline void sync_core(void) "pushl $1f\n\t" "iret\n\t" "1:" - : "+r" (__sp) : : "memory"); + : ASM_CALL_CONSTRAINT : : "memory"); #else unsigned int tmp; @@ -703,7 +701,7 @@ static inline void sync_core(void) "iretq\n\t" UNWIND_HINT_RESTORE "1:" - : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif } diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index a34e0d4b957d..7116b7931c7b 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) ({ \ long tmp; \ struct rw_semaphore* ret; \ - register void *__sp asm(_ASM_SP); \ \ asm volatile("# beginning down_write\n\t" \ LOCK_PREFIX " xadd %1,(%4)\n\t" \ @@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \ + : "+m" (sem->count), "=d" (tmp), \ + "=a" (ret), ASM_CALL_CONSTRAINT \ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 184eb9894dae..78e8fcc87d4c 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -166,11 +166,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ - register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P4" \ - : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ + : "=a" (__ret_gu), "=r" (__val_gu), \ + ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9606688caa4b..128a1a0b1450 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -113,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ - register void *__sp asm(_ASM_SP); + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; -#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) +#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 16bf6655aa85..f23f13403f33 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5296,7 +5296,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) @@ -5304,7 +5303,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) + [fastop]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06c0c6d0541e..6ee237f509dc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9036,7 +9036,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - register void *__sp asm(_ASM_SP); if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { @@ -9065,7 +9064,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 [sp]"=&r"(tmp), #endif - "+r"(__sp) + ASM_CALL_CONSTRAINT : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b836a7274e12..39567b5c33da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -806,7 +806,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - register void *__sp asm("rsp"); unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -821,7 +820,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, asm volatile ("movq %[stack], %%rsp\n\t" "call handle_stack_overflow\n\t" "1: jmp 1b" - : "+r" (__sp) + : ASM_CALL_CONSTRAINT : "D" ("kernel stack overflow (page fault)"), "S" (regs), "d" (address), [stack] "rm" (stack)); diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt index 6a1af43862df..3995735a878f 100644 --- a/tools/objtool/Documentation/stack-validation.txt +++ b/tools/objtool/Documentation/stack-validation.txt @@ -194,10 +194,10 @@ they mean, and suggestions for how to fix them. If it's a GCC-compiled .c file, the error may be because the function uses an inline asm() statement which has a "call" instruction. An asm() statement with a call instruction must declare the use of the - stack pointer in its output operand. For example, on x86_64: + stack pointer in its output operand. On x86_64, this means adding + the ASM_CALL_CONSTRAINT as an output constraint: - register void *__sp asm("rsp"); - asm volatile("call func" : "+r" (__sp)); + asm volatile("call func" : ASM_CALL_CONSTRAINT); Otherwise the stack frame may not get created before the call. -- cgit From af2e658fc08a397b10352265e50b83f27e25d73e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:35 +0300 Subject: as3645a: Use ams,input-max-microamp as documented in DT bindings DT bindings document the property "ams,input-max-microamp" that limits the chip's maximum input current. The driver and the DTS however used "peak-current-limit" property. Fix this by using the property documented in DT binding documentation. Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 2 +- drivers/leds/leds-as3645a.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index cb47ae79a5f9..b86fc83a5a65 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -273,7 +273,7 @@ flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; - peak-current-limit = <1750000>; + ams,input-max-microamp = <1750000>; }; indicator { led-max-microamp = <10000>; diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index bbbbe0898233..e3f89c6130d2 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -534,7 +534,7 @@ static int as3645a_parse_node(struct as3645a *flash, of_property_read_u32(flash->flash_node, "voltage-reference", &cfg->voltage_reference); - of_property_read_u32(flash->flash_node, "peak-current-limit", + of_property_read_u32(flash->flash_node, "ams,input-max-microamp", &cfg->peak); cfg->peak = AS_PEAK_mA_TO_REG(cfg->peak); -- cgit From 75f9f7279e874ff95d1abe4613abc0826c9a8dcc Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:36 +0300 Subject: dt: bindings: as3645a: Use LED number to refer to LEDs Use integers (reg property) to tell the number of the LED to the driver instead of the node name. While both of these approaches are currently used by the LED bindings, using integers will require less driver changes for ACPI support. Additionally, it will make possible LED naming using chip and LED node names, effectively making the label property most useful for human-readable names only. Signed-off-by: Sakari Ailus Acked-by: Rob Herring Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/ams,as3645a.txt | 28 ++++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.txt b/Documentation/devicetree/bindings/leds/ams,as3645a.txt index 12c5ef26ec73..fdc40e354a64 100644 --- a/Documentation/devicetree/bindings/leds/ams,as3645a.txt +++ b/Documentation/devicetree/bindings/leds/ams,as3645a.txt @@ -15,11 +15,14 @@ Required properties compatible : Must be "ams,as3645a". reg : The I2C address of the device. Typically 0x30. +#address-cells : 1 +#size-cells : 0 -Required properties of the "flash" child node -============================================= +Required properties of the flash child node (0) +=============================================== +reg: 0 flash-timeout-us: Flash timeout in microseconds. The value must be in the range [100000, 850000] and divisible by 50000. flash-max-microamp: Maximum flash current in microamperes. Has to be @@ -33,20 +36,21 @@ ams,input-max-microamp: Maximum flash controller input current. The and divisible by 50000. -Optional properties of the "flash" child node -============================================= +Optional properties of the flash child node +=========================================== label : The label of the flash LED. -Required properties of the "indicator" child node -================================================= +Required properties of the indicator child node (1) +=================================================== +reg: 1 led-max-microamp: Maximum indicator current. The allowed values are 2500, 5000, 7500 and 10000. -Optional properties of the "indicator" child node -================================================= +Optional properties of the indicator child node +=============================================== label : The label of the indicator LED. @@ -55,16 +59,20 @@ Example ======= as3645a@30 { + #address-cells: 1 + #size-cells: 0 reg = <0x30>; compatible = "ams,as3645a"; - flash { + flash@0 { + reg = <0x0>; flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; ams,input-max-microamp = <1750000>; label = "as3645a:flash"; }; - indicator { + indicator@1 { + reg = <0x1>; led-max-microamp = <10000>; label = "as3645a:indicator"; }; -- cgit From e626c325277531db15314b80610d1f5a1c2637b2 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:37 +0300 Subject: as3645a: Use integer numbers for parsing LEDs Use integer numbers for LEDs, 0 is the flash and 1 is the indicator. Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 8 ++++++-- drivers/leds/leds-as3645a.c | 26 ++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index b86fc83a5a65..1b0bd72945f2 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -267,15 +267,19 @@ clock-frequency = <400000>; as3645a@30 { + #address-cells = <1>; + #size-cells = <0>; reg = <0x30>; compatible = "ams,as3645a"; - flash { + flash@0 { + reg = <0x0>; flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; ams,input-max-microamp = <1750000>; }; - indicator { + indicator@1 { + reg = <0x1>; led-max-microamp = <10000>; }; }; diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index e3f89c6130d2..605e0c64e974 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -112,6 +112,10 @@ #define AS_PEAK_mA_TO_REG(a) \ ((min_t(u32, AS_PEAK_mA_MAX, a) - 1250) / 250) +/* LED numbers for Devicetree */ +#define AS_LED_FLASH 0 +#define AS_LED_INDICATOR 1 + enum as_mode { AS_MODE_EXT_TORCH = 0 << AS_CONTROL_MODE_SETTING_SHIFT, AS_MODE_INDICATOR = 1 << AS_CONTROL_MODE_SETTING_SHIFT, @@ -491,10 +495,29 @@ static int as3645a_parse_node(struct as3645a *flash, struct device_node *node) { struct as3645a_config *cfg = &flash->cfg; + struct device_node *child; const char *name; int rval; - flash->flash_node = of_get_child_by_name(node, "flash"); + for_each_child_of_node(node, child) { + u32 id = 0; + + of_property_read_u32(child, "reg", &id); + + switch (id) { + case AS_LED_FLASH: + flash->flash_node = of_node_get(child); + break; + case AS_LED_INDICATOR: + flash->indicator_node = of_node_get(child); + break; + default: + dev_warn(&flash->client->dev, + "unknown LED %u encountered, ignoring\n", id); + break; + } + } + if (!flash->flash_node) { dev_err(&flash->client->dev, "can't find flash node\n"); return -ENODEV; @@ -538,7 +561,6 @@ static int as3645a_parse_node(struct as3645a *flash, &cfg->peak); cfg->peak = AS_PEAK_mA_TO_REG(cfg->peak); - flash->indicator_node = of_get_child_by_name(node, "indicator"); if (!flash->indicator_node) { dev_warn(&flash->client->dev, "can't find indicator node\n"); -- cgit From 12c4b878e71fa8b65bc479b2460765c7d1d81a26 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:38 +0300 Subject: as3645a: Unregister indicator LED on device unbind The indicator LED was registered in probe but was not removed in driver remove callback. Fix this. Signed-off-by: Sakari Ailus Signed-off-by: Jacek Anaszewski --- drivers/leds/leds-as3645a.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index 605e0c64e974..9a257f969300 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -743,6 +743,7 @@ static int as3645a_remove(struct i2c_client *client) as3645a_set_control(flash, AS_MODE_EXT_TORCH, false); v4l2_flash_release(flash->vf); + v4l2_flash_release(flash->vfind); led_classdev_flash_unregister(&flash->fled); led_classdev_unregister(&flash->iled_cdev); -- cgit From 28585a832602747cbfa88ad8934013177a3aae38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Sep 2017 14:10:22 -0700 Subject: rcu: Allow for page faults in NMI handlers A number of architecture invoke rcu_irq_enter() on exception entry in order to allow RCU read-side critical sections in the exception handler when the exception is from an idle or nohz_full CPU. This works, at least unless the exception happens in an NMI handler. In that case, rcu_nmi_enter() would already have exited the extended quiescent state, which would mean that rcu_irq_enter() would (incorrectly) cause RCU to think that it is again in an extended quiescent state. This will in turn result in lockdep splats in response to later RCU read-side critical sections. This commit therefore causes rcu_irq_enter() and rcu_irq_exit() to take no action if there is an rcu_nmi_enter() in effect, thus avoiding the unscheduled return to RCU quiescent state. This in turn should make the kernel safe for on-demand RCU voyeurism. Link: http://lkml.kernel.org/r/20170922211022.GA18084@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..63bee8e1b193 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -888,6 +888,11 @@ void rcu_irq_exit(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdtp->dynticks_nesting < 1); if (rdtp->dynticks_nesting <= 1) { @@ -1020,6 +1025,11 @@ void rcu_irq_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && -- cgit From 9aadde91b3c035413c806619beb3e3ef6e697953 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:22:19 -0400 Subject: extable: Consolidate *kernel_text_address() functions The functionality between kernel_text_address() and _kernel_text_address() is the same except that _kernel_text_address() does a little more (that function needs a rename, but that can be done another time). Instead of having duplicate code in both, simply have _kernel_text_address() calls kernel_text_address() instead. This is marked for stable because there's an RCU bug that can happen if one of these functions gets called while RCU is not watching. That fix depends on this fix to keep from having to write the fix twice. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..a7024a494faf 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr) int __kernel_text_address(unsigned long addr) { - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - if (is_ftrace_trampoline(addr)) - return 1; - if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; - if (is_bpf_text_address(addr)) + if (kernel_text_address(addr)) return 1; /* * There might be init symbols in saved stacktraces. -- cgit From e8cac8b1d10589be45671a5ade0926a639b543b7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:36:32 -0400 Subject: extable: Enable RCU if it is not watching in kernel_text_address() If kernel_text_address() is called when RCU is not watching, it can cause an RCU bug because is_module_text_address(), the is_kprobe_*insn_slot() and is_bpf_text_address() functions require the use of RCU. Only enable RCU if it is not currently watching before it calls is_module_text_address(). The use of rcu_nmi_enter() is used to enable RCU because kernel_text_address() can happen pretty much anywhere (like an NMI), and even from within an NMI. It is called via save_stack_trace() that can be called by any WARN() or tracing function, which can happen while RCU is not watching (for example, going to or coming from idle, or during CPU take down or bring up). Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index a7024a494faf..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -119,17 +119,42 @@ int __kernel_text_address(unsigned long addr) int kernel_text_address(unsigned long addr) { + bool no_rcu; + int ret = 1; + if (core_kernel_text(addr)) return 1; + + /* + * If a stack dump happens while RCU is not watching, then + * RCU needs to be notified that it requires to start + * watching again. This can happen either by tracing that + * triggers a stack trace, or a WARN() that happens during + * coming back from idle, or cpu on or offlining. + * + * is_module_text_address() as well as the kprobe slots + * and is_bpf_text_address() require RCU to be watching. + */ + no_rcu = !rcu_is_watching(); + + /* Treat this like an NMI as it can happen anywhere */ + if (no_rcu) + rcu_nmi_enter(); + if (is_module_text_address(addr)) - return 1; + goto out; if (is_ftrace_trampoline(addr)) - return 1; + goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; + goto out; if (is_bpf_text_address(addr)) - return 1; - return 0; + goto out; + ret = 0; +out: + if (no_rcu) + rcu_nmi_exit(); + + return ret; } /* -- cgit From 15516c89acce948debc4c598e03c3fee53045797 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Sep 2017 13:00:21 -0400 Subject: tracing: Remove RCU work arounds from stack tracer Currently the stack tracer calls rcu_irq_enter() to make sure RCU is watching when it records a stack trace. But if the stack tracer is triggered while tracing inside of a rcu_irq_enter(), calling rcu_irq_enter() unconditionally can be problematic. The reason for having rcu_irq_enter() in the first place has been fixed from within the saving of the stack trace code, and there's no reason for doing it in the stack tracer itself. Just remove it. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Suggested-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. - */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } -- cgit From ab5348c9c23cd253f5902980d2d8fe067dc24c82 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Wed, 26 Jul 2017 22:27:05 -0400 Subject: security: fix description of values returned by cap_inode_need_killpriv cap_inode_need_killpriv returns 1 if security.capability exists and has a value and inode_killpriv() is required, 0 otherwise. Fix the description of the return value to reflect this. Signed-off-by: Stefan Berger Reviewed-by: Serge Hallyn Signed-off-by: James Morris --- security/commoncap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 7abebd782d5e..123426900d25 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -300,10 +300,10 @@ static inline void bprm_clear_caps(struct linux_binprm *bprm) * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should - * inode_killpriv() be invoked or the change rejected? + * inode_killpriv() be invoked or the change rejected. * - * Returns 0 if granted; +ve if granted, but inode_killpriv() is required; and - * -ve to deny the change. + * Returns 1 if security.capability has a value, meaning inode_killpriv() + * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { -- cgit From c2a9c4bf034ab7415ec272c4c86f1c0b796f80e6 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 17 Aug 2017 23:04:21 +0530 Subject: tpm: vtpm: constify vio_device_id vio_device_id are not supposed to change at runtime. All functions working with vio_device_id provided by work with const vio_device_id. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Reviewed-by: Jason Gunthorpe Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm_ibmvtpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index f01d083eced2..d2ce46bd2471 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -32,7 +32,7 @@ static const char tpm_ibmvtpm_driver_name[] = "tpm_ibmvtpm"; -static struct vio_device_id tpm_ibmvtpm_device_table[] = { +static const struct vio_device_id tpm_ibmvtpm_device_table[] = { { "IBM,vtpm", "IBM,vtpm"}, { "", "" } }; -- cgit From e1ec650f9ae7d1e66f77bf14c86e4e9dc629a4d5 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 6 Jul 2017 23:18:39 +0530 Subject: tpm: tpm_crb: constify acpi_device_id. acpi_device_id are not supposed to change at runtime. All functions working with acpi_device_id provided by work with const acpi_device_id. So mark the non-const structs as const. File size before: text data bss dec hex filename 4198 608 0 4806 12c6 drivers/char/tpm/tpm_crb.o File size After adding 'const': text data bss dec hex filename 4262 520 0 4782 12ae drivers/char/tpm/tpm_crb.o Signed-off-by: Arvind Yadav Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm_crb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index a4ac63a21d8a..8f0a98dea327 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -665,7 +665,7 @@ static const struct dev_pm_ops crb_pm = { SET_RUNTIME_PM_OPS(crb_pm_runtime_suspend, crb_pm_runtime_resume, NULL) }; -static struct acpi_device_id crb_device_ids[] = { +static const struct acpi_device_id crb_device_ids[] = { {"MSFT0101", 0}, {"", 0}, }; -- cgit From 5d0e4d78149b9745ea34848e950e734f7db3b95f Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Tue, 27 Jun 2017 12:27:23 +0200 Subject: Documentation: tpm: add powered-while-suspended binding documentation Add a new powered-while-suspended property to control the behavior of the TPM suspend/resume. Signed-off-by: Enric Balletbo i Serra Signed-off-by: Sonny Rao Reviewed-by: Jason Gunthorpe Reviewed-by: Jarkko Sakkinen Acked-by: Rob Herring Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt index 8cb638b7e89c..85c8216fc335 100644 --- a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt +++ b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt @@ -8,6 +8,12 @@ Required properties: the firmware event log - linux,sml-size : size of the memory allocated for the firmware event log +Optional properties: + +- powered-while-suspended: present when the TPM is left powered on between + suspend and resume (makes the suspend/resume + callbacks do nothing). + Example (for OpenPower Systems with Nuvoton TPM 2.0 on I2C) ---------------------------------------------------------- -- cgit From 9f3fc7bcddcb51234e23494531f93ab60475e1c3 Mon Sep 17 00:00:00 2001 From: Hamza Attak Date: Mon, 14 Aug 2017 19:09:16 +0100 Subject: tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers The patch simply replaces all msleep function calls with usleep_range calls in the generic drivers. Tested with an Infineon TPM 1.2, using the generic tpm-tis module, for a thousand PCR extends, we see results going from 1m57s unpatched to 40s with the new patch. We obtain similar results when using the original and patched tpm_infineon driver, which is also part of the patch. Similarly with a STM TPM 2.0, using the CRB driver, it takes about 20ms per extend unpatched and around 7ms with the new patch. Note that the PCR consistency is untouched with this patch, each TPM has been tested with 10 million extends and the aggregated PCR value is continuously verified to be correct. As an extension of this work, this could potentially and easily be applied to other vendor's drivers. Still, these changes are not included in the proposed patch as they are untested. Signed-off-by: Hamza Attak Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm-interface.c | 10 +++++----- drivers/char/tpm/tpm.h | 9 ++++++++- drivers/char/tpm/tpm2-cmd.c | 2 +- drivers/char/tpm/tpm_infineon.c | 6 +++--- drivers/char/tpm/tpm_tis_core.c | 8 ++++---- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index fe597e6c55c4..1d6729be4cd6 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -455,7 +455,7 @@ ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, goto out; } - msleep(TPM_TIMEOUT); /* CHECK */ + tpm_msleep(TPM_TIMEOUT); rmb(); } while (time_before(jiffies, stop)); @@ -970,7 +970,7 @@ int tpm_do_selftest(struct tpm_chip *chip) dev_info( &chip->dev, HW_ERR "TPM command timed out during continue self test"); - msleep(delay_msec); + tpm_msleep(delay_msec); continue; } @@ -985,7 +985,7 @@ int tpm_do_selftest(struct tpm_chip *chip) } if (rc != TPM_WARN_DOING_SELFTEST) return rc; - msleep(delay_msec); + tpm_msleep(delay_msec); } while (--loops > 0); return rc; @@ -1085,7 +1085,7 @@ again: } } else { do { - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); status = chip->ops->status(chip); if ((status & mask) == mask) return 0; @@ -1150,7 +1150,7 @@ int tpm_pm_suspend(struct device *dev) */ if (rc != TPM_WARN_RETRY) break; - msleep(TPM_TIMEOUT_RETRY); + tpm_msleep(TPM_TIMEOUT_RETRY); } if (rc) diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 04fbff2edbf3..2d5466a72e40 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -50,7 +50,8 @@ enum tpm_const { enum tpm_timeout { TPM_TIMEOUT = 5, /* msecs */ - TPM_TIMEOUT_RETRY = 100 /* msecs */ + TPM_TIMEOUT_RETRY = 100, /* msecs */ + TPM_TIMEOUT_RANGE_US = 300 /* usecs */ }; /* TPM addresses */ @@ -527,6 +528,12 @@ int tpm_pm_resume(struct device *dev); int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout, wait_queue_head_t *queue, bool check_cancel); +static inline void tpm_msleep(unsigned int delay_msec) +{ + usleep_range(delay_msec * 1000, + (delay_msec * 1000) + TPM_TIMEOUT_RANGE_US); +}; + struct tpm_chip *tpm_chip_find_get(int chip_num); __must_check int tpm_try_get_ops(struct tpm_chip *chip); void tpm_put_ops(struct tpm_chip *chip); diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index f7f34b2aa981..e1a41b788f08 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -899,7 +899,7 @@ static int tpm2_do_selftest(struct tpm_chip *chip) if (rc != TPM2_RC_TESTING) break; - msleep(delay_msec); + tpm_msleep(delay_msec); } return rc; diff --git a/drivers/char/tpm/tpm_infineon.c b/drivers/char/tpm/tpm_infineon.c index 3b1b9f9322d5..d8f10047fbba 100644 --- a/drivers/char/tpm/tpm_infineon.c +++ b/drivers/char/tpm/tpm_infineon.c @@ -191,7 +191,7 @@ static int wait(struct tpm_chip *chip, int wait_for_bit) /* check the status-register if wait_for_bit is set */ if (status & 1 << wait_for_bit) break; - msleep(TPM_MSLEEP_TIME); + tpm_msleep(TPM_MSLEEP_TIME); } if (i == TPM_MAX_TRIES) { /* timeout occurs */ if (wait_for_bit == STAT_XFE) @@ -226,7 +226,7 @@ static void tpm_wtx(struct tpm_chip *chip) wait_and_send(chip, TPM_CTRL_WTX); wait_and_send(chip, 0x00); wait_and_send(chip, 0x00); - msleep(TPM_WTX_MSLEEP_TIME); + tpm_msleep(TPM_WTX_MSLEEP_TIME); } static void tpm_wtx_abort(struct tpm_chip *chip) @@ -237,7 +237,7 @@ static void tpm_wtx_abort(struct tpm_chip *chip) wait_and_send(chip, 0x00); wait_and_send(chip, 0x00); number_of_wtx = 0; - msleep(TPM_WTX_MSLEEP_TIME); + tpm_msleep(TPM_WTX_MSLEEP_TIME); } static int tpm_inf_recv(struct tpm_chip *chip, u8 * buf, size_t count) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index b617b2eeb080..63bc6c3b949e 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -51,7 +51,7 @@ static int wait_startup(struct tpm_chip *chip, int l) if (access & TPM_ACCESS_VALID) return 0; - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); } while (time_before(jiffies, stop)); return -1; } @@ -117,7 +117,7 @@ again: do { if (check_locality(chip, l)) return l; - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); } while (time_before(jiffies, stop)); } return -1; @@ -164,7 +164,7 @@ static int get_burstcount(struct tpm_chip *chip) burstcnt = (value >> 8) & 0xFFFF; if (burstcnt) return burstcnt; - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); } while (time_before(jiffies, stop)); return -EBUSY; } @@ -396,7 +396,7 @@ static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len) priv->irq = irq; chip->flags |= TPM_CHIP_FLAG_IRQ; if (!priv->irq_tested) - msleep(1); + tpm_msleep(1); if (!priv->irq_tested) disable_interrupts(chip); priv->irq_tested = true; -- cgit From fb154e0e0a95249459df054241a9e8f4cca56062 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 24 Feb 2017 20:35:16 +0100 Subject: tpm: ibmvtpm: simplify crq initialization and document crq format The crq is passed in registers and is the same on BE and LE hosts. However, current implementation allocates a structure on-stack to represent the crq, initializes the members swapping them to BE, and loads the structure swapping it from BE. This is pointless and causes GCC warnings about ununitialized members. Get rid of the structure and the warnings. Signed-off-by: Michal Suchanek Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm_ibmvtpm.c | 96 ++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index d2ce46bd2471..25f6e2665385 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -39,19 +39,63 @@ static const struct vio_device_id tpm_ibmvtpm_device_table[] = { MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table); /** + * + * ibmvtpm_send_crq_word - Send a CRQ request + * @vdev: vio device struct + * @w1: pre-constructed first word of tpm crq (second word is reserved) + * + * Return: + * 0 - Success + * Non-zero - Failure + */ +static int ibmvtpm_send_crq_word(struct vio_dev *vdev, u64 w1) +{ + return plpar_hcall_norets(H_SEND_CRQ, vdev->unit_address, w1, 0); +} + +/** + * * ibmvtpm_send_crq - Send a CRQ request * * @vdev: vio device struct - * @w1: first word - * @w2: second word + * @valid: Valid field + * @msg: Type field + * @len: Length field + * @data: Data field + * + * The ibmvtpm crq is defined as follows: + * + * Byte | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 + * ----------------------------------------------------------------------- + * Word0 | Valid | Type | Length | Data + * ----------------------------------------------------------------------- + * Word1 | Reserved + * ----------------------------------------------------------------------- + * + * Which matches the following structure (on bigendian host): + * + * struct ibmvtpm_crq { + * u8 valid; + * u8 msg; + * __be16 len; + * __be32 data; + * __be64 reserved; + * } __attribute__((packed, aligned(8))); + * + * However, the value is passed in a register so just compute the numeric value + * to load into the register avoiding byteswap altogether. Endian only affects + * memory loads and stores - registers are internally represented the same. * * Return: - * 0 -Sucess + * 0 (H_SUCCESS) - Success * Non-zero - Failure */ -static int ibmvtpm_send_crq(struct vio_dev *vdev, u64 w1, u64 w2) +static int ibmvtpm_send_crq(struct vio_dev *vdev, + u8 valid, u8 msg, u16 len, u32 data) { - return plpar_hcall_norets(H_SEND_CRQ, vdev->unit_address, w1, w2); + u64 w1 = ((u64)valid << 56) | ((u64)msg << 48) | ((u64)len << 32) | + (u64)data; + return ibmvtpm_send_crq_word(vdev, w1); } /** @@ -109,8 +153,6 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) { struct ibmvtpm_dev *ibmvtpm = dev_get_drvdata(&chip->dev); - struct ibmvtpm_crq crq; - __be64 *word = (__be64 *)&crq; int rc, sig; if (!ibmvtpm->rtce_buf) { @@ -137,10 +179,6 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) spin_lock(&ibmvtpm->rtce_lock); ibmvtpm->res_len = 0; memcpy((void *)ibmvtpm->rtce_buf, (void *)buf, count); - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_TPM_COMMAND; - crq.len = cpu_to_be16(count); - crq.data = cpu_to_be32(ibmvtpm->rtce_dma_handle); /* * set the processing flag before the Hcall, since we may get the @@ -148,8 +186,9 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) */ ibmvtpm->tpm_processing_cmd = true; - rc = ibmvtpm_send_crq(ibmvtpm->vdev, be64_to_cpu(word[0]), - be64_to_cpu(word[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_TPM_COMMAND, + count, ibmvtpm->rtce_dma_handle); if (rc != H_SUCCESS) { dev_err(ibmvtpm->dev, "tpm_ibmvtpm_send failed rc=%d\n", rc); rc = 0; @@ -182,15 +221,10 @@ static u8 tpm_ibmvtpm_status(struct tpm_chip *chip) */ static int ibmvtpm_crq_get_rtce_size(struct ibmvtpm_dev *ibmvtpm) { - struct ibmvtpm_crq crq; - u64 *buf = (u64 *) &crq; int rc; - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_GET_RTCE_BUFFER_SIZE; - - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(buf[0]), - cpu_to_be64(buf[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_GET_RTCE_BUFFER_SIZE, 0, 0); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_get_rtce_size failed rc=%d\n", rc); @@ -210,15 +244,10 @@ static int ibmvtpm_crq_get_rtce_size(struct ibmvtpm_dev *ibmvtpm) */ static int ibmvtpm_crq_get_version(struct ibmvtpm_dev *ibmvtpm) { - struct ibmvtpm_crq crq; - u64 *buf = (u64 *) &crq; int rc; - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_GET_VERSION; - - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(buf[0]), - cpu_to_be64(buf[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_GET_VERSION, 0, 0); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_get_version failed rc=%d\n", rc); @@ -238,7 +267,7 @@ static int ibmvtpm_crq_send_init_complete(struct ibmvtpm_dev *ibmvtpm) { int rc; - rc = ibmvtpm_send_crq(ibmvtpm->vdev, INIT_CRQ_COMP_CMD, 0); + rc = ibmvtpm_send_crq_word(ibmvtpm->vdev, INIT_CRQ_COMP_CMD); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_send_init_complete failed rc=%d\n", rc); @@ -258,7 +287,7 @@ static int ibmvtpm_crq_send_init(struct ibmvtpm_dev *ibmvtpm) { int rc; - rc = ibmvtpm_send_crq(ibmvtpm->vdev, INIT_CRQ_CMD, 0); + rc = ibmvtpm_send_crq_word(ibmvtpm->vdev, INIT_CRQ_CMD); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_send_init failed rc=%d\n", rc); @@ -340,15 +369,10 @@ static int tpm_ibmvtpm_suspend(struct device *dev) { struct tpm_chip *chip = dev_get_drvdata(dev); struct ibmvtpm_dev *ibmvtpm = dev_get_drvdata(&chip->dev); - struct ibmvtpm_crq crq; - u64 *buf = (u64 *) &crq; int rc = 0; - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_PREPARE_TO_SUSPEND; - - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(buf[0]), - cpu_to_be64(buf[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_PREPARE_TO_SUSPEND, 0, 0); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "tpm_ibmvtpm_suspend failed rc=%d\n", rc); -- cgit From 656f083116a4799d8c0194976b8a2d66bf306538 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:44 +0200 Subject: x86/fpu: Rename copyin_to_xsaves()/copyout_from_xsaves() to copy_user_to_xstate()/copy_xstate_to_user() The 'copyin/copyout' nomenclature needlessly departs from what the modern FPU code uses, which is: copy_fpregs_to_fpstate() copy_fpstate_to_sigframe() copy_fregs_to_user() copy_fxregs_to_kernel() copy_fxregs_to_user() copy_kernel_to_fpregs() copy_kernel_to_fregs() copy_kernel_to_fxregs() copy_kernel_to_xregs() copy_user_to_fregs() copy_user_to_fxregs() copy_user_to_xregs() copy_xregs_to_kernel() copy_xregs_to_user() I.e. according to this pattern, the following rename should be done: copyin_to_xsaves() -> copy_user_to_xstate() copyout_from_xsaves() -> copy_xstate_to_user() or, if we want to be pedantic, denote that that the user-space format is ptrace: copyin_to_xsaves() -> copy_user_ptrace_to_xstate() copyout_from_xsaves() -> copy_xstate_to_user_ptrace() But I'd suggest the shorter, non-pedantic name. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-2-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 1b2799e0699a..a1baa17e9748 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b188b16841e3..165d0545c924 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -92,7 +92,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); if (using_compacted_format()) { - ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* @@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copyin_to_xsaves(kbuf, ubuf, xsave); + ret = copy_user_to_xstate(kbuf, ubuf, xsave); else ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 83c23c230b4c..b1fe9a1fc4e0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (using_compacted_format()) { - err = copyin_to_xsaves(NULL, buf_fx, + err = copy_user_to_xstate(NULL, buf_fx, &fpu->state.xsave); } else { err = __copy_from_user(&fpu->state.xsave, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c24ac1efb12d..e7bb41723eaa 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -951,7 +951,7 @@ static inline int xstate_copyout(unsigned int pos, unsigned int count, * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; @@ -1023,7 +1023,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; -- cgit From f0d4f30a7fd299587840a028655285a87f334904 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:45 +0200 Subject: x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() copy_xstate_to_user() is a weird API - in part due to a bad API inherited from the regset APIs. But don't propagate that bad API choice into the FPU code - so as a first step split the API into kernel and user buffer handling routines. (Also split the xstate_copyout() internal helper.) The split API is a dumb duplication that should be obviously correct, the real splitting will be done in the next patch. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +- arch/x86/kernel/fpu/regset.c | 5 +- arch/x86/kernel/fpu/xstate.c | 110 +++++++++++++++++++++++++++++++++++--- 3 files changed, 109 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index a1baa17e9748..92dc8ca14124 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 165d0545c924..b6d12d66d04b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -92,7 +92,10 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); if (using_compacted_format()) { - ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); + if (kbuf) + ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave); + else + ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e7bb41723eaa..38561539cb99 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -924,10 +924,106 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int xstate_copyout(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +static inline int +__copy_xstate_to_kernel(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) +{ + if ((count == 0) || (pos < start_pos)) + return 0; + + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } + } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a kernel-space ptrace buffer. + * + * It supports partial copy but pos always starts from zero. This is called + * from xstateregs_get() and there we check the CPU has XSAVES. + */ +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(pos != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} + +static inline int +__copy_xstate_to_user(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -977,7 +1073,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count); if (ret) return ret; @@ -992,7 +1088,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count); if (ret) return ret; @@ -1009,7 +1105,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; -- cgit From 4d981cf2d96f29cdfa7d4972c8b377fe7baa9c4c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:46 +0200 Subject: x86/fpu: Remove 'ubuf' parameter from the copy_xstate_to_kernel() APIs The 'ubuf' parameter is unused in the _kernel() side of the API, remove it. This simplifies the code and makes it easier to think about. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 21 ++++++--------------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 92dc8ca14124..c762574a245f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,7 +48,7 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b6d12d66d04b..34e74adf9d5d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,7 +93,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (using_compacted_format()) { if (kbuf) - ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); else ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 38561539cb99..71d3bda2b898 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -926,7 +926,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ static inline int __copy_xstate_to_kernel(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, + void *kbuf, const void *data, const int start_pos, const int end_pos) { @@ -936,12 +936,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, if (end_pos < 0 || pos < end_pos) { unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; - } + memcpy(kbuf + pos, data, copy); } return 0; } @@ -953,8 +948,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave) { unsigned int offset, size; int ret, i; @@ -979,8 +973,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count); if (ret) return ret; @@ -994,8 +987,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count); if (ret) return ret; @@ -1011,8 +1003,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; -- cgit From a69c158fb3e7a91220f55029bf222a4e678d16e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:47 +0200 Subject: x86/fpu: Remove 'kbuf' parameter from the copy_xstate_to_user() APIs The 'kbuf' parameter is unused in the _user() side of the API, remove it. This simplifies the code and makes it easier to think about. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-5-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 25 +++++++------------------ 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c762574a245f..65bd68c30cd0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -49,7 +49,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 34e74adf9d5d..fd6dbdd8fde6 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -95,7 +95,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (kbuf) ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); else - ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_user(pos, count, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 71d3bda2b898..2d8f3344875d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1011,10 +1011,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru } static inline int -__copy_xstate_to_user(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -1022,12 +1019,8 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, if (end_pos < 0 || pos < end_pos) { unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; - } + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; } return 0; } @@ -1038,8 +1031,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; int ret, i; @@ -1064,8 +1056,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count); if (ret) return ret; @@ -1079,8 +1070,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count); if (ret) return ret; @@ -1096,8 +1086,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; -- cgit From d7eda6c99cc75f1c41d67abf988f37a10045a370 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:48 +0200 Subject: x86/fpu: Clean up parameter order in the copy_xstate_to_*() APIs Parameter ordering is weird: int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); 'pos' and 'count', which are attributes of the destination buffer, are listed before the destination buffer itself ... List them after the primary arguments instead. This makes the code more similar to regular memcpy() variant APIs. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-6-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 65bd68c30cd0..e4430b84939d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index fd6dbdd8fde6..ec1404194b65 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,9 +93,9 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (using_compacted_format()) { if (kbuf) - ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); + ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); else - ret = copy_xstate_to_user(pos, count, ubuf, xsave); + ret = copy_xstate_to_user(ubuf, xsave, pos, count); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2d8f3344875d..0a299468510f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -925,10 +925,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * the source data pointer or increment pos, count, kbuf, and ubuf. */ static inline int -__copy_xstate_to_kernel(unsigned int pos, unsigned int count, - void *kbuf, - const void *data, const int start_pos, - const int end_pos) +__copy_xstate_to_kernel(void *kbuf, + const void *data, + unsigned int pos, unsigned int count, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -948,7 +947,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave) +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) { unsigned int offset, size; int ret, i; @@ -973,7 +972,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count); if (ret) return ret; @@ -987,7 +986,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count); if (ret) return ret; @@ -1003,7 +1002,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count); if (ret) return ret; @@ -1011,7 +1010,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru } static inline int -__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -1031,7 +1030,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, c * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) { unsigned int offset, size; int ret, i; @@ -1056,7 +1055,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count); if (ret) return ret; @@ -1070,7 +1069,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count); if (ret) return ret; @@ -1086,7 +1085,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count); if (ret) return ret; -- cgit From becb2bb72ff906cc0d2bac3ee9574f694364823b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:49 +0200 Subject: x86/fpu: Clean up the parameter definitions of copy_xstate_to_*() Remove pointless 'const' of non-pointer input parameter. Remove unnecessary parenthesis that shows uncertainty about arithmetic operator precedence. Clarify copy_xstate_to_user() description. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-7-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0a299468510f..9647e7256179 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,13 +927,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, const int start_pos, const int end_pos) + unsigned int pos, unsigned int count, int start_pos, int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); memcpy(kbuf + pos, data, copy); } @@ -1010,13 +1010,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); if (__copy_to_user(ubuf + pos, data, copy)) return -EFAULT; @@ -1026,7 +1026,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns /* * Convert from kernel XSAVES compacted format to standard format and copy - * to a ptrace buffer. It supports partial copy but pos always starts from + * to a user-space buffer. It supports partial copy but pos always starts from * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -- cgit From 8a5b731889cbf004b406d988dc591c8a7aac773e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:50 +0200 Subject: x86/fpu: Remove the 'start_pos' parameter from the __copy_xstate_to_*() functions 'start_pos' is always 0, so remove it and remove the pointless check of 'pos < 0' which can not ever be true as 'pos' is unsigned ... No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-8-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9647e7256179..1f50fdaf4c5a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,9 +927,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, int start_pos, int end_pos) + unsigned int pos, unsigned int count, int end_pos) { - if ((count == 0) || (pos < start_pos)) + if (!count) return 0; if (end_pos < 0 || pos < end_pos) { @@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count); if (ret) return ret; @@ -986,7 +986,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count); if (ret) return ret; @@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count); if (ret) return ret; @@ -1010,9 +1010,9 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos) { - if ((count == 0) || (pos < start_pos)) + if (!count) return 0; if (end_pos < 0 || pos < end_pos) { @@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, count); if (ret) return ret; @@ -1069,7 +1069,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, count); if (ret) return ret; @@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count); if (ret) return ret; -- cgit From 56583c9a1400fe1935edd55b24b4fbbc779b59cb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:51 +0200 Subject: x86/fpu: Clarify parameter names in the copy_xstate_to_*() methods Right now there's a confusing mixture of 'offset' and 'size' parameters: - __copy_xstate_to_*() input parameter 'end_pos' not not really an offset, but the full size of the copy to be performed. - input parameter 'count' to copy_xstate_to_*() shadows that of __copy_xstate_to_*()'s 'count' parameter name - but the roles are different: the first one is the total number of bytes to be copied, while the second one is a partial copy size. To unconfuse all this, use a consistent set of parameter names: - 'size' is the partial copy size within a single xstate component - 'size_total' is the total copy requested - 'offset_start' is the requested starting offset. - 'offset' is the offset within an xstate component. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-9-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/xstate.c | 44 +++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index e4430b84939d..fed6617a1079 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1f50fdaf4c5a..c13083579655 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,15 +927,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, int end_pos) + unsigned int offset, unsigned int size, int size_total) { - if (!count) + if (!size) return 0; - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); + if (size_total < 0 || offset < size_total) { + unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); - memcpy(kbuf + pos, data, copy); + memcpy(kbuf + offset, data, copy); } return 0; } @@ -947,7 +947,7 @@ __copy_xstate_to_kernel(void *kbuf, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. */ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -956,7 +956,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); if (ret) return ret; @@ -986,11 +986,11 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); if (ret) return ret; - if (offset + size >= count) + if (offset + size >= size_total) break; } @@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; @@ -1010,15 +1010,15 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total) { - if (!count) + if (!size) return 0; - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); + if (size_total < 0 || offset < size_total) { + unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); - if (__copy_to_user(ubuf + pos, data, copy)) + if (__copy_to_user(ubuf + offset, data, copy)) return -EFAULT; } return 0; @@ -1030,7 +1030,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -1039,7 +1039,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); if (ret) return ret; @@ -1069,11 +1069,11 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(ubuf, src, offset, size, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - if (offset + size >= count) + if (offset + size >= size_total) break; } @@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; -- cgit From 6ff15f8db7eaf29ef5ead6afbec9b25485fe8703 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:52 +0200 Subject: x86/fpu: Change 'size_total' parameter to unsigned and standardize the size checks in copy_xstate_to_*() 'size_total' is derived from an unsigned input parameter - and then converted to 'int' and checked for negative ranges: if (size_total < 0 || offset < size_total) { This conversion and the checks are unnecessary obfuscation, reject overly large requested copy sizes outright and simplify the underlying code. Reported-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-10-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c13083579655..b18c5457065a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -925,15 +925,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * the source data pointer or increment pos, count, kbuf, and ubuf. */ static inline int -__copy_xstate_to_kernel(void *kbuf, - const void *data, - unsigned int offset, unsigned int size, int size_total) +__copy_xstate_to_kernel(void *kbuf, const void *data, + unsigned int offset, unsigned int size, unsigned int size_total) { - if (!size) - return 0; - - if (size_total < 0 || offset < size_total) { - unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); memcpy(kbuf + offset, data, copy); } @@ -986,12 +982,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = xstate_offsets[i]; size = xstate_sizes[i]; + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= size_total) - break; } } @@ -1010,13 +1007,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) { if (!size) return 0; - if (size_total < 0 || offset < size_total) { - unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); if (__copy_to_user(ubuf + offset, data, copy)) return -EFAULT; @@ -1069,12 +1066,13 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= size_total) - break; } } -- cgit From 8c0817f4a3188ac5485ce14f96f12a175800b881 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:53 +0200 Subject: x86/fpu: Simplify __copy_xstate_to_kernel() return values __copy_xstate_to_kernel() can only return 0 (because kernel copies cannot fail), simplify the code throughout. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-11-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b18c5457065a..00c3b41c3cf1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -924,7 +924,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int +static inline void __copy_xstate_to_kernel(void *kbuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) { @@ -933,7 +933,6 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, memcpy(kbuf + offset, data, copy); } - return 0; } /* @@ -946,8 +945,8 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; - int ret, i; struct xstate_header header; + int i; /* * Currently copy_regset_to_user() starts from pos 0: @@ -968,9 +967,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); for (i = 0; i < XFEATURE_MAX; i++) { /* @@ -986,9 +983,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of if (offset + size > size_total) break; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); } } @@ -999,9 +994,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); return 0; } -- cgit From 79fecc2b7506f29fb91becc65e8788e5ae7eba9f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:54 +0200 Subject: x86/fpu: Split copy_user_to_xstate() into copy_kernel_to_xstate() & copy_user_to_xstate() Similar to: x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-12-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +-- arch/x86/kernel/fpu/regset.c | 10 ++++-- arch/x86/kernel/fpu/xstate.c | 66 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fed6617a1079..79af79dbcab6 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave); +int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ec1404194b65..cb45dd81d617 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -134,10 +134,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copy_user_to_xstate(kbuf, ubuf, xsave); - else + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (kbuf) + ret = copy_kernel_to_xstate(kbuf, ubuf, xsave); + else + ret = copy_user_to_xstate(kbuf, ubuf, xsave); + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } /* * In case of failure, mark all states as init: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 00c3b41c3cf1..1ad25d1b8056 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1084,7 +1084,71 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } /* - * Convert from a ptrace standard-format buffer to kernel XSAVES format + * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. + */ +int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) +{ + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } + + /* + * Reject if the user sets any disabled or supervisor features: + */ + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= xfeatures; + + return 0; +} + +/* + * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format * and copy to the target thread. This is called from xstateregs_set() and * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. -- cgit From 59dffa4edba1f15b2bfdbe608aca1efe664c674c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:55 +0200 Subject: x86/fpu: Remove 'ubuf' parameter from the copy_kernel_to_xstate() API No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-13-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 17 +++-------------- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 79af79dbcab6..f10889bc0c88 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index cb45dd81d617..785302c75f38 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -136,7 +136,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) - ret = copy_kernel_to_xstate(kbuf, ubuf, xsave); + ret = copy_kernel_to_xstate(kbuf, xsave); else ret = copy_user_to_xstate(kbuf, ubuf, xsave); } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1ad25d1b8056..71cc8d367fdd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1089,8 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) { unsigned int offset, size; int i; @@ -1100,12 +1099,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(xfeatures); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; - } + memcpy(&xfeatures, kbuf + offset, size); /* * Reject if the user sets any disabled or supervisor features: @@ -1124,12 +1118,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + memcpy(dst, kbuf + offset, size); } } -- cgit From 7b9094c688f807c110a2dab6f6edc5876bfa7b0b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:56 +0200 Subject: x86/fpu: Remove 'kbuf' parameter from the copy_user_to_xstate() API No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-14-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 11 ++++------- arch/x86/kernel/fpu/xstate.c | 19 +++++-------------- 4 files changed, 11 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index f10889bc0c88..4ceb90740d80 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -51,5 +51,5 @@ int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 785302c75f38..caf723f31737 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -138,7 +138,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (kbuf) ret = copy_kernel_to_xstate(kbuf, xsave); else - ret = copy_user_to_xstate(kbuf, ubuf, xsave); + ret = copy_user_to_xstate(ubuf, xsave); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b1fe9a1fc4e0..2c685b492fd6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -323,13 +323,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (using_compacted_format()) { - err = copy_user_to_xstate(NULL, buf_fx, - &fpu->state.xsave); - } else { - err = __copy_from_user(&fpu->state.xsave, - buf_fx, state_size); - } + if (using_compacted_format()) + err = copy_user_to_xstate(buf_fx, &fpu->state.xsave); + else + err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 71cc8d367fdd..b1f3e4dae2e3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1142,8 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; int i; @@ -1153,12 +1152,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(xfeatures); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; /* * Reject if the user sets any disabled or supervisor features: @@ -1177,12 +1172,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } } -- cgit From 6d7f7da5533a3f841eeb1d9657257c9367924274 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:57 +0200 Subject: x86/fpu: Flip the parameter order in copy_*_to_xstate() Make it more consistent with regular memcpy() semantics, where the destination argument comes first. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-15-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 4ceb90740d80..579ac2358e63 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); -int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave); +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index caf723f31737..19a7385a912c 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -136,9 +136,9 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) - ret = copy_kernel_to_xstate(kbuf, xsave); + ret = copy_kernel_to_xstate(xsave, kbuf); else - ret = copy_user_to_xstate(ubuf, xsave); + ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2c685b492fd6..2d682dac35d4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (using_compacted_format()) - err = copy_user_to_xstate(buf_fx, &fpu->state.xsave); + err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); else err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b1f3e4dae2e3..0ef35040d0ad 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1089,7 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; @@ -1142,7 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave) +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; -- cgit From b3a163081c28d1a4d1ad76259a9d93b34a82f1da Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:58 +0200 Subject: x86/fpu: Simplify fpu->fpregs_active use The fpregs_active() inline function is pretty pointless - in almost all the callsites it can be replaced with a direct fpu->fpregs_active access. Do so and eliminate the extra layer of obfuscation. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-16-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 17 +---------------- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 9 +++++---- arch/x86/mm/pkeys.c | 2 +- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 554cdb205d17..b223c57dd5dc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -542,21 +542,6 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int fpregs_active(void) -{ - return current->thread.fpu.fpregs_active; -} - /* * FPU state switching for scheduling. * @@ -617,7 +602,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!fpregs_active()) + if (!fpu->fpregs_active) fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e1114f070c2d..bad57248e5a0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -367,7 +367,7 @@ void fpu__current_fpstate_write_end(void) * registers may still be out of date. Update them with * an XRSTOR if they are active. */ - if (fpregs_active()) + if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2d682dac35d4..684025654d0c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,7 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct fpu *fpu = ¤t->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -170,13 +171,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active() || using_compacted_format()) { + if (fpu->fpregs_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) - copy_fxregs_to_kernel(&tsk->thread.fpu); + copy_fxregs_to_kernel(fpu); } else { /* * It is a *bug* if kernel uses compacted-format for xsave @@ -189,7 +190,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return -1; } - fpstate_sanitize_xstate(&tsk->thread.fpu); + fpstate_sanitize_xstate(fpu); if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2dab69a706ec..e2c23472233e 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -45,7 +45,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - fpregs_active() && + current->thread.fpu.fpregs_active && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; -- cgit From a10b6a16cdad88170f546d008c77453cddf918e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:59 +0200 Subject: x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic Do this temporarily only, to make it easier to change the FPU state machine, in particular this change couples the fpu->fpregs_active and fpu->fpstate_active states: they are only set/cleared together (as far as the scheduler sees them). This will be removed by later patches. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-17-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bad57248e5a0..b7dc3833d41a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -462,9 +462,11 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { + preempt_disable(); fpu__activate_curr(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); + preempt_enable(); } } -- cgit From b6aa85558d7e7b18fc3470d2bc1731d2205dd275 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:00 +0200 Subject: x86/fpu: Split the state handling in fpu__drop() Prepare fpu__drop() to use fpu->fpregs_active. There are two distinct usecases for fpu__drop() in this context: exit_thread() when called for 'current' in exit(), and when called for another task in fork(). This patch does not change behavior, it only adds a couple of debug checks and structures the code to make the ->fpregs_active change more obviously correct. All the complications will be removed later on. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-18-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b7dc3833d41a..815dfba7781a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -414,12 +414,19 @@ void fpu__drop(struct fpu *fpu) { preempt_disable(); - if (fpu->fpregs_active) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); + if (fpu == ¤t->thread.fpu) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + + if (fpu->fpregs_active) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + if (fpu->fpregs_active) + fpregs_deactivate(fpu); + } + } else { + WARN_ON_FPU(fpu->fpregs_active); } fpu->fpstate_active = 0; -- cgit From f1c8cd0176078c7bcafdc89cac447cab672a0b5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:01 +0200 Subject: x86/fpu: Change fpu->fpregs_active users to fpu->fpstate_active We want to simplify the FPU state machine by eliminating fpu->fpregs_active, and we can do that because the two state flags (::fpregs_active and ::fpstate_active) are set essentially together. The old lazy FPU switching code used to make a distinction - but there's no lazy switching code anymore, we always switch in an 'eager' fashion. Do this by first changing all substantial uses of fpu->fpregs_active to fpu->fpstate_active and adding a few debug checks to double check our assumption is correct. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-19-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 +++- arch/x86/kernel/fpu/core.c | 16 ++++++++++------ arch/x86/kernel/fpu/signal.c | 4 +++- arch/x86/mm/pkeys.c | 3 +-- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b223c57dd5dc..7fa676f93ac1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -556,7 +556,9 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpregs_active) { + WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active); + + if (old_fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 815dfba7781a..eab244622402 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { /* * Ignore return value -- we don't care if reg state * is clobbered. @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpregs_active) + if (fpu->fpstate_active) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -147,8 +147,10 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + trace_x86_fpu_before_save(fpu); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -262,11 +264,12 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_fpstate_read(struct fpu *fpu) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * If fpregs are active (in the current CPU), then * copy them to the fpstate: */ - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { fpu__save(fpu); } else { if (!fpu->fpstate_active) { @@ -362,12 +365,13 @@ void fpu__current_fpstate_write_end(void) { struct fpu *fpu = ¤t->thread.fpu; + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * 'fpu' now has an updated copy of the state, but the * registers may still be out of date. Update them with * an XRSTOR if they are active. */ - if (fpu->fpregs_active) + if (fpu->fpstate_active) copy_kernel_to_fpregs(&fpu->state); /* @@ -417,7 +421,7 @@ void fpu__drop(struct fpu *fpu) if (fpu == ¤t->thread.fpu) { WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 684025654d0c..a88083ba7f8b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,9 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->fpregs_active || using_compacted_format()) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + + if (fpu->fpstate_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e2c23472233e..4d24269c071f 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,7 +18,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* fpregs_active() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.fpregs_active && + current->thread.fpu.fpstate_active && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; -- cgit From 6cf4edbe0526db311a28734609da888fdfcb3604 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:02 +0200 Subject: x86/fpu: Decouple fpregs_activate()/fpregs_deactivate() from fpu->fpregs_active The fpregs_activate()/fpregs_deactivate() are currently called in such a pattern: if (!fpu->fpregs_active) fpregs_activate(fpu); ... if (fpu->fpregs_active) fpregs_deactivate(fpu); But note that it's actually safe to call them without checking the flag first. This further decouples the fpu->fpregs_active flag from actual FPU logic. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-20-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 7 +------ arch/x86/kernel/fpu/core.c | 3 +-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7fa676f93ac1..42a601673c09 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -526,8 +526,6 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - WARN_ON_FPU(!fpu->fpregs_active); - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); @@ -535,8 +533,6 @@ static inline void fpregs_deactivate(struct fpu *fpu) static inline void fpregs_activate(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpregs_active); - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); @@ -604,8 +600,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!fpu->fpregs_active) - fpregs_activate(fpu); + fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index eab244622402..01a47e9edfb4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,8 +426,7 @@ void fpu__drop(struct fpu *fpu) asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - if (fpu->fpregs_active) - fpregs_deactivate(fpu); + fpregs_deactivate(fpu); } } else { WARN_ON_FPU(fpu->fpregs_active); -- cgit From 99dc26bda233ee722bbd370bddf20beece3ffb93 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:03 +0200 Subject: x86/fpu: Remove struct fpu::fpregs_active The previous changes paved the way for the removal of the fpu::fpregs_active state flag - we now only have the fpu::fpstate_active and fpu::last_cpu fields left. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-21-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 5 ----- arch/x86/include/asm/fpu/types.h | 23 ----------------------- arch/x86/include/asm/trace/fpu.h | 5 +---- arch/x86/kernel/fpu/core.c | 9 --------- arch/x86/kernel/fpu/signal.c | 2 -- 5 files changed, 1 insertion(+), 43 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42a601673c09..629e7abcd6c9 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -526,14 +526,12 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } @@ -552,8 +550,6 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active); - if (old_fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -561,7 +557,6 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ - old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); } else old_fpu->last_cpu = -1; diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3c80f5b9c09d..0c314a397cf5 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -298,29 +298,6 @@ struct fpu { */ unsigned char fpstate_active; - /* - * @fpregs_active: - * - * This flag determines whether a given context is actively - * loaded into the FPU's registers and that those registers - * represent the task's current FPU state. - * - * Note the interaction with fpstate_active: - * - * # task does not use the FPU: - * fpstate_active == 0 - * - * # task uses the FPU and regs are active: - * fpstate_active == 1 && fpregs_active == 1 - * - * # the regs are inactive but still match fpstate: - * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu - * - * The third state is what we use for the lazy restore optimization - * on lazy-switching CPUs. - */ - unsigned char fpregs_active; - /* * @state: * diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 342e59789fcd..da565aae9fd2 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,7 +12,6 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpregs_active) __field(bool, fpstate_active) __field(u64, xfeatures) __field(u64, xcomp_bv) @@ -20,16 +19,14 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_fast_assign( __entry->fpu = fpu; - __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpregs_active, __entry->fpstate_active, __entry->xfeatures, __entry->xcomp_bv diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 01a47e9edfb4..93103a909c47 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -147,8 +147,6 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - trace_x86_fpu_before_save(fpu); if (fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(fpu)) { @@ -191,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) @@ -264,7 +261,6 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_fpstate_read(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * If fpregs are active (in the current CPU), then * copy them to the fpstate: @@ -365,7 +361,6 @@ void fpu__current_fpstate_write_end(void) { struct fpu *fpu = ¤t->thread.fpu; - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * 'fpu' now has an updated copy of the state, but the * registers may still be out of date. Update them with @@ -419,8 +414,6 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpstate_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" @@ -428,8 +421,6 @@ void fpu__drop(struct fpu *fpu) _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } - } else { - WARN_ON_FPU(fpu->fpregs_active); } fpu->fpstate_active = 0; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a88083ba7f8b..629106e51a29 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,8 +171,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpstate_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) -- cgit From 0852b374173bb57f870d78e6c6839c77b339be5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 23 Sep 2017 15:00:04 +0200 Subject: x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs On Skylake CPUs I noticed that XRSTOR is unable to deal with states created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not XFEATURE_MASK_FP. The reason is that part of the SSE/YMM state lives in the MXCSR and MXCSR_FLAGS fields of the FP state. Ensure that whenever we copy SSE or YMM state around, the MXCSR and MXCSR_FLAGS fields are also copied around. Signed-off-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170210085445.0f1cc708@annuminas.surriel.com Link: http://lkml.kernel.org/r/20170923130016.21448-22-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0c314a397cf5..71db45ca8870 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -68,6 +68,9 @@ struct fxregs_state { /* Default value for fxregs_state.mxcsr: */ #define MXCSR_DEFAULT 0x1f80 +/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */ +#define MXCSR_AND_FLAGS_SIZE sizeof(u64) + /* * Software based FPU emulation state. This is arbitrary really, * it matches the x87 format to make it easier to understand: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0ef35040d0ad..41c52256bdce 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -920,6 +920,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ +/* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return 0; + + if (xfeatures & XFEATURE_MASK_FP) + return 0; + + return 1; +} + /* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. @@ -988,6 +1005,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1070,6 +1093,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1122,6 +1151,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1177,6 +1212,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': -- cgit From 4f8cef59bad29344aca0e2e6b0ad18dadd078fd0 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Sat, 23 Sep 2017 15:00:05 +0200 Subject: x86/fpu: Fix boolreturn.cocci warnings arch/x86/kernel/fpu/xstate.c:931:9-10: WARNING: return of 0/1 in function 'xfeatures_mxcsr_quirk' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Signed-off-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Yu-cheng Yu Cc: kbuild-all@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20170306004553.GA25764@lkp-wsm-ep1 Link: http://lkml.kernel.org/r/20170923130016.21448-23-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 41c52256bdce..fda1109cc355 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -929,12 +929,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) { if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) - return 0; + return false; if (xfeatures & XFEATURE_MASK_FP) - return 0; + return false; - return 1; + return true; } /* -- cgit From 03eaec81ac09814817e9f0307d572ffe8365f980 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 23 Sep 2017 15:00:06 +0200 Subject: x86/fpu: Turn WARN_ON() in context switch into WARN_ON_FPU() copy_xregs_to_kernel checks if the alternatives have been already patched. This WARN_ON() is always executed in every context switch. All the other checks in fpu internal.h are WARN_ON_FPU(), but this one is plain WARN_ON(). I assume it was forgotten to switch it. So switch it to WARN_ON_FPU() too to avoid some unnecessary code in the context switch, and a potentially expensive cache line miss for the global variable. Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170329062605.4970-1-andi@firstfloor.org Link: http://lkml.kernel.org/r/20170923130016.21448-24-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 629e7abcd6c9..2dca7c65319c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -350,7 +350,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) u32 hmask = mask >> 32; int err; - WARN_ON(!alternatives_patched); + WARN_ON_FPU(!alternatives_patched); XSTATE_XSAVE(xstate, lmask, hmask, err); -- cgit From e19b205be43d11bff638cad4487008c48d21c103 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Sep 2017 16:38:56 -0700 Subject: Linux 4.14-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 64cbc66cebca..d1119941261c 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Fearless Coyote # *DOCUMENTATION* -- cgit From 5f3d862a736398e7068fa67142133f1713fdee8c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 18 Sep 2017 09:41:45 +0200 Subject: qxl: fix framebuffer unpinning qxl_plane_cleanup_fb() unpins the just activated framebuffer instead of the old one. Oops. Fix it. Cc: Gabriel Krisman Bertazi Fixes: 1277eed5fecb8830c8cc414ad70c1ef640464bc0 Signed-off-by: Gerd Hoffmann Reviewed-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170918074145.2257-1-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_display.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index e1dd05423e86..afbf50d0c08f 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -702,14 +702,15 @@ static void qxl_plane_cleanup_fb(struct drm_plane *plane, struct drm_gem_object *obj; struct qxl_bo *user_bo; - if (!plane->state->fb) { - /* we never executed prepare_fb, so there's nothing to + if (!old_state->fb) { + /* + * we never executed prepare_fb, so there's nothing to * unpin. */ return; } - obj = to_qxl_framebuffer(plane->state->fb)->obj; + obj = to_qxl_framebuffer(old_state->fb)->obj; user_bo = gem_to_qxl_bo(obj); qxl_bo_unpin(user_bo); } -- cgit From 814fb7bb7db5433757d76f4c4502c96fc53b0b5e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Sep 2017 15:00:07 +0200 Subject: x86/fpu: Don't let userspace set bogus xcomp_bv On x86, userspace can use the ptrace() or rt_sigreturn() system calls to set a task's extended state (xstate) or "FPU" registers. ptrace() can set them for another task using the PTRACE_SETREGSET request with NT_X86_XSTATE, while rt_sigreturn() can set them for the current task. In either case, registers can be set to any value, but the kernel assumes that the XSAVE area itself remains valid in the sense that the CPU can restore it. However, in the case where the kernel is using the uncompacted xstate format (which it does whenever the XSAVES instruction is unavailable), it was possible for userspace to set the xcomp_bv field in the xstate_header to an arbitrary value. However, all bits in that field are reserved in the uncompacted case, so when switching to a task with nonzero xcomp_bv, the XRSTOR instruction failed with a #GP fault. This caused the WARN_ON_FPU(err) in copy_kernel_to_xregs() to be hit. In addition, since the error is otherwise ignored, the FPU registers from the task previously executing on the CPU were leaked. Fix the bug by checking that the user-supplied value of xcomp_bv is 0 in the uncompacted case, and returning an error otherwise. The reason for validating xcomp_bv rather than simply overwriting it with 0 is that we want userspace to see an error if it (incorrectly) provides an XSAVE area in compacted format rather than in uncompacted format. Note that as before, in case of error we clear the task's FPU state. This is perhaps non-ideal, especially for PTRACE_SETREGSET; it might be better to return an error before changing anything. But it seems the "clear on error" behavior is fine for now, and it's a little tricky to do otherwise because it would mean we couldn't simply copy the full userspace state into kernel memory in one __copy_from_user(). This bug was found by syzkaller, which hit the above-mentioned WARN_ON_FPU(): WARNING: CPU: 1 PID: 0 at ./arch/x86/include/asm/fpu/internal.h:373 __switch_to+0x5b5/0x5d0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.13.0 #453 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff9ba2bc8e42c0 task.stack: ffffa78cc036c000 RIP: 0010:__switch_to+0x5b5/0x5d0 RSP: 0000:ffffa78cc08bbb88 EFLAGS: 00010082 RAX: 00000000fffffffe RBX: ffff9ba2b8bf2180 RCX: 00000000c0000100 RDX: 00000000ffffffff RSI: 000000005cb10700 RDI: ffff9ba2b8bf36c0 RBP: ffffa78cc08bbbd0 R08: 00000000929fdf46 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ba2bc8e42c0 R13: 0000000000000000 R14: ffff9ba2b8bf3680 R15: ffff9ba2bf5d7b40 FS: 00007f7e5cb10700(0000) GS:ffff9ba2bf400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004005cc CR3: 0000000079fd5000 CR4: 00000000001406e0 Call Trace: Code: 84 00 00 00 00 00 e9 11 fd ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 e7 fa ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 c2 fa ff ff <0f> ff 66 0f 1f 84 00 00 00 00 00 e9 d4 fc ff ff 66 66 2e 0f 1f Here is a C reproducer. The expected behavior is that the program spin forever with no output. However, on a buggy kernel running on a processor with the "xsave" feature but without the "xsaves" feature (e.g. Sandy Bridge through Broadwell for Intel), within a second or two the program reports that the xmm registers were corrupted, i.e. were not restored correctly. With CONFIG_X86_DEBUG_FPU=y it also hits the above kernel warning. #define _GNU_SOURCE #include #include #include #include #include #include #include #include int main(void) { int pid = fork(); uint64_t xstate[512]; struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) }; if (pid == 0) { bool tracee = true; for (int i = 0; i < sysconf(_SC_NPROCESSORS_ONLN) && tracee; i++) tracee = (fork() != 0); uint32_t xmm0[4] = { [0 ... 3] = tracee ? 0x00000000 : 0xDEADBEEF }; asm volatile(" movdqu %0, %%xmm0\n" " mov %0, %%rbx\n" "1: movdqu %%xmm0, %0\n" " mov %0, %%rax\n" " cmp %%rax, %%rbx\n" " je 1b\n" : "+m" (xmm0) : : "rax", "rbx", "xmm0"); printf("BUG: xmm registers corrupted! tracee=%d, xmm0=%08X%08X%08X%08X\n", tracee, xmm0[0], xmm0[1], xmm0[2], xmm0[3]); } else { usleep(100000); ptrace(PTRACE_ATTACH, pid, 0, 0); wait(NULL); ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); xstate[65] = -1; ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); ptrace(PTRACE_CONT, pid, 0, 0); wait(NULL); } return 1; } Note: the program only tests for the bug using the ptrace() system call. The bug can also be reproduced using the rt_sigreturn() system call, but only when called from a 32-bit program, since for 64-bit programs the kernel restores the FPU state from the signal frame by doing XRSTOR directly from userspace memory (with proper error checking). Reported-by: Dmitry Vyukov Signed-off-by: Eric Biggers Reviewed-by: Kees Cook Reviewed-by: Rik van Riel Acked-by: Dave Hansen Cc: [v3.17+] Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Fixes: 0b29643a5843 ("x86/xsaves: Change compacted format xsave area header") Link: http://lkml.kernel.org/r/20170922174156.16780-2-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-25-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 4 ++++ arch/x86/kernel/fpu/signal.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 19a7385a912c..c764f7405322 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -141,6 +141,10 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 629106e51a29..da68ea1c3a44 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,11 +324,16 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (using_compacted_format()) + if (using_compacted_format()) { err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); - else + } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; + } + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); -- cgit From d5c8028b4788f62b31fb79a331b3ad3e041fa366 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Sep 2017 15:00:09 +0200 Subject: x86/fpu: Reinitialize FPU registers if restoring FPU state fails Userspace can change the FPU state of a task using the ptrace() or rt_sigreturn() system calls. Because reserved bits in the FPU state can cause the XRSTOR instruction to fail, the kernel has to carefully validate that no reserved bits or other invalid values are being set. Unfortunately, there have been bugs in this validation code. For example, we were not checking that the 'xcomp_bv' field in the xstate_header was 0. As-is, such bugs are exploitable to read the FPU registers of other processes on the system. To do so, an attacker can create a task, assign to it an invalid FPU state, then spin in a loop and monitor the values of the FPU registers. Because the task's FPU registers are not being restored, sometimes the FPU registers will have the values from another process. This is likely to continue to be a problem in the future because the validation done by the CPU instructions like XRSTOR is not immediately visible to kernel developers. Nor will invalid FPU states ever be encountered during ordinary use --- they will only be seen during fuzzing or exploits. There can even be reserved bits outside the xstate_header which are easy to forget about. For example, the MXCSR register contains reserved bits, which were not validated by the KVM_SET_XSAVE ioctl until commit a575813bfe4b ("KVM: x86: Fix load damaged SSEx MXCSR register"). Therefore, mitigate this class of vulnerability by restoring the FPU registers from init_fpstate if restoring from the task's state fails. We actually used to do this, but it was (perhaps unwisely) removed by commit 9ccc27a5d297 ("x86/fpu: Remove error return values from copy_kernel_to_*regs() functions"). This new patch is also a bit different. First, it only clears the registers, not also the bad in-memory state; this is simpler and makes it easier to make the mitigation cover all callers of __copy_kernel_to_fpregs(). Second, it does the register clearing in an exception handler so that no extra instructions are added to context switches. In fact, we *remove* instructions, since previously we were always zeroing the register containing 'err' even if CONFIG_X86_DEBUG_FPU was disabled. Signed-off-by: Eric Biggers Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170922174156.16780-4-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-27-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 51 +++++++++++-------------------------- arch/x86/mm/extable.c | 24 +++++++++++++++++ 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2dca7c65319c..cf290d424e48 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -120,20 +120,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) -#define check_insn(insn, output, input...) \ -({ \ - int err; \ +#define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ + : output : input) static inline int copy_fregs_to_user(struct fregs_state __user *fx) { @@ -153,20 +144,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { - int err; - if (IS_ENABLED(CONFIG_X86_32)) { - err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { - err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ - err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } } - /* Copying from a kernel buffer to FPU registers should never fail: */ - WARN_ON_FPU(err); } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) @@ -183,9 +170,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) static inline void copy_kernel_to_fregs(struct fregs_state *fx) { - int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - - WARN_ON_FPU(err); + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fregs(struct fregs_state __user *fx) @@ -281,18 +266,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ -#define XSTATE_XRESTORE(st, lmask, hmask, err) \ +#define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile(ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ - "xor %[err], %[err]\n" \ "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ + _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ + : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -336,7 +316,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ WARN_ON_FPU(err); } @@ -365,12 +348,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - int err; - - XSTATE_XRESTORE(xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ - WARN_ON_FPU(err); + XSTATE_XRESTORE(xstate, lmask, hmask); } /* diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index c076f710de4c..c3521e2be396 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_refcount); +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { -- cgit From 10430364ebb562311ba6a6efa74e0c2298007912 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 29 Aug 2017 23:47:11 +0530 Subject: x86/numachip: Add const and __initconst to numachip2_clockevent Make this const as it is only used during a copy operation and add __initconst as this usage is during the initialization phase. Signed-off-by: Bhumika Goyal Signed-off-by: Thomas Gleixner Cc: julia.lawall@lip6.fr Cc: daniel.lezcano@linaro.org Link: http://lkml.kernel.org/r/1504030631-10812-1-git-send-email-bhumirks@gmail.com --- drivers/clocksource/numachip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/numachip.c b/drivers/clocksource/numachip.c index 6a20dc8b253f..9a7d7f0f23fe 100644 --- a/drivers/clocksource/numachip.c +++ b/drivers/clocksource/numachip.c @@ -43,7 +43,7 @@ static int numachip2_set_next_event(unsigned long delta, struct clock_event_devi return 0; } -static struct clock_event_device numachip2_clockevent = { +static const struct clock_event_device numachip2_clockevent __initconst = { .name = "numachip2", .rating = 400, .set_next_event = numachip2_set_next_event, -- cgit From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 4 Sep 2017 10:32:15 +0200 Subject: x86/mm: Fix fault error path using unsafe vma pointer commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") passes down a vma pointer to the error path, but that is done once the mmap_sem is released when calling mm_fault_error() from __do_page_fault(). This is dangerous as the vma structure is no more safe to be used once the mmap_sem has been released. As only the protection key value is required in the error processing, we could just pass down this value. Fix it by passing a pointer to a protection key value down to the fault signal generation code. The use of a pointer allows to keep the check generating a warning message in fill_sig_info_pkey() when the vma was not known. If the pointer is valid, the protection value can be accessed by deferencing the pointer. [ tglx: Made *pkey u32 as that's the type which is passed in siginfo ] Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") Signed-off-by: Laurent Dufour Signed-off-by: Thomas Gleixner Cc: linux-mm@kvack.org Cc: Dave Hansen Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com --- arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 39567b5c33da..e2baeaa053a5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -762,8 +760,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -788,7 +784,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -896,8 +892,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -945,7 +940,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -958,9 +953,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -968,6 +963,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -975,7 +974,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -1018,7 +1018,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1045,13 +1045,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1075,9 +1074,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1267,6 +1266,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1467,9 +1467,10 @@ good_area: return; } + pkey = vma_pkey(vma); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } -- cgit From 7d7099433d9eaaa5a989a55f1fa354c16a3ad297 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Mon, 11 Sep 2017 08:33:21 +0800 Subject: x86/sysfs: Fix off-by-one error in loop termination An off-by-one error in loop terminantion conditions in create_setup_data_nodes() will lead to memory leak when create_setup_data_node() failed. Signed-off-by: Sean Fu Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1505090001-1157-1-git-send-email-fxinrong@gmail.com --- arch/x86/kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4b0592ca9e47..8c1cc08f514f 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent) return 0; out_clean_nodes: - for (j = i - 1; j > 0; j--) + for (j = i - 1; j >= 0; j--) cleanup_setup_data_node(*(kobjp + j)); kfree(kobjp); out_setup_data_kobj: -- cgit From 5ac751d9e6b187c4a0000879d6598eb2292db949 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Tue, 12 Sep 2017 19:40:00 +0300 Subject: x86: Don't cast away the __user in __get_user_asm_u64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't cast away the __user in __get_user_asm_u64() on x86-32. Prevents sparse getting upset. Signed-off-by: Ville Syrjälä Signed-off-by: Thomas Gleixner Cc: Benjamin LaHaise Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170912164000.13745-1-ville.syrjala@linux.intel.com --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 78e8fcc87d4c..4b892917edeb 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -337,7 +337,7 @@ do { \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ : "=r" (retval), "=&A"(x) \ - : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) -- cgit From b09c146f8f63c0e03adba74df76bf9c2be466fec Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:47 -0400 Subject: perf/x86/intel/cstate: Add missing CPU IDs Skylake server uses the same C-state residency events as Sandy Bridge. Denverton and Gemini lake use the same C-state residency events as Apollo Lake. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-1-kan.liang@intel.com --- arch/x86/events/intel/cstate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4cf100ff2a37..72db0664a53d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -552,6 +552,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), @@ -560,6 +561,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -- cgit From 1aaccc40a1864053da26605b0297be16dd52641e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:48 -0400 Subject: perf/x86/msr: Add missing CPU IDs Goldmont, Glodmont plus and Xeon Phi have MSR_SMI_COUNT as well. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-2-kan.liang@intel.com --- arch/x86/events/msr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4bb3ec69e8ea..06723671ae4e 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -63,6 +63,14 @@ static bool test_intel(int idx) case INTEL_FAM6_ATOM_SILVERMONT1: case INTEL_FAM6_ATOM_SILVERMONT2: case INTEL_FAM6_ATOM_AIRMONT: + + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_DENVERTON: + + case INTEL_FAM6_ATOM_GEMINI_LAKE: + + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: if (idx == PERF_MSR_SMI) return true; break; -- cgit From 450a97893559354b927c935f39ee11126f01f520 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:49 -0400 Subject: perf/x86/intel/rapl: Add missing CPU IDs DENVERTON and GEMINI_LAKE support same RAPL counters as Apollo Lake. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-3-kan.liang@intel.com --- arch/x86/events/intel/rapl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 8e2457cb6b4a..005908ee9333 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), {}, }; -- cgit From 29b46dfb136cdbeece542b3f01115237e43f2855 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 11 Sep 2017 10:10:15 -0700 Subject: perf/x86/intel/uncore: Correct num_boxes for IIO and IRP There are 6 IIO/IRP boxes for CBDMA, PCIe0-2, MCP 0 and MCP 1 separately. Correct the num_boxes. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1505149816-12580-1-git-send-email-kan.liang@intel.com --- arch/x86/events/intel/uncore_snbep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index db1fe377e6dd..a7196818416a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3462,7 +3462,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IIO0_MSR_PMON_CTL0, .perf_ctr = SKX_IIO0_MSR_PMON_CTR0, @@ -3492,7 +3492,7 @@ static const struct attribute_group skx_uncore_format_group = { static struct intel_uncore_type skx_uncore_irp = { .name = "irp", .num_counters = 2, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IRP0_MSR_PMON_CTL0, .perf_ctr = SKX_IRP0_MSR_PMON_CTR0, -- cgit From 0add53713b1c07a1c71e27a20e21eb7c180b4e7b Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 19 Sep 2017 16:40:21 +0200 Subject: microblaze: Add missing kvm_para.h to Kbuild Running make allmodconfig;make is throwing compilation error: CC kernel/watchdog.o In file included from ./include/linux/kvm_para.h:4:0, from kernel/watchdog.c:29: ./include/uapi/linux/kvm_para.h:32:26: fatal error: asm/kvm_para.h: No such file or directory #include ^ compilation terminated. make[1]: *** [kernel/watchdog.o] Error 1 make: *** [kernel/watchdog.o] Error 2 Reported-by: Michal Hocko Suggested-by: Geert Uytterhoeven Signed-off-by: Michal Simek Fixes: 83f0124ad81e87b ("microblaze: remove asm-generic wrapper headers") Reviewed-by: Tobias Klauser Tested-by: Michal Hocko --- arch/microblaze/include/uapi/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index e77a596f3f1e..06609ca36115 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += fcntl.h generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h +generic-y += kvm_para.h generic-y += mman.h generic-y += msgbuf.h generic-y += param.h -- cgit From 64c99853baca40e2f06038c4a926009edd14c7c3 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 00:29:36 +0200 Subject: microblaze: Cocci spatch "vma_pages" Use vma_pages function on vma object instead of explicit computation. Found by coccinelle spatch "api/vma_pages.cocci" Signed-off-by: Thomas Meyer Signed-off-by: Michal Simek --- arch/microblaze/kernel/dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index e45ada8fb006..94700c5270a9 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -165,7 +165,7 @@ int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma, unsigned long attrs) { #ifdef CONFIG_MMU - unsigned long user_count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; unsigned long pfn; -- cgit From 428dbf156cc5a8f9994d1f1a5c79373d15476f3c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 18 Sep 2017 10:53:29 -0600 Subject: arch: change default endian for microblaze Fix the default for microblaze. Michal Simek mentioned default for microblaze should be CPU_LITTLE_ENDIAN. Fixes : commit 206d3642d8ee ("arch/microblaze: add choice for endianness and update Makefile") Signed-off-by: Babu Moger Cc: Michal Simek Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 9d26abdf0dc1..4f798aa671dd 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -39,7 +39,7 @@ config MICROBLAZE # Endianness selection choice prompt "Endianness selection" - default CPU_BIG_ENDIAN + default CPU_LITTLE_ENDIAN help microblaze architectures can be configured for either little or big endian formats. Be sure to select the appropriate mode. -- cgit From 89975bd335f37b96ffd3cc24b9effb1fa25e7788 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Sep 2017 16:41:34 -0300 Subject: perf tools: Get all of tools/{arch,include}/ in the MANIFEST Now that I'm switching the container builds from using a local volume pointing to the kernel repository with the perf sources, instead getting a detached tarball to be able to use a container cluster, some places broke because I forgot to put some of the required files in tools/perf/MANIFEST, namely some bitsperlong.h files. So, to fix it do the same as for tools/build/ and pack the whole tools/arch/ directory. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wmenpjfjsobwdnfde30qqncj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 87 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 85 deletions(-) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 62072822dc85..627b7cada144 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,34 +1,8 @@ tools/perf -tools/arch/alpha/include/asm/barrier.h -tools/arch/arm/include/asm/barrier.h -tools/arch/arm64/include/asm/barrier.h -tools/arch/ia64/include/asm/barrier.h -tools/arch/mips/include/asm/barrier.h -tools/arch/powerpc/include/asm/barrier.h -tools/arch/s390/include/asm/barrier.h -tools/arch/sh/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier_32.h -tools/arch/sparc/include/asm/barrier_64.h -tools/arch/tile/include/asm/barrier.h -tools/arch/x86/include/asm/barrier.h -tools/arch/x86/include/asm/cmpxchg.h -tools/arch/x86/include/asm/cpufeatures.h -tools/arch/x86/include/asm/disabled-features.h -tools/arch/x86/include/asm/required-features.h -tools/arch/x86/include/uapi/asm/svm.h -tools/arch/x86/include/uapi/asm/vmx.h -tools/arch/x86/include/uapi/asm/kvm.h -tools/arch/x86/include/uapi/asm/kvm_perf.h -tools/arch/x86/lib/memcpy_64.S -tools/arch/x86/lib/memset_64.S -tools/arch/s390/include/uapi/asm/kvm_perf.h -tools/arch/s390/include/uapi/asm/sie.h -tools/arch/xtensa/include/asm/barrier.h +tools/arch tools/scripts tools/build -tools/arch/x86/include/asm/atomic.h -tools/arch/x86/include/asm/rmwcc.h +tools/include tools/lib/traceevent tools/lib/api tools/lib/bpf @@ -42,60 +16,3 @@ tools/lib/find_bit.c tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c -tools/include/asm/alternative-asm.h -tools/include/asm/atomic.h -tools/include/asm/barrier.h -tools/include/asm/bug.h -tools/include/asm-generic/atomic-gcc.h -tools/include/asm-generic/barrier.h -tools/include/asm-generic/bitops/arch_hweight.h -tools/include/asm-generic/bitops/atomic.h -tools/include/asm-generic/bitops/const_hweight.h -tools/include/asm-generic/bitops/__ffs.h -tools/include/asm-generic/bitops/__ffz.h -tools/include/asm-generic/bitops/__fls.h -tools/include/asm-generic/bitops/find.h -tools/include/asm-generic/bitops/fls64.h -tools/include/asm-generic/bitops/fls.h -tools/include/asm-generic/bitops/hweight.h -tools/include/asm-generic/bitops.h -tools/include/linux/atomic.h -tools/include/linux/bitops.h -tools/include/linux/compiler.h -tools/include/linux/compiler-gcc.h -tools/include/linux/coresight-pmu.h -tools/include/linux/bug.h -tools/include/linux/filter.h -tools/include/linux/hash.h -tools/include/linux/kernel.h -tools/include/linux/list.h -tools/include/linux/log2.h -tools/include/uapi/asm-generic/fcntl.h -tools/include/uapi/asm-generic/ioctls.h -tools/include/uapi/asm-generic/mman-common.h -tools/include/uapi/asm-generic/mman.h -tools/include/uapi/drm/drm.h -tools/include/uapi/drm/i915_drm.h -tools/include/uapi/linux/bpf.h -tools/include/uapi/linux/bpf_common.h -tools/include/uapi/linux/fcntl.h -tools/include/uapi/linux/hw_breakpoint.h -tools/include/uapi/linux/kvm.h -tools/include/uapi/linux/mman.h -tools/include/uapi/linux/perf_event.h -tools/include/uapi/linux/sched.h -tools/include/uapi/linux/stat.h -tools/include/uapi/linux/vhost.h -tools/include/uapi/sound/asound.h -tools/include/linux/poison.h -tools/include/linux/rbtree.h -tools/include/linux/rbtree_augmented.h -tools/include/linux/refcount.h -tools/include/linux/string.h -tools/include/linux/stringify.h -tools/include/linux/types.h -tools/include/linux/err.h -tools/include/linux/bitmap.h -tools/include/linux/time64.h -tools/arch/*/include/uapi/asm/mman.h -tools/arch/*/include/uapi/asm/perf_regs.h -- cgit From 549a3976523c69a0245c0a310210c824a0b26e35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Sep 2017 09:38:23 +0200 Subject: tools include: Sync kernel ABI headers with tooling headers Time for a sync with ABI/uapi headers with the upcoming v4.14 kernel. None of the ABI changes require any source code level changes to our existing in-kernel tooling code: - tools/arch/s390/include/uapi/asm/kvm.h: New KVM_S390_VM_TOD_EXT ABI, not used by in-kernel tooling. - tools/arch/x86/include/asm/cpufeatures.h: tools/arch/x86/include/asm/disabled-features.h: New PCID, SME and VGIF x86 CPU feature bits defined. - tools/include/asm-generic/hugetlb_encode.h: tools/include/uapi/asm-generic/mman-common.h: tools/include/uapi/linux/mman.h: Two new madvise() flags, plus a hugetlb system call mmap flags restructuring/extension changes. - tools/include/uapi/drm/drm.h: tools/include/uapi/drm/i915_drm.h: New drm_syncobj_create flags definitions, new drm_syncobj_wait and drm_syncobj_array ABIs. DRM_I915_PERF_* calls and a new I915_PARAM_HAS_EXEC_FENCE_ARRAY ABI for the Intel driver. - tools/include/uapi/linux/bpf.h: New bpf_sock fields (::mark and ::priority), new XDP_REDIRECT action, new kvm_ppc_smmu_info fields (::data_keys, instr_keys) Signed-off-by: Ingo Molnar Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Taeung Song Cc: Wang Nan Cc: Yao Jin Link: http://lkml.kernel.org/r/20170913073823.lxmi4c7ejqlfabjx@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/s390/include/uapi/asm/kvm.h | 6 +++ tools/arch/x86/include/asm/cpufeatures.h | 2 + tools/arch/x86/include/asm/disabled-features.h | 4 +- tools/include/asm-generic/hugetlb_encode.h | 34 +++++++++++++++++ tools/include/uapi/asm-generic/mman-common.h | 14 ++----- tools/include/uapi/drm/drm.h | 22 +++++++++++ tools/include/uapi/drm/i915_drm.h | 51 +++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 32 ++++++++++------ tools/include/uapi/linux/kvm.h | 3 +- tools/include/uapi/linux/mman.h | 24 +++++++++++- 10 files changed, 164 insertions(+), 28 deletions(-) create mode 100644 tools/include/asm-generic/hugetlb_encode.h diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 69d09c39bbcd..cd7359e23d86 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -88,6 +88,12 @@ struct kvm_s390_io_adapter_req { /* kvm attributes for KVM_S390_VM_TOD */ #define KVM_S390_VM_TOD_LOW 0 #define KVM_S390_VM_TOD_HIGH 1 +#define KVM_S390_VM_TOD_EXT 2 + +struct kvm_s390_vm_tod_clock { + __u8 epoch_idx; + __u64 tod; +}; /* kvm attributes for KVM_S390_VM_CPU_MODEL */ /* processor related attributes are r/w */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 8ea315a11fe0..2519c6c801c9 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ @@ -287,6 +288,7 @@ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h new file mode 100644 index 000000000000..e4732d3c2998 --- /dev/null +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -0,0 +1,34 @@ +#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_ +#define _ASM_GENERIC_HUGETLB_ENCODE_H_ + +/* + * Several system calls take a flag to request "hugetlb" huge pages. + * Without further specification, these system calls will use the + * system's default huge page size. If a system supports multiple + * huge page sizes, the desired huge page size can be specified in + * bits [26:31] of the flag arguments. The value in these 6 bits + * will encode the log2 of the huge page size. + * + * The following definitions are associated with this huge page size + * encoding in flag arguments. System call specific header files + * that use this encoding should include this file. They can then + * provide definitions based on these with their own specific prefix. + * for example: + * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT + */ + +#define HUGETLB_FLAG_ENCODE_SHIFT 26 +#define HUGETLB_FLAG_ENCODE_MASK 0x3f + +#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) + +#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 8c27db0c5c08..203268f9231e 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -58,20 +58,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 101593ab10ac..97677cd6964d 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -700,6 +700,7 @@ struct drm_prime_handle { struct drm_syncobj_create { __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) __u32 flags; }; @@ -718,6 +719,24 @@ struct drm_syncobj_handle { __u32 pad; }; +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; +}; + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + #if defined(__cplusplus) } #endif @@ -840,6 +859,9 @@ extern "C" { #define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) #define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) #define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) /** * Device specific ioctls should only be in their respective headers diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 7ccbd6a2bbe0..6598fb76d2c2 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -260,6 +260,8 @@ typedef struct _drm_i915_sarea { #define DRM_I915_GEM_CONTEXT_GETPARAM 0x34 #define DRM_I915_GEM_CONTEXT_SETPARAM 0x35 #define DRM_I915_PERF_OPEN 0x36 +#define DRM_I915_PERF_ADD_CONFIG 0x37 +#define DRM_I915_PERF_REMOVE_CONFIG 0x38 #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) @@ -315,6 +317,8 @@ typedef struct _drm_i915_sarea { #define DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_GETPARAM, struct drm_i915_gem_context_param) #define DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_SETPARAM, struct drm_i915_gem_context_param) #define DRM_IOCTL_I915_PERF_OPEN DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_OPEN, struct drm_i915_perf_open_param) +#define DRM_IOCTL_I915_PERF_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config) +#define DRM_IOCTL_I915_PERF_REMOVE_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64) /* Allow drivers to submit batchbuffers directly to hardware, relying * on the security mechanisms provided by hardware. @@ -431,6 +435,11 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_HAS_EXEC_BATCH_FIRST 48 +/* Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying an array of + * drm_i915_gem_exec_fence structures. See I915_EXEC_FENCE_ARRAY. + */ +#define I915_PARAM_HAS_EXEC_FENCE_ARRAY 49 + typedef struct drm_i915_getparam { __s32 param; /* @@ -812,6 +821,17 @@ struct drm_i915_gem_exec_object2 { __u64 rsvd2; }; +struct drm_i915_gem_exec_fence { + /** + * User's handle for a drm_syncobj to wait on or signal. + */ + __u32 handle; + +#define I915_EXEC_FENCE_WAIT (1<<0) +#define I915_EXEC_FENCE_SIGNAL (1<<1) + __u32 flags; +}; + struct drm_i915_gem_execbuffer2 { /** * List of gem_exec_object2 structs @@ -826,7 +846,11 @@ struct drm_i915_gem_execbuffer2 { __u32 DR1; __u32 DR4; __u32 num_cliprects; - /** This is a struct drm_clip_rect *cliprects */ + /** + * This is a struct drm_clip_rect *cliprects if I915_EXEC_FENCE_ARRAY + * is not set. If I915_EXEC_FENCE_ARRAY is set, then this is a + * struct drm_i915_gem_exec_fence *fences. + */ __u64 cliprects_ptr; #define I915_EXEC_RING_MASK (7<<0) #define I915_EXEC_DEFAULT (0<<0) @@ -927,7 +951,14 @@ struct drm_i915_gem_execbuffer2 { * element). */ #define I915_EXEC_BATCH_FIRST (1<<18) -#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1)) + +/* Setting I915_FENCE_ARRAY implies that num_cliprects and cliprects_ptr + * define an array of i915_gem_exec_fence structures which specify a set of + * dma fences to wait upon or signal. + */ +#define I915_EXEC_FENCE_ARRAY (1<<19) + +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1)) #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff) #define i915_execbuffer2_set_context_id(eb2, context) \ @@ -1467,6 +1498,22 @@ enum drm_i915_perf_record_type { DRM_I915_PERF_RECORD_MAX /* non-ABI */ }; +/** + * Structure to upload perf dynamic configuration into the kernel. + */ +struct drm_i915_perf_oa_config { + /** String formatted like "%08x-%04x-%04x-%04x-%012x" */ + char uuid[36]; + + __u32 n_mux_regs; + __u32 n_boolean_regs; + __u32 n_flex_regs; + + __u64 __user mux_regs_ptr; + __u64 __user boolean_regs_ptr; + __u64 __user flex_regs_ptr; +}; + #if defined(__cplusplus) } #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 461811e57140..43ab5c402f98 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -143,12 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -enum bpf_sockmap_flags { - BPF_SOCKMAP_UNSPEC, - BPF_SOCKMAP_STRPARSER, - __MAX_BPF_SOCKMAP_FLAG -}; - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -368,9 +362,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) + * redirect to endpoint in map + * @map: pointer to dev map + * @key: index in map to lookup + * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -632,7 +637,7 @@ union bpf_attr { FN(skb_adjust_room), \ FN(redirect_map), \ FN(sk_redirect_map), \ - FN(sock_map_update), + FN(sock_map_update), \ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -753,20 +758,23 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). */ enum xdp_action { XDP_ABORTED = 0, XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6cd63c18708a..838887587411 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size { struct kvm_ppc_smmu_info { __u64 flags; __u32 slb_size; - __u32 pad; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index 81d8edf11789..a937480d7cd3 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -1,7 +1,8 @@ #ifndef _UAPI_LINUX_MMAN_H #define _UAPI_LINUX_MMAN_H -#include +#include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -10,4 +11,25 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +/* + * Huge page size encoding when MAP_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + #endif /* _UAPI_LINUX_MMAN_H */ -- cgit From f1e52f14a69386ac460a8d700df0647a631cf595 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 22 Sep 2017 15:41:44 -0300 Subject: perf evsel: Fix attr.exclude_kernel setting for default cycles:p Yet another fix for probing the max attr.precise_ip setting: it is not enough settting attr.exclude_kernel for !root users, as they _can_ profile the kernel if the kernel.perf_event_paranoid sysctl is set to -1, so check that as well. Testing it: As non root: $ sysctl kernel.perf_event_paranoid kernel.perf_event_paranoid = 2 $ perf record sleep 1 $ perf evlist -v cycles:uppp: ..., exclude_kernel: 1, ... precise_ip: 3, ... Now as non-root, but with kernel.perf_event_paranoid set set to the most permissive value, -1: $ sysctl kernel.perf_event_paranoid kernel.perf_event_paranoid = -1 $ perf record sleep 1 $ perf evlist -v cycles:ppp: ..., exclude_kernel: 0, ... precise_ip: 3, ... $ I.e. non-root, default kernel.perf_event_paranoid: :uppp modifier = not allowed to sample the kernel, non-root, most permissible kernel.perf_event_paranoid: :ppp = allowed to sample the kernel. In both cases, use the highest available precision: attr.precise_ip = 3. Reported-and-Tested-by: Ingo Molnar Cc: Adrian Hunter Cc: Andy Lutomirski Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: d37a36979077 ("perf evsel: Fix attr.exclude_kernel setting for default cycles:p") Link: http://lkml.kernel.org/n/tip-nj2qkf75xsd6pw6hhjzfqqdx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4bb89373eb52..0dccdb89572c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -271,12 +271,17 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) return evsel; } +static bool perf_event_can_profile_kernel(void) +{ + return geteuid() == 0 || perf_event_paranoid() == -1; +} + struct perf_evsel *perf_evsel__new_cycles(bool precise) { struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .exclude_kernel = geteuid() != 0, + .exclude_kernel = !perf_event_can_profile_kernel(), }; struct perf_evsel *evsel; -- cgit From 44d8143340a99b167c74365e844516b73523c087 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:40 -0700 Subject: KEYS: fix cred refcount leak in request_key_auth_new() In request_key_auth_new(), if key_alloc() or key_instantiate_and_link() were to fail, we would leak a reference to the 'struct cred'. Currently this can only happen if key_alloc() fails to allocate memory. But it still should be fixed, as it is a more severe bug waiting to happen. Fix it by cleaning things up to use a helper function which frees a 'struct request_key_auth' correctly. Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 68 ++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index afe9d22ab361..69d6b3b35470 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -120,6 +120,18 @@ static void request_key_auth_revoke(struct key *key) } } +static void free_request_key_auth(struct request_key_auth *rka) +{ + if (!rka) + return; + key_put(rka->target_key); + key_put(rka->dest_keyring); + if (rka->cred) + put_cred(rka->cred); + kfree(rka->callout_info); + kfree(rka); +} + /* * Destroy an instantiation authorisation token key. */ @@ -129,15 +141,7 @@ static void request_key_auth_destroy(struct key *key) kenter("{%d}", key->serial); - if (rka->cred) { - put_cred(rka->cred); - rka->cred = NULL; - } - - key_put(rka->target_key); - key_put(rka->dest_keyring); - kfree(rka->callout_info); - kfree(rka); + free_request_key_auth(rka); } /* @@ -151,22 +155,17 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, const struct cred *cred = current->cred; struct key *authkey = NULL; char desc[20]; - int ret; + int ret = -ENOMEM; kenter("%d,", target->serial); /* allocate a auth record */ - rka = kmalloc(sizeof(*rka), GFP_KERNEL); - if (!rka) { - kleave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } + rka = kzalloc(sizeof(*rka), GFP_KERNEL); + if (!rka) + goto error; rka->callout_info = kmalloc(callout_len, GFP_KERNEL); - if (!rka->callout_info) { - kleave(" = -ENOMEM"); - kfree(rka); - return ERR_PTR(-ENOMEM); - } + if (!rka->callout_info) + goto error_free_rka; /* see if the calling process is already servicing the key request of * another process */ @@ -176,8 +175,12 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ - if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags)) - goto auth_key_revoked; + if (test_bit(KEY_FLAG_REVOKED, + &cred->request_key_auth->flags)) { + up_read(&cred->request_key_auth->sem); + ret = -EKEYREVOKED; + goto error_free_rka; + } irka = cred->request_key_auth->payload.data[0]; rka->cred = get_cred(irka->cred); @@ -205,32 +208,23 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); - goto error_alloc; + goto error_free_rka; } /* construct the auth key */ ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); if (ret < 0) - goto error_inst; + goto error_put_authkey; kleave(" = {%d,%d}", authkey->serial, refcount_read(&authkey->usage)); return authkey; -auth_key_revoked: - up_read(&cred->request_key_auth->sem); - kfree(rka->callout_info); - kfree(rka); - kleave("= -EKEYREVOKED"); - return ERR_PTR(-EKEYREVOKED); - -error_inst: +error_put_authkey: key_revoke(authkey); key_put(authkey); -error_alloc: - key_put(rka->target_key); - key_put(rka->dest_keyring); - kfree(rka->callout_info); - kfree(rka); +error_free_rka: + free_request_key_auth(rka); +error: kleave("= %d", ret); return ERR_PTR(ret); } -- cgit From f7b48cf08fa63a68b59c2894806ee478216d7f91 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:41 -0700 Subject: KEYS: don't revoke uninstantiated key in request_key_auth_new() If key_instantiate_and_link() were to fail (which fortunately isn't possible currently), the call to key_revoke(authkey) would crash with a NULL pointer dereference in request_key_auth_revoke() because the key has not yet been instantiated. Fix this by removing the call to key_revoke(). key_put() is sufficient, as it's not possible for an uninstantiated authkey to have been used for anything yet. Fixes: b5f545c880a2 ("[PATCH] keys: Permit running process to instantiate keys") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 69d6b3b35470..e356075ed2f8 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -220,7 +220,6 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, return authkey; error_put_authkey: - key_revoke(authkey); key_put(authkey); error_free_rka: free_request_key_auth(rka); -- cgit From 884bee0215fcc239b30c062c37ca29077005e064 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:12 -0700 Subject: KEYS: fix key refcount leak in keyctl_assume_authority() In keyctl_assume_authority(), if keyctl_change_reqkey_auth() were to fail, we would leak the reference to the 'authkey'. Currently this can only happen if prepare_creds() fails to allocate memory. But it still should be fixed, as it is a more severe bug waiting to happen. This patch also moves the read of 'authkey->serial' to before the reference to the authkey is dropped. Doing the read after dropping the reference is very fragile because it assumes we still hold another reference to the key. (Which we do, in current->cred->request_key_auth, but there's no reason not to write it in the "obviously correct" way.) Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index ab0b337c84b4..562f7fe287a0 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1406,11 +1406,9 @@ long keyctl_assume_authority(key_serial_t id) } ret = keyctl_change_reqkey_auth(authkey); - if (ret < 0) - goto error; + if (ret == 0) + ret = authkey->serial; key_put(authkey); - - ret = authkey->serial; error: return ret; } -- cgit From 7fc0786d956d9e59b68d282be9b156179846ea3d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:31 -0700 Subject: KEYS: fix key refcount leak in keyctl_read_key() In keyctl_read_key(), if key_permission() were to return an error code other than EACCES, we would leak a the reference to the key. This can't actually happen currently because key_permission() can only return an error code other than EACCES if security_key_permission() does, only SELinux and Smack implement that hook, and neither can return an error code other than EACCES. But it should still be fixed, as it is a bug waiting to happen. Fixes: 29db91906340 ("[PATCH] Keys: Add LSM hooks for key management [try #3]") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 562f7fe287a0..aa1d11a29136 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -771,7 +771,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) if (ret == 0) goto can_read_key; if (ret != -EACCES) - goto error; + goto error2; /* we can't; see if it's searchable from this process's keyrings * - we automatically take account of the fact that it may be -- cgit From e645016abc803dafc75e4b8f6e4118f088900ffb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:45 -0700 Subject: KEYS: fix writing past end of user-supplied buffer in keyring_read() Userspace can call keyctl_read() on a keyring to get the list of IDs of keys in the keyring. But if the user-supplied buffer is too small, the kernel would write the full list anyway --- which will corrupt whatever userspace memory happened to be past the end of the buffer. Fix it by only filling the space that is available. Fixes: b2a4df200d57 ("KEYS: Expand the capacity of a keyring") Cc: [v3.13+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index de81793f9920..94f038967c17 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -423,7 +423,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) } struct keyring_read_iterator_context { - size_t qty; + size_t buflen; size_t count; key_serial_t __user *buffer; }; @@ -435,9 +435,9 @@ static int keyring_read_iterator(const void *object, void *data) int ret; kenter("{%s,%d},,{%zu/%zu}", - key->type->name, key->serial, ctx->count, ctx->qty); + key->type->name, key->serial, ctx->count, ctx->buflen); - if (ctx->count >= ctx->qty) + if (ctx->count >= ctx->buflen) return 1; ret = put_user(key->serial, ctx->buffer); @@ -472,16 +472,12 @@ static long keyring_read(const struct key *keyring, return 0; /* Calculate how much data we could return */ - ctx.qty = nr_keys * sizeof(key_serial_t); - if (!buffer || !buflen) - return ctx.qty; - - if (buflen > ctx.qty) - ctx.qty = buflen; + return nr_keys * sizeof(key_serial_t); /* Copy the IDs of the subscribed keys into the buffer */ ctx.buffer = (key_serial_t __user *)buffer; + ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { -- cgit From 237bbd29f7a049d310d907f4b2716a7feef9abf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: KEYS: prevent creating a different user's keyrings It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Cc: [v2.6.26+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- include/linux/key.h | 2 ++ security/keys/internal.h | 2 +- security/keys/key.c | 2 ++ security/keys/keyring.c | 23 ++++++++++++++--------- security/keys/process_keys.c | 6 ++++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index 044114185120..e315e16b6ff8 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -187,6 +187,7 @@ struct key { #define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 10 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -243,6 +244,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 1c02c6547038..503adbae7b0d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -141,7 +141,7 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); -extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); +extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); diff --git a/security/keys/key.c b/security/keys/key.c index 83da68d98b40..e5c0896c3a8f 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -302,6 +302,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; + if (flags & KEY_ALLOC_UID_KEYRING) + key->flags |= 1 << KEY_FLAG_UID_KEYRING; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 94f038967c17..4fa82a8a9c0e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1097,15 +1097,15 @@ found: /* * Find a keyring with the specified name. * - * All named keyrings in the current user namespace are searched, provided they - * grant Search permission directly to the caller (unless this check is - * skipped). Keyrings whose usage points have reached zero or who have been - * revoked are skipped. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ -struct key *find_keyring_by_name(const char *name, bool skip_perm_check) +struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct key *keyring; int bucket; @@ -1133,10 +1133,15 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) if (strcmp(keyring->description, name) != 0) continue; - if (!skip_perm_check && - key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) + continue; + } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 86bced9fdbdf..293d3598153b 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -77,7 +77,8 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -94,7 +95,8 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); -- cgit From 37863c43b2c6464f252862bf2e9768264e961678 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:23 -0700 Subject: KEYS: prevent KEYCTL_READ on negative key Because keyctl_read_key() looks up the key with no permissions requested, it may find a negatively instantiated key. If the key is also possessed, we went ahead and called ->read() on the key. But the key payload will actually contain the ->reject_error rather than the normal payload. Thus, the kernel oopses trying to read the user_key_payload from memory address (int)-ENOKEY = 0x00000000ffffff82. Fortunately the payload data is stored inline, so it shouldn't be possible to abuse this as an arbitrary memory read primitive... Reproducer: keyctl new_session keyctl request2 user desc '' @s keyctl read $(keyctl show | awk '/user: desc/ {print $1}') It causes a crash like the following: BUG: unable to handle kernel paging request at 00000000ffffff92 IP: user_read+0x33/0xa0 PGD 36a54067 P4D 36a54067 PUD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 211 Comm: keyctl Not tainted 4.14.0-rc1 #337 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 task: ffff90aa3b74c3c0 task.stack: ffff9878c0478000 RIP: 0010:user_read+0x33/0xa0 RSP: 0018:ffff9878c047bee8 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff90aa3d7da340 RCX: 0000000000000017 RDX: 0000000000000000 RSI: 00000000ffffff82 RDI: ffff90aa3d7da340 RBP: ffff9878c047bf00 R08: 00000024f95da94f R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f58ece69740(0000) GS:ffff90aa3e200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000ffffff92 CR3: 0000000036adc001 CR4: 00000000003606f0 Call Trace: keyctl_read_key+0xac/0xe0 SyS_keyctl+0x99/0x120 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x7f58ec787bb9 RSP: 002b:00007ffc8d401678 EFLAGS: 00000206 ORIG_RAX: 00000000000000fa RAX: ffffffffffffffda RBX: 00007ffc8d402800 RCX: 00007f58ec787bb9 RDX: 0000000000000000 RSI: 00000000174a63ac RDI: 000000000000000b RBP: 0000000000000004 R08: 00007ffc8d402809 R09: 0000000000000020 R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffc8d402800 R13: 00007ffc8d4016e0 R14: 0000000000000000 R15: 0000000000000000 Code: e5 41 55 49 89 f5 41 54 49 89 d4 53 48 89 fb e8 a4 b4 ad ff 85 c0 74 09 80 3d b9 4c 96 00 00 74 43 48 8b b3 20 01 00 00 4d 85 ed <0f> b7 5e 10 74 29 4d 85 e4 74 24 4c 39 e3 4c 89 e2 4c 89 ef 48 RIP: user_read+0x33/0xa0 RSP: ffff9878c047bee8 CR2: 00000000ffffff92 Fixes: 61ea0c0ba904 ("KEYS: Skip key state checks when checking for possession") Cc: [v3.13+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index aa1d11a29136..365ff85d7e27 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -766,6 +766,11 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); + if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { + ret = -ENOKEY; + goto error2; + } + /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) -- cgit From 8f674565d405a8c0b36ee531849df87f43e72ed5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:39 -0700 Subject: KEYS: reset parent each time before searching key_user_tree In key_user_lookup(), if there is no key_user for the given uid, we drop key_user_lock, allocate a new key_user, and search the tree again. But we failed to set 'parent' to NULL at the beginning of the second search. If the tree were to be empty for the second search, the insertion would be done with an invalid 'parent', scribbling over freed memory. Fortunately this can't actually happen currently because the tree always contains at least the root_key_user. But it still should be fixed to make the code more robust. Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/key.c b/security/keys/key.c index e5c0896c3a8f..eb914a838840 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -54,10 +54,10 @@ void __key_check(const struct key *key) struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; - struct rb_node *parent = NULL; - struct rb_node **p; + struct rb_node *parent, **p; try_again: + parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); -- cgit From 4aa68e07d845562561f5e73c04aa521376e95252 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:38:29 -0700 Subject: KEYS: restrict /proc/keys by credentials at open time When checking for permission to view keys whilst reading from /proc/keys, we should use the credentials with which the /proc/keys file was opened. This is because, in a classic type of exploit, it can be possible to bypass checks for the *current* credentials by passing the file descriptor to a suid program. Following commit 34dbbcdbf633 ("Make file credentials available to the seqfile interfaces") we can finally fix it. So let's do it. Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/proc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/security/keys/proc.c b/security/keys/proc.c index bf08d02b6646..de834309d100 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -187,7 +187,7 @@ static int proc_keys_show(struct seq_file *m, void *v) struct keyring_search_context ctx = { .index_key.type = key->type, .index_key.description = key->description, - .cred = current_cred(), + .cred = m->file->f_cred, .match_data.cmp = lookup_user_key_possessed, .match_data.raw_data = key, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, @@ -207,11 +207,7 @@ static int proc_keys_show(struct seq_file *m, void *v) } } - /* check whether the current task is allowed to view the key (assuming - * non-possession) - * - the caller holds a spinlock, and thus the RCU read lock, making our - * access to __current_cred() safe - */ + /* check whether the current task is allowed to view the key */ rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); if (rc < 0) return 0; -- cgit From e007ce9c59bddd1e67b94bc29036d920f5c5428a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:42 -0700 Subject: KEYS: use kmemdup() in request_key_auth_new() kmemdup() is preferred to kmalloc() followed by memcpy(). Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index e356075ed2f8..6ebf1af8fce9 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -163,9 +163,10 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, rka = kzalloc(sizeof(*rka), GFP_KERNEL); if (!rka) goto error; - rka->callout_info = kmalloc(callout_len, GFP_KERNEL); + rka->callout_info = kmemdup(callout_info, callout_len, GFP_KERNEL); if (!rka->callout_info) goto error_free_rka; + rka->callout_len = callout_len; /* see if the calling process is already servicing the key request of * another process */ @@ -196,8 +197,6 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, rka->target_key = key_get(target); rka->dest_keyring = key_get(dest_keyring); - memcpy(rka->callout_info, callout_info, callout_len); - rka->callout_len = callout_len; /* allocate the auth key */ sprintf(desc, "%x", target->serial); -- cgit From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 17:48:06 +0200 Subject: futex: Fix pi_state->owner serialization There was a reported suspicion about a race between exit_pi_state_list() and put_pi_state(). The same report mentioned the comment with put_pi_state() said it should be called with hb->lock held, and it no longer is in all places. As it turns out, the pi_state->owner serialization is indeed broken. As per the new rules: 734009e96d19 ("futex: Change locking rules") pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock. For the sites setting pi_state->owner we already hold wait_lock (where required) but exit_pi_state_list() and put_pi_state() were not and raced on clearing it. Fixes: 734009e96d19 ("futex: Change locking rules") Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net --- kernel/futex.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..0518a0bfc746 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state) /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. - * - * Must be called with the hb lock held. */ static void put_pi_state(struct futex_pi_state *pi_state) { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + struct task_struct *owner; - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) + if (current->pi_state_cache) { kfree(pi_state); - else { + } else { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); continue; } @@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock(&curr->pi_lock); get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ pi_state->owner = p; raw_spin_unlock_irq(&p->pi_lock); @@ -2878,6 +2888,7 @@ retry: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); put_pi_state(pi_state); -- cgit From 2827a418ca1b23e432e62c9b3d0e7cf3255dfe88 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 19 Sep 2017 22:04:12 +0200 Subject: genirq: Check __free_irq() return value for NULL __free_irq() can return a NULL irqaction for example when trying to free already-free IRQ, but the callsite unconditionally dereferences the returned pointer. Fix this by adding a check and return NULL. Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170919200412.GA29985@gmail.com --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) #endif action = __free_irq(irq, dev_id); + + if (!action) + return NULL; + devname = action->name; kfree(action); return devname; -- cgit From 02a4843618fb35f847cf8c31cd3893873aa0edde Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 13 Sep 2017 09:17:57 -0400 Subject: brd: fix overflow in __brd_direct_access The code in __brd_direct_access multiplies the pgoff variable by page size and divides it by 512. It can cause overflow on 32-bit architectures. The overflow happens if we create ramdisk larger than 4G and use it as a sparse device. This patch replaces multiplication and division with multiplication by the number of sectors per page. Reviewed-by: Dan Williams Signed-off-by: Mikulas Patocka Fixes: 1647b9b959c7 ("brd: add dax_operations support") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bbd0d186cfc0..2d7178f7754e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -342,7 +342,7 @@ static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, if (!brd) return -ENODEV; - page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); + page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT); if (!page) return -ENOSPC; *kaddr = page_address(page); -- cgit From f507b54dccfd8000c517d740bc45f20c74532d18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:35 +0200 Subject: bsg-lib: don't free job in bsg_prepare_job The job structure is allocated as part of the request, so we should not free it in the error path of bsg_prepare_job. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bsg-lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c82408c7cc3c..dbddff8174e5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -154,7 +154,6 @@ static int bsg_prepare_job(struct device *dev, struct request *req) failjob_rls_rqst_payload: kfree(job->request_payload.sg_list); failjob_rls_job: - kfree(job); return -ENOMEM; } -- cgit From 1dae69bedeeca0b57e441eae491fbd38049c0b47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 May 2017 22:25:18 -0400 Subject: nbd: ignore non-nbd ioctl's In testing we noticed that nbd would spew if you ran a fio job against the raw device itself. This is because fio calls a block device specific ioctl, however the block layer will first pass this back to the driver ioctl handler in case the driver wants to do something special. Since the device was setup using netlink this caused us to spew every time fio called this ioctl. Since we don't have special handling, just error out for any non-nbd specific ioctl's that come in. This fixes the spew. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2aa87cbdede0..3684e21d543f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1194,6 +1194,12 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* The block layer will pass back some non-nbd ioctls in case we have + * special handling for them, but we don't so just return an error. + */ + if (_IOC_TYPE(cmd) != 0xab) + return -EINVAL; + mutex_lock(&nbd->config_lock); /* Don't allow ioctl operations on a nbd device that was created with -- cgit From 5acb3cc2c2e9d3020a4fee43763c6463767f1572 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 20 Sep 2017 13:12:20 -0600 Subject: blktrace: Fix potential deadlock between delete & sysfs ops The lockdep code had reported the following unsafe locking scenario: CPU0 CPU1 ---- ---- lock(s_active#228); lock(&bdev->bd_mutex/1); lock(s_active#228); lock(&bdev->bd_mutex); *** DEADLOCK *** The deadlock may happen when one task (CPU1) is trying to delete a partition in a block device and another task (CPU0) is accessing tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that partition. The s_active isn't an actual lock. It is a reference count (kn->count) on the sysfs (kernfs) file. Removal of a sysfs file, however, require a wait until all the references are gone. The reference count is treated like a rwsem using lockdep instrumentation code. The fact that a thread is in the sysfs callback method or in the ioctl call means there is a reference to the opended sysfs or device file. That should prevent the underlying block structure from being removed. Instead of using bd_mutex in the block_device structure, a new blk_trace_mutex is now added to the request_queue structure to protect access to the blk_trace structure. Suggested-by: Christoph Hellwig Signed-off-by: Waiman Long Acked-by: Steven Rostedt (VMware) Fix typo in patch subject line, and prune a comment detailing how the code used to work. Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ include/linux/blkdev.h | 1 + kernel/trace/blktrace.c | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index aebe676225e6..048be4aa6024 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -854,6 +854,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) kobject_init(&q->kobj, &blk_queue_ktype); +#ifdef CONFIG_BLK_DEV_IO_TRACE + mutex_init(&q->blk_trace_mutex); +#endif mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 460294bb0fa5..02fa42d24b52 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace *blk_trace; + struct mutex blk_trace_mutex; #endif /* * for flush operations diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: -- cgit From e5313c141b49c1b1af43d1ca81398185d66ad1a6 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 20 Sep 2017 14:24:34 -0700 Subject: loop: remove union of use_aio and ref in struct loop_cmd When the request is completed, lo_complete_rq() checks cmd->use_aio. However, if this is in fact an aio request, cmd->use_aio will have already been reused as cmd->ref by lo_rw_aio*. Fix it by not using a union. On x86_64, there's a hole after the union anyways, so this doesn't make struct loop_cmd any bigger. Fixes: 92d773324b7e ("block/loop: fix use after free") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index f68c1d50802f..1f3956702993 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -67,10 +67,8 @@ struct loop_device { struct loop_cmd { struct kthread_work work; struct request *rq; - union { - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ - }; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ long ret; struct kiocb iocb; struct bio_vec *bvec; -- cgit From 56b7103a06083b8ce1160f8289460ba2f584e182 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:26 -0700 Subject: nvme-fc: remove use of FC-specific error codes The FC-NVME transport used the FC-specific error codes in cases where it had to fabricate an error to go back up stack. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d2e882c0f496..9100779b58c9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1376,7 +1376,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); else if (freq->status) - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); /* * For the linux implementation, if we have an unsuccesful @@ -1404,7 +1404,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) */ if (freq->transferred_length != be32_to_cpu(op->cmd_iu.data_len)) { - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } result.u64 = 0; @@ -1421,7 +1421,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || sqe->common.command_id != cqe->command_id)) { - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } result = cqe->result; @@ -1429,7 +1429,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } -- cgit From 29b3d26ecc8046838de88205b7c4b182ac27ff65 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:27 -0700 Subject: nvmet-fc: remove use of FC-specific error codes The FC-NVME target transport used the FC-specific error codes in return codes when the transport or lldd failed. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 421e43bf1dd7..088f07250d76 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1910,8 +1910,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, spin_lock_irqsave(&fod->flock, flags); fod->writedataactive = false; spin_unlock_irqrestore(&fod->flock, flags); - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { fcpreq->fcp_error = ret; fcpreq->transferred_length = 0; @@ -1929,8 +1928,7 @@ __nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort) /* if in the middle of an io and we need to tear down */ if (abort) { if (fcpreq->op == NVMET_FCOP_WRITEDATA) { - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return true; } @@ -1968,8 +1966,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) fod->abort = true; spin_unlock(&fod->flock); - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return; } -- cgit From fc9608e8b4dc3c2545fa0bc5d3eef158ca56ccf8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:28 -0700 Subject: nvmet-fcloop: remove use of FC-specific error codes The FC-NVME transport loopback test module used the FC-specific error codes in cases where it emulated a transport abort case. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1cb9847ec261..1fd1afbb8b2a 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -576,7 +576,7 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, tfcp_req->aborted = true; spin_unlock(&tfcp_req->reqlock); - tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED; + tfcp_req->status = NVME_SC_INTERNAL; /* * nothing more to do. If io wasn't active, the transport should -- cgit From 8e009ce84683fa124b23ff5cb7fd87c48b331b88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:29 -0700 Subject: lpfc: remove use of FC-specific error codes The lpfc driver uses the FC-specific error when it needed to return an error to the FC-NVME transport. Convert to use a generic value instead. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/lpfc/lpfc_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 79ba3ce063a4..23bdb1ca106e 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -884,7 +884,7 @@ out_err: wcqe->total_data_placed); nCmd->transferred_length = 0; nCmd->rcv_rsplen = 0; - nCmd->status = NVME_SC_FC_TRANSPORT_ERROR; + nCmd->status = NVME_SC_INTERNAL; } } -- cgit From 39a550d2d9eaee8b618084e6011441eac6a2a3b7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 11:30:15 -0700 Subject: qla2xxx: remove use of FC-specific error codes The qla2xxx driver uses the FC-specific error when it needed to return an error to the FC-NVME transport. Convert to use a generic value instead. Signed-off-by: James Smart Acked-by: Himanshu Madhani Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/qla2xxx/qla_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 1f59e7a74c7b..6b33a1f24f56 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -180,7 +180,7 @@ static void qla_nvme_sp_done(void *ptr, int res) goto rel; if (unlikely(res == QLA_FUNCTION_FAILED)) - fd->status = NVME_SC_FC_TRANSPORT_ERROR; + fd->status = NVME_SC_INTERNAL; else fd->status = 0; -- cgit From c98cb3bd882119e7e1a7c8df2f1eacfcc701450b Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:25 -0700 Subject: nvme.h: remove FC transport-specific error values The NVM express group recinded the reserved range for the transport. Remove the FC-centric values that had been defined. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 87723c86f136..2440be32be1d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1127,19 +1127,6 @@ enum { NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, - - - /* - * FC Transport-specific error status values for NVME commands - * - * Transport-specific status code values must be in the range 0xB0..0xBF - */ - - /* Generic FC failure - catchall */ - NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, - - /* I/O failure due to FC ABTS'd */ - NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { -- cgit From d85cf207499e6740ab9c490ff4f360af5c432d23 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:23 -0700 Subject: nvme: add transport SGL definitions Add transport SGL defintions from NVMe TP 4008, required for the final NVMe-FC standard. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2440be32be1d..9310ce77d8e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -471,12 +471,14 @@ enum nvme_opcode { * * @NVME_SGL_FMT_ADDRESS: absolute address of the data block * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation * request subtype */ enum { NVME_SGL_FMT_ADDRESS = 0x00, NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, NVME_SGL_FMT_INVALIDATE = 0x0f, }; @@ -490,12 +492,16 @@ enum { * * For struct nvme_keyed_sgl_desc: * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor */ enum { NVME_SGL_FMT_DATA_DESC = 0x00, NVME_SGL_FMT_SEG_DESC = 0x02, NVME_SGL_FMT_LAST_SEG_DESC = 0x03, NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, }; struct nvme_sgl_desc { -- cgit From d9d34c0b2327e85da0ad1476575264fe957fc6ef Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:24 -0700 Subject: nvme-fc: use transport-specific sgl format Sync with NVM Express spec change and FC-NVME 1.18. FC transport sets SGL type to Transport SGL Data Block Descriptor and subtype to transport-specific value 0x0A. Removed the warn-on's on the PRP fields. They are unneeded. They were to check for values from the upper layer that weren't set right, and for the most part were fine. But, with Async events, which reuse the same structure and 2nd time issued the SGL overlay converted them to the Transport SGL values - the warn-on's were errantly firing. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9100779b58c9..af075e998944 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1989,16 +1989,17 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * as well as those by FC-NVME spec. */ WARN_ON_ONCE(sqe->common.metadata); - WARN_ON_ONCE(sqe->common.dptr.prp1); - WARN_ON_ONCE(sqe->common.dptr.prp2); sqe->common.flags |= NVME_CMD_SGL_METABUF; /* - * format SQE DPTR field per FC-NVME rules - * type=data block descr; subtype=offset; - * offset is currently 0. + * format SQE DPTR field per FC-NVME rules: + * type=0x5 Transport SGL Data Block Descriptor + * subtype=0xA Transport-specific value + * address=0 + * length=length of the data series */ - sqe->rw.dptr.sgl.type = NVME_SGL_FMT_OFFSET; + sqe->rw.dptr.sgl.type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; sqe->rw.dptr.sgl.length = cpu_to_le32(data_len); sqe->rw.dptr.sgl.addr = 0; -- cgit From deb61742e060d4447712598bc11bb50f8b2e51dd Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 11 Sep 2017 16:16:53 -0700 Subject: nvmet-fc: fix failing max io queue connections fc transport is treating NVMET_NR_QUEUES as maximum queue count, e.g. admin queue plus NVMET_NR_QUEUES-1 io queues. But NVMET_NR_QUEUES is the number of io queues, so maximum queue count is really NVMET_NR_QUEUES+1. Fix the handling in the target fc transport Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 088f07250d76..c48c83d97e30 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -148,7 +148,7 @@ struct nvmet_fc_tgt_assoc { u32 a_id; struct nvmet_fc_tgtport *tgtport; struct list_head a_list; - struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES]; + struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; }; @@ -608,7 +608,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, unsigned long flags; int ret; - if (qid >= NVMET_NR_QUEUES) + if (qid > NVMET_NR_QUEUES) return NULL; queue = kzalloc((sizeof(*queue) + @@ -888,7 +888,7 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) int i; spin_lock_irqsave(&tgtport->lock, flags); - for (i = NVMET_NR_QUEUES - 1; i >= 0; i--) { + for (i = NVMET_NR_QUEUES; i >= 0; i--) { queue = assoc->queues[i]; if (queue) { if (!nvmet_fc_tgt_q_get(queue)) -- cgit From 161b8be2bd6abad250d4b3f674bdd5480f15beeb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 14 Sep 2017 13:54:39 -0400 Subject: nvme-pci: initialize queue memory before interrupts A spurious interrupt before the nvme driver has initialized the completion queue may inadvertently cause the driver to believe it has a completion to process. This may result in a NULL dereference since the nvmeq's tags are not set at this point. The patch initializes the host's CQ memory so that a spurious interrupt isn't mistaken for a real completion. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4a2121335f48..004018c5dccc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1313,11 +1313,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_cq; + nvme_init_queue(nvmeq, qid); result = queue_request_irq(nvmeq); if (result < 0) goto release_sq; - nvme_init_queue(nvmeq, qid); return result; release_sq: @@ -1464,6 +1464,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) return result; nvmeq->cq_vector = 0; + nvme_init_queue(nvmeq, 0); result = queue_request_irq(nvmeq); if (result) { nvmeq->cq_vector = -1; @@ -2156,7 +2157,6 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto out; -- cgit From d08774738446e77734777adcf5d1045237b4475a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 15 Sep 2017 13:05:38 -0400 Subject: nvme-pci: Print invalid SGL only once The WARN_ONCE macro returns true if the condition is true, not if the warn was raised, so we're printing the scatter list every time it's invalid. This is excessive and makes debugging harder, so this patch prints it just once. Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 004018c5dccc..cb73bc8cad3b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -540,6 +541,20 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif +static void nvme_print_sgl(struct scatterlist *sgl, int nents) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + dma_addr_t phys = sg_phys(sg); + pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " + "dma_address:%pad dma_length:%d\n", + i, &phys, sg->offset, sg->length, &sg_dma_address(sg), + sg_dma_len(sg)); + } +} + static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -622,19 +637,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; bad_sgl: - if (WARN_ONCE(1, "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->nents)) { - for_each_sg(iod->sg, sg, iod->nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", i, &phys, - sg->offset, sg->length, - &sg_dma_address(sg), - sg_dma_len(sg)); - } - } + WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), + "Invalid SGL for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), iod->nents); return BLK_STS_IOERR; - } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, -- cgit From cd48282cc736377d5abf7c04de8c6ba864ba3794 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 11:03:09 -0700 Subject: nvme: stop aer posting if controller state not live If an nvme async_event command completes, in most cases, a new async event is posted. However, if the controller enters a resetting or reconnecting state, there is nothing to block the scheduled work element from posting the async event again. Nor are there calls from the transport to stop async events when an association dies. In the case of FC, where the association is torn down, the aer must be aborted on the FC link and completes through the normal job completion path. Thus the terminated async event ends up being rescheduled even though the controller isn't in a valid state for the aer, and the reposting gets the transport into a partially torn down data structure. It's possible to hit the scenario on rdma, although much less likely due to an aer completing right as the association is terminated and as the association teardown reclaims the blk requests via nvme_cancel_request() so its immediate, not a link-related action like on FC. Fix by putting controller state checks in both the async event completion routine where it schedules the async event and in the async event work routine before it calls into the transport. It's effectively a "stop_async_events()" behavior. The transport, when it creates a new association with the subsystem will transition the state back to live and is already restarting the async event posting. Signed-off-by: James Smart [hch: remove taking a lock over reading the controller state] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index acc816b67582..d470f031e27f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2590,7 +2590,7 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); spin_lock_irq(&ctrl->lock); - while (ctrl->event_limit > 0) { + while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { int aer_idx = --ctrl->event_limit; spin_unlock_irq(&ctrl->lock); @@ -2677,7 +2677,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, /*FALLTHRU*/ case NVME_SC_ABORT_REQ: ++ctrl->event_limit; - queue_work(nvme_wq, &ctrl->async_event_work); + if (ctrl->state == NVME_CTRL_LIVE) + schedule_work(&ctrl->async_event_work); break; default: break; -- cgit From 0951338d9677f546e230685d68631dfd3f81cca5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:18:04 -0700 Subject: nvme: allow timed-out ios to retry Currently the nvme_req_needs_retry() applies several checks to see if a retry is allowed. On of those is whether the current time has exceeded the start time of the io plus the timeout length. This check, if an io times out, means there is never a retry allowed for the io. Which means applications see the io failure. Remove this check and allow the io to timeout, like it does on other protocols, and retries to be made. On the FC transport, a frame can be lost for an individual io, and there may be no other errors that escalate for the connection/association. The io will timeout, which causes the transport to escalate into creating a new association, but the io that timed out, due to this retry logic, has already failed back to the application and things are hosed. Signed-off-by: James Smart Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d470f031e27f..5589f67d2cd8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -134,8 +134,6 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->status & NVME_SC_DNR) return false; - if (jiffies - req->start_time >= req->timeout) - return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; return true; -- cgit From 8edd11c9ad3a6205eea6de9d02eaf64c681a0658 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 14 Sep 2017 13:59:28 -0300 Subject: nvme-fabrics: Allow 0 as KATO value Currently, driver code allows user to set 0 as KATO (Keep Alive TimeOut), but this is not being respected. This patch enforces the expected behavior. Signed-off-by: Guilherme G. Piccoli Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fabrics.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 47307752dc65..555c976cc2ee 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -565,6 +565,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->queue_size = NVMF_DEF_QUEUE_SIZE; opts->nr_io_queues = num_online_cpus(); opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + opts->kato = NVME_DEFAULT_KATO; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -655,21 +656,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } - if (opts->discovery_nqn) { - pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n"); - ret = -EINVAL; - goto out; - } - if (token < 0) { pr_err("Invalid keep_alive_tmo %d\n", token); ret = -EINVAL; goto out; - } else if (token == 0) { + } else if (token == 0 && !opts->discovery_nqn) { /* Allowed for debug */ pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); } opts->kato = token; + + if (opts->discovery_nqn && opts->kato) { + pr_err("Discovery controllers cannot accept KATO != 0\n"); + ret = -EINVAL; + goto out; + } + break; case NVMF_OPT_CTRL_LOSS_TMO: if (match_int(args, &token)) { @@ -762,8 +764,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, uuid_copy(&opts->host->id, &hostid); out: - if (!opts->discovery_nqn && !opts->kato) - opts->kato = NVME_DEFAULT_KATO; kfree(options); return ret; } -- cgit From bb1cc74790eb51f52d23c6e5fd9a3bb16030c3d8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 18 Sep 2017 09:08:29 -0700 Subject: nvmet: implement valid sqhd values in completions To support sqhd, for initiators that are following the spec and paying attention to sqhd vs their sqtail values: - add sqhd to struct nvmet_sq - initialize sqhd to 0 in nvmet_sq_setup - rather than propagate the 0's-based qsize value from the connect message which requires a +1 in every sqhd update, and as nothing else references it, convert to 1's-based value in nvmt_sq/cq_setup() calls. - validate connect message sqsize being non-zero per spec. - updated assign sqhd for every completion that goes back. Also remove handling the NULL sq case in __nvmet_req_complete, as it can't happen with the current code. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 8 ++++---- drivers/nvme/target/fabrics-cmd.c | 9 +++++++-- drivers/nvme/target/nvmet.h | 1 + 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7c23eaf8e563..c2a768a94235 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -390,10 +390,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (status) nvmet_set_status(req, status); - /* XXX: need to fill in something useful for sq_head */ - req->rsp->sq_head = 0; - if (likely(req->sq)) /* may happen during early failure */ - req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; + req->rsp->sq_head = cpu_to_le16(req->sq->sqhd); + req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; if (req->ns) @@ -420,6 +419,7 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size) { + sq->sqhd = 0; sq->qid = qid; sq->size = size; diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 859a66725291..db3bf6b8bf9e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,9 +109,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) pr_warn("queue already connected!\n"); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } + if (!sqsize) { + pr_warn("queue size zero!\n"); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } - nvmet_cq_setup(ctrl, req->cq, qid, sqsize); - nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + /* note: convert queue size from 0's-based value to 1's-based value */ + nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); return 0; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7d261ab894f4..7b8e20adf760 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -74,6 +74,7 @@ struct nvmet_sq { struct percpu_ref ref; u16 qid; u16 size; + u16 sqhd; struct completion free_done; struct completion confirm_done; }; -- cgit From 332391a9935da939319e473b4680e173df75afcf Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 21 Sep 2017 08:16:29 -0600 Subject: fs: Fix page cache inconsistency when mixing buffered and AIO DIO Currently when mixing buffered reads and asynchronous direct writes it is possible to end up with the situation where we have stale data in the page cache while the new data is already written to disk. This is permanent until the affected pages are flushed away. Despite the fact that mixing buffered and direct IO is ill-advised it does pose a thread for a data integrity, is unexpected and should be fixed. Fix this by deferring completion of asynchronous direct writes to a process context in the case that there are mapped pages to be found in the inode. Later before the completion in dio_complete() invalidate the pages in question. This ensures that after the completion the pages in the written area are either unmapped, or populated with up-to-date data. Also do the same for the iomap case which uses iomap_dio_complete() instead. This has a side effect of deferring the completion to a process context for every AIO DIO that happens on inode that has pages mapped. However since the consensus is that this is ill-advised practice the performance implication should not be a problem. This was based on proposal from Jeff Moyer, thanks! Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Reviewed-by: Jeff Moyer Signed-off-by: Lukas Czerner Signed-off-by: Jens Axboe --- fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ fs/iomap.c | 29 ++++++++++++++++------------- mm/filemap.c | 10 ++++++++-- 3 files changed, 67 insertions(+), 21 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 5fa2211e49ae..62cf812ed0e5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; + int err; /* * AIO submission can race with bio completion to get here while @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (ret > 0 && dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages) { + err = invalidate_inode_pages2_range(dio->inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + ret - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + if (dio->end_io) { - int err; // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio) struct dio *dio = bio->bi_private; unsigned long remaining; unsigned long flags; + bool defer_completion = false; /* cleanup the bio */ dio_bio_complete(dio, bio); @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - if (dio->result && dio->defer_completion) { + /* + * Defer completion when defer_completion is set or + * when the inode has pages mapped and this is AIO write. + * We need to invalidate those pages because there is a + * chance they contain stale data in the case buffered IO + * went in between AIO submission and completion into the + * same region. + */ + if (dio->result) + defer_completion = dio->defer_completion || + (dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages); + if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * For AIO O_(D)SYNC writes we need to defer completions to a workqueue * so that we can call ->fsync. */ - if (dio->is_async && iov_iter_rw(iter) == WRITE && - ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host))) { - retval = dio_set_defer_completion(dio); + if (dio->is_async && iov_iter_rw(iter) == WRITE) { + retval = 0; + if ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host)) + retval = dio_set_defer_completion(dio); + else if (!dio->inode->i_sb->s_dio_done_wq) { + /* + * In case of AIO write racing with buffered read we + * need to defer completion. We can't decide this now, + * however the workqueue needs to be initialized here. + */ + retval = sb_init_dio_done_wq(dio->inode->i_sb); + } if (retval) { /* * We grab i_mutex only for reads so we don't have diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..8194d30bdca0 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -713,8 +713,24 @@ struct iomap_dio { static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + if (dio->end_io) { ret = dio->end_io(iocb, dio->error ? dio->error : dio->size, @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ret = iomap_dio_complete(dio); - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - */ - if (iov_iter_rw(iter) == WRITE) { - int err = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(err); - } - return ret; out_free_dio: diff --git a/mm/filemap.c b/mm/filemap.c index 870971e20967..db250d0e0565 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... + * + * Most of the time we do not need this since dio_complete() will do + * the invalidation for us. However there are some file systems that + * do not end up with dio_complete() being called, so let's not break + * them by removing it completely */ - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (mapping->nrpages) + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); if (written > 0) { pos += written; -- cgit From f5c156c4c29a3d87176dd6e5c099388e187ec29b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 12:17:16 -0700 Subject: block: fix a crash caused by wrong API part_stat_show takes a part device not a disk, so we should use part_to_disk. Fixes: d62e26b3ffd2("block: pass in queue to inflight accounting") Cc: Bart Van Assche Cc: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/partition-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 86e8fe1adcdb..88c555db4e5d 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -112,7 +112,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = dev_to_disk(dev)->queue; + struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; int cpu; -- cgit From 9789e7e93f2b892098d7684ac8131092aa617814 Mon Sep 17 00:00:00 2001 From: Mengting Zhang Date: Sat, 23 Sep 2017 16:18:14 +0800 Subject: perf report: Fix debug messages with --call-graph option With --call-graph option, perf report can display call chains using type, min percent threshold, optional print limit and order. And the default call-graph parameter is 'graph,0.5,caller,function,percent'. Before this patch, 'perf report --call-graph' shows incorrect debug messages as below: # perf report --call-graph Invalid callchain mode: 0.5 Invalid callchain order: 0.5 Invalid callchain sort key: 0.5 Invalid callchain config key: 0.5 Invalid callchain mode: caller Invalid callchain mode: function Invalid callchain order: function Invalid callchain mode: percent Invalid callchain order: percent Invalid callchain sort key: percent That is because in function __parse_callchain_report_opt(),each field of the call-graph parameter is passed to parse_callchain_{mode,order, sort_key,value} in turn until it meets the matching value. For example, the order field "caller" is passed to parse_callchain_mode() firstly and obviously it doesn't match any mode field. Therefore parse_callchain_mode() will shows the debug message "Invalid callchain mode: caller", which could confuse users. The patch fixes this issue by moving the warning out of the function parse_callchain_{mode,order,sort_key,value}. Signed-off-by: Mengting Zhang Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Krister Johansen Cc: Li Bin Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Cc: Yao Jin Link: http://lkml.kernel.org/r/1506154694-39691-1-git-send-email-zhangmengting@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 510b513e0f01..be09d77cade0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -65,8 +65,6 @@ static int parse_callchain_mode(const char *value) callchain_param.mode = CHAIN_FOLDED; return 0; } - - pr_err("Invalid callchain mode: %s\n", value); return -1; } @@ -82,8 +80,6 @@ static int parse_callchain_order(const char *value) callchain_param.order_set = true; return 0; } - - pr_err("Invalid callchain order: %s\n", value); return -1; } @@ -105,8 +101,6 @@ static int parse_callchain_sort_key(const char *value) callchain_param.branch_callstack = 1; return 0; } - - pr_err("Invalid callchain sort key: %s\n", value); return -1; } @@ -124,8 +118,6 @@ static int parse_callchain_value(const char *value) callchain_param.value = CCVAL_COUNT; return 0; } - - pr_err("Invalid callchain config key: %s\n", value); return -1; } @@ -319,12 +311,27 @@ int perf_callchain_config(const char *var, const char *value) return ret; } - if (!strcmp(var, "print-type")) - return parse_callchain_mode(value); - if (!strcmp(var, "order")) - return parse_callchain_order(value); - if (!strcmp(var, "sort-key")) - return parse_callchain_sort_key(value); + if (!strcmp(var, "print-type")){ + int ret; + ret = parse_callchain_mode(value); + if (ret == -1) + pr_err("Invalid callchain mode: %s\n", value); + return ret; + } + if (!strcmp(var, "order")){ + int ret; + ret = parse_callchain_order(value); + if (ret == -1) + pr_err("Invalid callchain order: %s\n", value); + return ret; + } + if (!strcmp(var, "sort-key")){ + int ret; + ret = parse_callchain_sort_key(value); + if (ret == -1) + pr_err("Invalid callchain sort key: %s\n", value); + return ret; + } if (!strcmp(var, "threshold")) { callchain_param.min_percent = strtod(value, &endptr); if (value == endptr) { -- cgit From 090657c9fb7094e4c1b05c1713d6c2a12ef43dea Mon Sep 17 00:00:00 2001 From: Akemi Yagi Date: Fri, 22 Sep 2017 22:11:53 +0000 Subject: perf tools: Fix syscalltbl build failure The build of kernel v4.14-rc1 for i686 fails on RHEL 6 with the error in tools/perf: util/syscalltbl.c:157: error: expected ';', ',' or ')' before '__maybe_unused' mv: cannot stat `util/.syscalltbl.o.tmp': No such file or directory Fix it by placing/moving: #include outside of #ifdef HAVE_SYSCALL_TABLE block. Signed-off-by: Akemi Yagi Cc: Alan Bartlett Link: http://lkml.kernel.org/r/oq41r8$1v9$1@blaine.gmane.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/syscalltbl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 19e5db90394c..6eea7cff3d4e 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -15,9 +15,9 @@ #include "syscalltbl.h" #include +#include #ifdef HAVE_SYSCALL_TABLE -#include #include #include "string2.h" #include "util.h" -- cgit From 78b1beb0998437107ed144b341fbe1252188916b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 24 Sep 2017 21:46:29 +0300 Subject: IB/core: Fix typo in the name of the tag-matching cap struct The tag matching functionality is implemented by mlx5 driver by extending XRQ, however this internal kernel information was exposed to user space applications with *xrq* name instead of *tm*. This patch renames *xrq* to *tm* to handle that. Fixes: 8d50505ada72 ("IB/uverbs: Expose XRQ capabilities") Signed-off-by: Leon Romanovsky Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 14 +++++++------- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- include/rdma/ib_verbs.h | 4 ++-- include/uapi/rdma/ib_user_verbs.h | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4ab30d832ac5..52a2cf2d83aa 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3869,15 +3869,15 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); - if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps)) goto end; - resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; - resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; - resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; - resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; - resp.xrq_caps.flags = attr.xrq_caps.flags; - resp.response_length += sizeof(resp.xrq_caps); + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.response_length += sizeof(resp.tm_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 05fb4bdff6a0..d6fbad8f34aa 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -778,13 +778,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->xrq_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; - props->xrq_caps.max_num_tags = + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->xrq_caps.flags = IB_TM_CAP_RC; - props->xrq_caps.max_ops = + props->tm_caps.flags = IB_TM_CAP_RC; + props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); - props->xrq_caps.max_sge = MLX5_TM_MAX_SGE; + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bdb1279a415b..bbb5f54db882 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -285,7 +285,7 @@ enum ib_tm_cap_flags { IB_TM_CAP_RC = 1 << 0, }; -struct ib_xrq_caps { +struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ @@ -358,7 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ - struct ib_xrq_caps xrq_caps; + struct ib_tm_caps tm_caps; }; enum ib_mtu { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 9a0b6479fe0c..d4e0b53bfc75 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -261,7 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; - struct ib_uverbs_tm_caps xrq_caps; + struct ib_uverbs_tm_caps tm_caps; }; struct ib_uverbs_query_port { -- cgit From 73827a605bbd7cebef4cfd1261e497246a82a0e7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:30 +0300 Subject: IB/core: Fix qp_sec use after free access When security_ib_alloc_security fails, qp->qp_sec memory is freed. However ib_destroy_qp still tries to access this memory which result in kernel crash. So its initialized to NULL to avoid such access. Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/security.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 70ad19c4c73e..88bdafb297f5 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -432,8 +432,10 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) atomic_set(&qp->qp_sec->error_list_count, 0); init_completion(&qp->qp_sec->error_complete); ret = security_ib_alloc_security(&qp->qp_sec->security); - if (ret) + if (ret) { kfree(qp->qp_sec); + qp->qp_sec = NULL; + } return ret; } -- cgit From edd31551148c09608feee6b8756ad148d550ee3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:31 +0300 Subject: IB: Correct MR length field to be 64-bit The ib_mr->length represents the length of the MR in bytes as per the IBTA spec 1.3 section 11.2.10.3 (REGISTER PHYSICAL MEMORY REGION). Currently ib_mr->length field is defined as only 32-bits field. This might result into truncation and failed WRs of consumers who registers more than 4GB bytes memory regions and whose WRs accessing such MRs. This patch makes the length 64-bit to avoid such truncation. Cc: Sagi Grimberg Cc: Chuck Lever Cc: Faisal Latif Fixes: 4c67e2bfc8b7 ("IB/core: Introduce new fast registration API") Signed-off-by: Ilya Lesokhin Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_verbs.c | 4 ++-- drivers/infiniband/ulp/iser/iser_memory.c | 2 +- include/rdma/ib_verbs.h | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f0dc5f4aa177..442b9bdc0f03 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3232,7 +3232,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->ibmr.iova); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, - mr->ibmr.length); + lower_32_bits(mr->ibmr.length)); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); set_wqe_32bit_value(wqe->wqe_words, @@ -3274,7 +3274,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->npages * 8); nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " - "length: %d, rkey: %0x, pgl_paddr: %llx, " + "length: %lld, rkey: %0x, pgl_paddr: %llx, " "page_list_len: %u, wqe_misc: %x\n", (unsigned long long) mr->ibmr.iova, mr->ibmr.length, diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9c3e9ab53a41..322209d5ff58 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -154,7 +154,7 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec npages %d data length %d\n", + iser_err("page vec npages %d data length %lld\n", page_vec->npages, page_vec->fake_mr.length); for (i = 0; i < page_vec->npages; i++) iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bbb5f54db882..e8608b2dc844 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1739,7 +1739,7 @@ struct ib_mr { u32 lkey; u32 rkey; u64 iova; - u32 length; + u64 length; unsigned int page_size; bool need_inval; union { diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a3..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); -- cgit From 9c6f42e9254150d2772242d9f8bd8d0b7b7431ff Mon Sep 17 00:00:00 2001 From: Shalom Lagziel Date: Sun, 24 Sep 2017 21:46:32 +0300 Subject: IB/ipoib: Fix sysfs Pkey create<->remove possible deadlock A possible ABBA lock can happen with RTNL and vlan_rwsem. For example: Flow A: Device Flush __ipoib_ib_dev_flush down_read(vlan_rwsem) // Lock A ipoib_flush_ah flush_workqueue(priv->wq) // Wait for completion A work on shared WQ (Mcast carrier) ipoib_mcast_carrier_on_task while (!rtnl_trylock()) // Wait for lock B Flow B: Sysfs PKEY delete ipoib_vlan_delete lock(RTNL) // Lock B down_write(vlan_rwsem) // Wait for lock A This can happen with PKEY creates as well. The solution is to release the RTNL lock in sysfs functions in case it is not possible to lock VLAN RW semaphore and reset the SYS call. Fixes: 69956d83267e ("IB/ipoib: Sync between remove_one to sysfs calls that use rtnl_lock") Signed-off-by: Shalom Lagziel Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 9927cd6b7082..e01c58edca15 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -141,14 +141,17 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); - if (!priv) { + if (!down_write_trylock(&ppriv->vlan_rwsem)) { rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - return -ENOMEM; + return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (!priv) { + result = -ENOMEM; + goto out; + } /* * First ensure this isn't a duplicate. We check the parent device and @@ -175,7 +178,7 @@ out: rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - if (result) { + if (result && priv) { free_netdev(priv->dev); kfree(priv); } @@ -204,7 +207,12 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + if (!down_write_trylock(&ppriv->vlan_rwsem)) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { -- cgit From 7c9d9662103ae1c11acc7bfc47d988466cff23cf Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 24 Sep 2017 21:46:33 +0300 Subject: IB/ipoib: Fix inconsistency with free_netdev and free_rdma_netdev Call free_rdma_netdev instead of free_netdev each time we want to release a netdevice. This call is also relevant for future freeing of offloaded child interfaces. This patch also adds a missing call for free netdevice when releasing a parent interface that has child interfaces using ipoib_remove_one. Fixes: cd565b4b51e5 ('IB/IPoIB: Support acceleration options callbacks') Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 15 +++++++++++---- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 10 ++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index bac95b509a9b..dcc77014018d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2180,6 +2180,7 @@ static struct net_device *ipoib_add_port(const char *format, { struct ipoib_dev_priv *priv; struct ib_port_attr attr; + struct rdma_netdev *rn; int result = -ENOMEM; priv = ipoib_intf_alloc(hca, port, format); @@ -2279,7 +2280,8 @@ register_failed: ipoib_dev_cleanup(priv->dev); device_init_failed: - free_netdev(priv->dev); + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); alloc_mem_failed: @@ -2328,7 +2330,7 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { - struct rdma_netdev *rn = netdev_priv(priv->dev); + struct rdma_netdev *parent_rn = netdev_priv(priv->dev); ib_unregister_event_handler(&priv->event_handler); flush_workqueue(ipoib_workqueue); @@ -2350,10 +2352,15 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) unregister_netdev(priv->dev); mutex_unlock(&priv->sysfs_mutex); - rn->free_rdma_netdev(priv->dev); + parent_rn->free_rdma_netdev(priv->dev); + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + struct rdma_netdev *child_rn; - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) + child_rn = netdev_priv(cpriv->dev); + child_rn->free_rdma_netdev(cpriv->dev); kfree(cpriv); + } kfree(priv); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index e01c58edca15..55a9b71ed05a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -179,7 +179,10 @@ out: mutex_unlock(&ppriv->sysfs_mutex); if (result && priv) { - free_netdev(priv->dev); + struct rdma_netdev *rn; + + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); } @@ -232,7 +235,10 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) mutex_unlock(&ppriv->sysfs_mutex); if (dev) { - free_netdev(dev); + struct rdma_netdev *rn; + + rn = netdev_priv(dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); return 0; } -- cgit From d67bc5d4e3e100d762c0f57ea67f28bc219698a6 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:34 +0300 Subject: IB/mlx5: Simplify mlx5_ib_cont_pages The patch simplifies mlx5_ib_cont_pages and fixes the following issues in the original implementation: First issues is related to alignment of the PFNs. After the check base + p != PFN, the alignment of the PFN wasn't checked. So the PFN sequence 0, 1, 1, 2 would result in a page_shift of 13 even though the 3rd PFN is not 8KB aligned. This wasn't actually a bug because it was supported by all the existing mlx5 compatible device, but we don't want to require this support in all future devices. Another issue is because the inner loop didn't advance PFN so the test "if (base + p != pfn)" always failed for SGE with len > (1< Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mem.c | 47 +++++++++++++++------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 914f212e7ef6..f3dbd75a0a96 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -50,13 +50,9 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, { unsigned long tmp; unsigned long m; - int i, k; - u64 base = 0; - int p = 0; - int skip; - int mask; - u64 len; - u64 pfn; + u64 base = ~0, p = 0; + u64 len, pfn; + int i = 0; struct scatterlist *sg; int entry; unsigned long page_shift = umem->page_shift; @@ -76,33 +72,24 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, m = find_first_bit(&tmp, BITS_PER_LONG); if (max_page_shift) m = min_t(unsigned long, max_page_shift - page_shift, m); - skip = 1 << m; - mask = skip - 1; - i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> page_shift; pfn = sg_dma_address(sg) >> page_shift; - for (k = 0; k < len; k++) { - if (!(i & mask)) { - tmp = (unsigned long)pfn; - m = min_t(unsigned long, m, find_first_bit(&tmp, BITS_PER_LONG)); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } else { - if (base + p != pfn) { - tmp = (unsigned long)p; - m = find_first_bit(&tmp, BITS_PER_LONG); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } - } - p++; - i++; + if (base + p != pfn) { + /* If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; } + + p += len; + i += len; } if (i) { -- cgit From fbcd49838d9094ca45772356e7b33afe4b7c93e7 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:35 +0300 Subject: IB/mlx5: Fix NULL deference on mlx5_ib_update_xlt failure mlx5_ib_reg_user_mr called mlx5_ib_dereg_mr in case of MR population failure. This resulted in a NULL dereference as ibmr->device wasn't initialized yet. We address this by adding an internal dereg_mr function that can handle partially initialized MRs, and fixing clean_mr to work on partially initialized MRs. Fixes: ff740aefecb9 ("IB/mlx5: Decouple MR allocation and population flows") Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0e2789d9bb4d..37bbc543847a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -47,7 +47,8 @@ enum { #define MLX5_UMR_ALIGN 2048 -static int clean_mr(struct mlx5_ib_mr *mr); +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); @@ -1270,8 +1271,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, update_xlt_flags); + if (err) { - mlx5_ib_dereg_mr(&mr->ibmr); + dereg_mr(dev, mr); return ERR_PTR(err); } } @@ -1356,7 +1358,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); if (err < 0) { - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1410,7 +1412,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); ib_umem_release(mr->umem); - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1469,9 +1471,8 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } -static int clean_mr(struct mlx5_ib_mr *mr) +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int allocated_from_cache = mr->allocated_from_cache; int err; @@ -1507,10 +1508,8 @@ static int clean_mr(struct mlx5_ib_mr *mr) return 0; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); int npages = mr->npages; struct ib_umem *umem = mr->umem; @@ -1539,7 +1538,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) } #endif - clean_mr(mr); + clean_mr(dev, mr); if (umem) { ib_umem_release(umem); @@ -1549,6 +1548,14 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + + return dereg_mr(dev, mr); +} + struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) -- cgit From fe59493240169a2cc3f445ae5f2a2308fda06b63 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:29:15 +0200 Subject: PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build If CONFIG_PCI=n and gcc (e.g. 4.1.2) decides not to inline get_pci_function_alias_group(), the build fails with: drivers/iommu/iommu.o: In function `get_pci_function_alias_group': iommu.c:(.text+0xfdc): undefined reference to `pci_acs_enabled' Due to the various dummies for PCI calls in the CONFIG_PCI=n case, pci_acs_enabled() never called, but not all versions of gcc are smart enough to realize that. While explicitly marking get_pci_function_alias_group() inline would fix the build, this would inflate the code for the CONFIG_PCI=y case, as get_pci_function_alias_group() is a not-so-small function called from two places. Hence fix the issue by introducing a dummy for pci_acs_enabled() instead. Fixes: 0ae349a0f33f ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index f68c58a93dd0..f4f8ee5a7362 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1685,6 +1685,8 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #define dev_is_pci(d) (false) #define dev_is_pf(d) (false) +static inline bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags) +{ return false; } #endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ -- cgit From 9c3340ea7f5dabf88ca096a917cb0ab1f208ef2a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 19:11:07 -0600 Subject: selftests: futex: copy sub-dir test scripts for make O=dir run For make O=dir run_tests to work, test scripts from sub-directories need to be copied over to the object directory. Running tests from the object directory is necessary to avoid making the source tree dirty. Signed-off-by: Shuah Khan Reviewed-by: Darren Hart (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 7c647f619d63..9358cb210fd5 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -11,10 +11,13 @@ all: BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + if [ -e $$DIR/$(TEST_PROGS) ]; then + rsync -a $$DIR/$(TEST_PROGS) $$BUILD_TARGET/; + fi done override define RUN_TESTS - $(OUTPUT)/run.sh + cd $(OUTPUT); ./run.sh endef override define INSTALL_RULE -- cgit From 8230b905a6780c60372bf5df7bf5f23041ca3196 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 12 Sep 2017 08:52:13 -0600 Subject: selftests: mqueue: Use full path to run tests from Makefile Use full path including $(OUTPUT) to run tests from Makefile for normal case when objects reside in the source tree as well as when objects are relocated with make O=dir. In both cases $(OUTPUT) will be set correctly by lib.mk. Signed-off-by: Shuah Khan --- tools/testing/selftests/mqueue/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile index 79a664aeb8d7..0f5e347b068d 100644 --- a/tools/testing/selftests/mqueue/Makefile +++ b/tools/testing/selftests/mqueue/Makefile @@ -5,8 +5,8 @@ TEST_GEN_PROGS := mq_open_tests mq_perf_tests include ../lib.mk override define RUN_TESTS - @./mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" - @./mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" + $(OUTPUT)/mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" + $(OUTPUT)//mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" endef override define EMIT_TESTS -- cgit From 1ede053632f6380d7e1dba4d781e5eb78621aa3a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 18 Sep 2017 17:30:50 -0600 Subject: selftests: Makefile: fix for loops in targets to run silently Fix for loops in targets to run silently to avoid cluttering the test results. Suppresses the following from targets: e.g run from breakpoints for TARGET in breakpoints; do \ BUILD_TARGET=$BUILD/$TARGET; \ mkdir $BUILD_TARGET -p; \ make OUTPUT=$BUILD_TARGET -C $TARGET;\ done; Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f4368db011ea..ff805643b5f7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -66,32 +66,32 @@ endif export BUILD all: - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_tests: all - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\ done; hotplug: - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_hotplug: hotplug - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\ done; clean_hotplug: - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; @@ -107,7 +107,7 @@ install: ifdef INSTALL_PATH @# Ask all targets to install their files mkdir -p $(INSTALL_PATH) - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ done; @@ -132,7 +132,7 @@ else endif clean: - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; -- cgit From 659dbfd8c47adeb03f401d1a1f17091bb63cc5a2 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 18 Sep 2017 18:46:23 -0600 Subject: selftests: futex: Makefile: fix for loops in targets to run silently Fix for loops in targets to run silently to avoid cluttering the test results. Suppresses the following from targets: for DIR in functional; do \ BUILD_TARGET=./tools/testing/selftests/futex/$DIR; \ mkdir $BUILD_TARGET -p; \ make OUTPUT=$BUILD_TARGET -C $DIR all;\ done ./tools/testing/selftests/futex/run.sh Signed-off-by: Shuah Khan Reviewed-by: Darren Hart (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 9358cb210fd5..f0c0369ccb79 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run.sh include ../lib.mk all: - for DIR in $(SUBDIRS); do \ + @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ @@ -17,7 +17,7 @@ all: done override define RUN_TESTS - cd $(OUTPUT); ./run.sh + @cd $(OUTPUT); ./run.sh endef override define INSTALL_RULE @@ -36,7 +36,7 @@ override define EMIT_TESTS endef override define CLEAN - for DIR in $(SUBDIRS); do \ + @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ -- cgit From 10859f3855db4c6f10dc7974ff4b3a292f3de8e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Sep 2017 16:32:46 -0700 Subject: selftests/seccomp: Support glibc 2.26 siginfo_t.h The 2.26 release of glibc changed how siginfo_t is defined, and the earlier work-around to using the kernel definition are no longer needed. The old way needs to stay around for a while, though. Reported-by: Seth Forshee Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Tested-by: Seth Forshee Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/seccomp_bpf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 4d6f92a9df6b..19cd272c234d 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -6,10 +6,18 @@ */ #include -#include -#define __have_siginfo_t 1 -#define __have_sigval_t 1 -#define __have_sigevent_t 1 + +/* + * glibc 2.26 and later have SIGSYS in siginfo_t. Before that, + * we need to use the kernel's siginfo.h file and trick glibc + * into accepting it. + */ +#if !__GLIBC_PREREQ(2, 26) +# include +# define __have_siginfo_t 1 +# define __have_sigval_t 1 +# define __have_sigevent_t 1 +#endif #include #include @@ -676,7 +684,7 @@ TEST_F_SIGNAL(TRAP, ign, SIGSYS) syscall(__NR_getpid); } -static struct siginfo TRAP_info; +static siginfo_t TRAP_info; static volatile int TRAP_nr; static void TRAP_action(int nr, siginfo_t *info, void *void_context) { -- cgit From 21aadfa2426d5d199ceb474d0159d079c7f17bfa Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 21 Sep 2017 17:13:27 +0800 Subject: selftests/memfd: correct run_tests.sh permission to fix the following issue: ------------------ TAP version 13 selftests: run_tests.sh ======================================== selftests: Warning: file run_tests.sh is not executable, correct this. not ok 1..1 selftests: run_tests.sh [FAIL] ------------------ Signed-off-by: Li Zhijian Signed-off-by: Shuah Khan --- tools/testing/selftests/memfd/run_tests.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/memfd/run_tests.sh diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh old mode 100644 new mode 100755 -- cgit From 01db7fbf5487505b887fbd6a03c51f2adc952196 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Sep 2017 13:46:01 -0600 Subject: selftests: timers: set-timer-lat: fix hang when std out/err are redirected do_timer_oneshot() uses select() as a timer with FD_SETSIZE and readfs is cleared with FD_ZERO without FD_SET. When stdout and stderr are redirected, the test hangs in select forever. Fix the problem calling select() with readfds empty and nfds zero. This is sufficient for using select() for timer. With this fix "./set-timer-lat > /dev/null 2>&1" no longer hangs. Signed-off-by: Shuah Khan Acked-by: Greg Hackmann Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/set-timer-lat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 9c92b7bd5641..ea1af5dbc7b6 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -228,7 +228,6 @@ int do_timer_oneshot(int clock_id, int flags) timer_t tm1; const int interval = 0; struct timeval timeout; - fd_set fds; int err; err = setup_timer(clock_id, flags, interval, &tm1); @@ -237,9 +236,8 @@ int do_timer_oneshot(int clock_id, int flags) memset(&timeout, 0, sizeof(timeout)); timeout.tv_sec = 5; - FD_ZERO(&fds); do { - err = select(FD_SETSIZE, &fds, NULL, NULL, &timeout); + err = select(0, NULL, NULL, NULL, &timeout); } while (err == -1 && errno == EINTR); timer_delete(tm1); -- cgit From eefd95e1f3d47b90dc768e9ebc77d390c4f34809 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Sep 2017 13:05:18 -0600 Subject: selftests: timers: set-timer-lat: Fix hang when testing unsupported alarms When timer_create() fails on a bootime or realtime clock, setup_timer() returns 0 as if timer has been set. Callers wait forever for the timer to expire. This hang is seen on a system that doesn't have support for: CLOCK_REALTIME_ALARM ABSTIME missing CAP_WAKE_ALARM? : [UNSUPPORTED] Test hangs waiting for a timer that hasn't been set to expire. Fix setup_timer() to return 1, add handling in callers to detect the unsupported case and return 0 without waiting to not fail the test. Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/set-timer-lat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index ea1af5dbc7b6..50da45437daa 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -143,7 +143,8 @@ int setup_timer(int clock_id, int flags, int interval, timer_t *tm1) printf("%-22s %s missing CAP_WAKE_ALARM? : [UNSUPPORTED]\n", clockstring(clock_id), flags ? "ABSTIME":"RELTIME"); - return 0; + /* Indicate timer isn't set, so caller doesn't wait */ + return 1; } printf("%s - timer_create() failed\n", clockstring(clock_id)); return -1; @@ -213,8 +214,9 @@ int do_timer(int clock_id, int flags) int err; err = setup_timer(clock_id, flags, interval, &tm1); + /* Unsupported case - return 0 to not fail the test */ if (err) - return err; + return err == 1 ? 0 : err; while (alarmcount < 5) sleep(1); @@ -231,8 +233,9 @@ int do_timer_oneshot(int clock_id, int flags) int err; err = setup_timer(clock_id, flags, interval, &tm1); + /* Unsupported case - return 0 to not fail the test */ if (err) - return err; + return err == 1 ? 0 : err; memset(&timeout, 0, sizeof(timeout)); timeout.tv_sec = 5; -- cgit From 10201655b085df8e000822e496e5d4016a167a36 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 07:15:35 -0500 Subject: gfs2: Fix debugfs glocks dump The switch to rhashtables (commit 88ffbf3e03) broke the debugfs glock dump (/sys/kernel/debug/gfs2//glocks) for dumps bigger than a single buffer: the right function for restarting an rhashtable iteration from the beginning of the hash table is rhashtable_walk_enter; rhashtable_walk_stop + rhashtable_walk_start will just resume from the current position. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Cc: stable@vger.kernel.org # v4.3+ --- fs/gfs2/glock.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 98e845b7841b..11066d8647d2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1945,13 +1945,9 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - int ret; - - if (gi->last_pos <= *pos) - n = (*pos - gi->last_pos); - ret = rhashtable_walk_start(&gi->hti); - if (ret) + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + if (rhashtable_walk_start(&gi->hti) != 0) return NULL; do { @@ -1959,6 +1955,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) } while (gi->gl && n--); gi->last_pos = *pos; + return gi->gl; } @@ -1970,6 +1967,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi); + return gi->gl; } @@ -1980,6 +1978,7 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) gi->gl = NULL; rhashtable_walk_stop(&gi->hti); + rhashtable_walk_exit(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -2042,12 +2041,10 @@ static int __gfs2_glocks_open(struct inode *inode, struct file *file, struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } @@ -2063,7 +2060,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } -- cgit From 8cbd96a6285e8eb65232b5afd3e8d9418453a61c Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 21 Sep 2017 08:13:49 -0700 Subject: nvme: fix sqhd reference when admin queue connect fails Fix bug in sqhd patch. It wasn't the sq that was at risk. In the case where the admin queue connect command fails, the sq->size field is not set. Therefore, this becomes a divide by zero error. Add a quick check to bypass under this failure condition. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index c2a768a94235..1b208beeef50 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -390,7 +390,8 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (status) nvmet_set_status(req, status); - req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; + if (req->sq->size) + req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; req->rsp->sq_head = cpu_to_le16(req->sq->sqhd); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; -- cgit From 1a40d97288c6ffea9b355139e88fa62f0e5439f7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:36 +0300 Subject: nvme-core: Use nvme_wq to queue async events and fw activation async_event_work might race as it is executed from two different workqueues at the moment. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5589f67d2cd8..bb2aad078637 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2676,7 +2676,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, case NVME_SC_ABORT_REQ: ++ctrl->event_limit; if (ctrl->state == NVME_CTRL_LIVE) - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); break; default: break; @@ -2691,7 +2691,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, nvme_queue_scan(ctrl); break; case NVME_AER_NOTICE_FW_ACT_STARTING: - schedule_work(&ctrl->fw_act_work); + queue_work(nvme_wq, &ctrl->fw_act_work); break; default: dev_warn(ctrl->device, "async event result %08x\n", result); -- cgit From 0a960afd60d02808c7f7f36d4aa8a2e07045e1e9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:37 +0300 Subject: nvme-rdma: give up reconnect if state change fails If we failed to transition to state LIVE after a successful reconnect, then controller deletion already started. In this case there is no point moving forward with reconnect. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 58983000964b..8441f6b3f617 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -942,7 +942,12 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } + ctrl->ctrl.nr_reconnects = 0; nvme_start_ctrl(&ctrl->ctrl); -- cgit From e4d753d7e51c0648b9ee33efeed55d45f362fc3d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:38 +0300 Subject: nvme-rdma: don't fully stop the controller in error recovery By calling nvme_stop_ctrl on a already failed controller will wait for the scan work to complete (only by identify timeout expiration which is 60 seconds). This is unnecessary when we already know that the controller has failed. Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 8441f6b3f617..92a03ff5fb4d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -967,7 +967,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_stop_ctrl(&ctrl->ctrl); + nvme_stop_keep_alive(&ctrl->ctrl); if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); -- cgit From 3688feb582a1bc4e58ad50f5eccfdb90615de27b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 15:13:11 -0700 Subject: nvmet-fc: on port remove call put outside lock Avoid calling the put routine, as it may traverse to free routines while holding the target lock. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index c48c83d97e30..6850672ad2a2 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2530,13 +2530,17 @@ nvmet_fc_remove_port(struct nvmet_port *port) { struct nvmet_fc_tgtport *tgtport = port->priv; unsigned long flags; + bool matched = false; spin_lock_irqsave(&nvmet_fc_tgtlock, flags); if (tgtport->port == port) { - nvmet_fc_tgtport_put(tgtport); + matched = true; tgtport->port = NULL; } spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + if (matched) + nvmet_fc_tgtport_put(tgtport); } static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { -- cgit From 0c319d3a144d4b8f1ea2047fd614d2149b68f889 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 16:33:56 -0700 Subject: nvmet-fc: ensure target queue id within range. When searching for queue id's ensure they are within the expected range. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 6850672ad2a2..58e010bdda3e 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -783,6 +783,9 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, u16 qid = nvmet_fc_getqueueid(connection_id); unsigned long flags; + if (qid > NVMET_NR_QUEUES) + return NULL; + spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { -- cgit From 6b71f9e1e849f82abb4a8d54ce7f4b1c71f19ac4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 20 Sep 2017 11:07:26 -0700 Subject: nvmet-fc: sync header templates with comments Comments were incorrect: - defer_rcv was in host port template. moved to target port template - Added Mandatory statements for target port template items Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 9c5cb4480806..a726f96010d5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,11 +346,6 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * - * @defer_rcv: Called by the transport to signal the LLLD that it has - * begun processing of a previously received NVME CMD IU. The LLDD - * is now free to re-use the rcv buffer associated with the - * nvmefc_tgt_fcp_req. - * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -806,11 +801,19 @@ struct nvmet_fc_target_port { * outstanding operation (if there was one) to complete, then will * call the fcp_req_release() callback to return the command's * exchange context back to the LLDD. + * Entrypoint is Mandatory. * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of * abort cleanup. + * Entrypoint is Mandatory. + * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * Entrypoint is Optional. * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. -- cgit From fddc9923c6d41de9fe7b1f323a3cece53e046c88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 14:01:50 -0700 Subject: nvme-fcloop: fix port deletes and callbacks Now that there are potentially long delays between when a remoteport or targetport delete calls is made and when the callback occurs (dev_loss_tmo timeout), no longer block in the delete routines and move the final nport puts to the callbacks. Moved the fcloop_nport_get/put/free routines to avoid forward declarations. Ensure port_info structs used in registrations are nulled in case fields are not set (ex: devloss_tmo values). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fcloop.c | 102 ++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 64 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1fd1afbb8b2a..7b75d9de55ab 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -224,8 +224,6 @@ struct fcloop_nport { struct fcloop_lport *lport; struct list_head nport_list; struct kref ref; - struct completion rport_unreg_done; - struct completion tport_unreg_done; u64 node_name; u64 port_name; u32 port_role; @@ -630,6 +628,32 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, schedule_work(&inireq->iniwork); } +static void +fcloop_nport_free(struct kref *ref) +{ + struct fcloop_nport *nport = + container_of(ref, struct fcloop_nport, ref); + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(nport); +} + +static void +fcloop_nport_put(struct fcloop_nport *nport) +{ + kref_put(&nport->ref, fcloop_nport_free); +} + +static int +fcloop_nport_get(struct fcloop_nport *nport) +{ + return kref_get_unless_zero(&nport->ref); +} + static void fcloop_localport_delete(struct nvme_fc_local_port *localport) { @@ -644,8 +668,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; - /* release any threads waiting for the unreg to complete */ - complete(&rport->nport->rport_unreg_done); + fcloop_nport_put(rport->nport); } static void @@ -653,8 +676,7 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) { struct fcloop_tport *tport = targetport->private; - /* release any threads waiting for the unreg to complete */ - complete(&tport->nport->tport_unreg_done); + fcloop_nport_put(tport->nport); } #define FCLOOP_HW_QUEUES 4 @@ -722,6 +744,7 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, goto out_free_opts; } + memset(&pinfo, 0, sizeof(pinfo)); pinfo.node_name = opts->wwnn; pinfo.port_name = opts->wwpn; pinfo.port_role = opts->roles; @@ -804,32 +827,6 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, return ret ? ret : count; } -static void -fcloop_nport_free(struct kref *ref) -{ - struct fcloop_nport *nport = - container_of(ref, struct fcloop_nport, ref); - unsigned long flags; - - spin_lock_irqsave(&fcloop_lock, flags); - list_del(&nport->nport_list); - spin_unlock_irqrestore(&fcloop_lock, flags); - - kfree(nport); -} - -static void -fcloop_nport_put(struct fcloop_nport *nport) -{ - kref_put(&nport->ref, fcloop_nport_free); -} - -static int -fcloop_nport_get(struct fcloop_nport *nport) -{ - return kref_get_unless_zero(&nport->ref); -} - static struct fcloop_nport * fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) { @@ -938,6 +935,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, if (!nport) return -EIO; + memset(&pinfo, 0, sizeof(pinfo)); pinfo.node_name = nport->node_name; pinfo.port_name = nport->port_name; pinfo.port_role = nport->port_role; @@ -979,24 +977,12 @@ __unlink_remote_port(struct fcloop_nport *nport) } static int -__wait_remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) +__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) { - int ret; - if (!rport) return -EALREADY; - init_completion(&nport->rport_unreg_done); - - ret = nvme_fc_unregister_remoteport(rport->remoteport); - if (ret) - return ret; - - wait_for_completion(&nport->rport_unreg_done); - - fcloop_nport_put(nport); - - return ret; + return nvme_fc_unregister_remoteport(rport->remoteport); } static ssize_t @@ -1029,7 +1015,7 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, if (!nport) return -ENOENT; - ret = __wait_remoteport_unreg(nport, rport); + ret = __remoteport_unreg(nport, rport); return ret ? ret : count; } @@ -1086,24 +1072,12 @@ __unlink_target_port(struct fcloop_nport *nport) } static int -__wait_targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) +__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) { - int ret; - if (!tport) return -EALREADY; - init_completion(&nport->tport_unreg_done); - - ret = nvmet_fc_unregister_targetport(tport->targetport); - if (ret) - return ret; - - wait_for_completion(&nport->tport_unreg_done); - - fcloop_nport_put(nport); - - return ret; + return nvmet_fc_unregister_targetport(tport->targetport); } static ssize_t @@ -1136,7 +1110,7 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, if (!nport) return -ENOENT; - ret = __wait_targetport_unreg(nport, tport); + ret = __targetport_unreg(nport, tport); return ret ? ret : count; } @@ -1223,11 +1197,11 @@ static void __exit fcloop_exit(void) spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __wait_targetport_unreg(nport, tport); + ret = __targetport_unreg(nport, tport); if (ret) pr_warn("%s: Failed deleting target port\n", __func__); - ret = __wait_remoteport_unreg(nport, rport); + ret = __remoteport_unreg(nport, rport); if (ret) pr_warn("%s: Failed deleting remote port\n", __func__); -- cgit From a08588ea486a5590b50c36f437dc86350271b250 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Sep 2017 23:24:39 -0700 Subject: irqchip/mips-gic: Fix shifts to extract register fields The MIPS GIC driver is incorrectly using __fls to shift registers, intending to shift to the least significant bit of a value based upon its mask but instead shifting off all but the value's top bit. It should actually be using __ffs to shift to the first, not last, bit of the value. Apparently the system I used when testing commit 3680746abd87 ("irqchip: mips-gic: Convert remaining shared reg access to new accessors") and commit b2b2e584ceab ("irqchip: mips-gic: Clean up mti, reserved-cpu-vectors handling") managed to work correctly despite this issue, but not all systems do... Fixes: 3680746abd87 ("irqchip: mips-gic: Convert remaining shared reg access to new accessors") Fixes: b2b2e584ceab ("irqchip: mips-gic: Clean up mti, reserved-cpu-vectors handling") Signed-off-by: Paul Burton Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/20170922062440.23701-2-paul.burton@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 40159ac12ac8..0022b31ad2c5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -645,7 +645,7 @@ static int __init gic_of_init(struct device_node *node, /* Find the first available CPU vector. */ i = 0; - reserved = (C_SW0 | C_SW1) >> __fls(C_SW0); + reserved = (C_SW0 | C_SW1) >> __ffs(C_SW0); while (!of_property_read_u32_index(node, "mti,reserved-cpu-vectors", i++, &cpu_vec)) reserved |= BIT(cpu_vec); @@ -684,11 +684,11 @@ static int __init gic_of_init(struct device_node *node, gicconfig = read_gic_config(); gic_shared_intrs = gicconfig & GIC_CONFIG_NUMINTERRUPTS; - gic_shared_intrs >>= __fls(GIC_CONFIG_NUMINTERRUPTS); + gic_shared_intrs >>= __ffs(GIC_CONFIG_NUMINTERRUPTS); gic_shared_intrs = (gic_shared_intrs + 1) * 8; gic_vpes = gicconfig & GIC_CONFIG_PVPS; - gic_vpes >>= __fls(GIC_CONFIG_PVPS); + gic_vpes >>= __ffs(GIC_CONFIG_PVPS); gic_vpes = gic_vpes + 1; if (cpu_has_veic) { -- cgit From d9f82930a5b41f28fadb1e4838b877ae528456d3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Sep 2017 23:24:40 -0700 Subject: irqchip/mips-gic: Use effective affinity to unmask Commit 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") adjusted the way we handle masking interrupts to set & clear the interrupt's bit in each pcpu_mask. This allows us to avoid needing to read the GIC mask registers and perform a bitwise and of their values with the pending & pcpu_masks. Unfortunately this didn't quite work for IPIs, which were mapped to a particular CPU/VP during initialisation but never set the affinity or effective_affinity fields of their struct irq_desc. This led to them losing their affinity when gic_unmask_irq() was called for them, and they'd all become affine to cpu0. Fix this by: 1) Setting the effective affinity of interrupts in gic_shared_irq_domain_map(), which is where we actually map an interrupt to a CPU/VP. This ensures that the effective affinity mask is always valid, not just after explicitly setting affinity. 2) Using an interrupt's effective affinity when unmasking it, which prevents gic_unmask_irq() from unintentionally changing which pcpu_mask includes an interrupt. Fixes: 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") Signed-off-by: Paul Burton Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/20170922062440.23701-3-paul.burton@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 0022b31ad2c5..c90976d7e53c 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -175,14 +175,13 @@ static void gic_mask_irq(struct irq_data *d) static void gic_unmask_irq(struct irq_data *d) { - struct cpumask *affinity = irq_data_get_affinity_mask(d); unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); unsigned int cpu; write_gic_smask(intr); gic_clear_pcpu_masks(intr); - cpu = cpumask_first_and(affinity, cpu_online_mask); + cpu = cpumask_first(irq_data_get_effective_affinity_mask(d)); set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); } @@ -420,13 +419,17 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw, unsigned int cpu) { int intr = GIC_HWIRQ_TO_SHARED(hw); + struct irq_data *data; unsigned long flags; + data = irq_get_irq_data(virq); + spin_lock_irqsave(&gic_lock, flags); write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu))); gic_clear_pcpu_masks(intr); set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); + irq_data_update_effective_affinity(data, cpumask_of(cpu)); spin_unlock_irqrestore(&gic_lock, flags); return 0; -- cgit From 7755d83e48397e822aac751b1545f8bcf71d133e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: irqdomain: Add __rcu annotations to radix tree accessors Fix various address spaces warning of sparse. kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/1506082841-11530-1-git-send-email-yamada.masahiro@socionext.com --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ -- cgit From c88f0e6b06f4092995688211a631bb436125d77b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 27 Aug 2017 20:25:26 +0800 Subject: scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly ChunYu found a kernel crash by syzkaller: [ 651.617875] kasan: CONFIG_KASAN_INLINE enabled [ 651.618217] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 651.618731] general protection fault: 0000 [#1] SMP KASAN [ 651.621543] CPU: 1 PID: 9539 Comm: scsi Not tainted 4.11.0.cov #32 [ 651.621938] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 651.622309] task: ffff880117780000 task.stack: ffff8800a3188000 [ 651.622762] RIP: 0010:skb_release_data+0x26c/0x590 [...] [ 651.627260] Call Trace: [ 651.629156] skb_release_all+0x4f/0x60 [ 651.629450] consume_skb+0x1a5/0x600 [ 651.630705] netlink_unicast+0x505/0x720 [ 651.632345] netlink_sendmsg+0xab2/0xe70 [ 651.633704] sock_sendmsg+0xcf/0x110 [ 651.633942] ___sys_sendmsg+0x833/0x980 [ 651.637117] __sys_sendmsg+0xf3/0x240 [ 651.638820] SyS_sendmsg+0x32/0x50 [ 651.639048] entry_SYSCALL_64_fastpath+0x1f/0xc2 It's caused by skb_shared_info at the end of sk_buff was overwritten by ISCSI_KEVENT_IF_ERROR when parsing nlmsg info from skb in iscsi_if_rx. During the loop if skb->len == nlh->nlmsg_len and both are sizeof(*nlh), ev = nlmsg_data(nlh) will acutally get skb_shinfo(SKB) instead and set a new value to skb_shinfo(SKB)->nr_frags by ev->type. This patch is to fix it by checking nlh->nlmsg_len properly there to avoid over accessing sk_buff. Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 8934f19bce8e..0190aeff5f7f 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -3689,7 +3689,7 @@ iscsi_if_rx(struct sk_buff *skb) uint32_t group; nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || + if (nlh->nlmsg_len < sizeof(*nlh) + sizeof(*ev) || skb->len < nlh->nlmsg_len) { break; } -- cgit From f3a0c7b3fa7cdf6783827c245c62772687f8b3ac Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 9 Sep 2017 20:37:03 +0800 Subject: MAINTAINERS: Add entry for MediaTek PMIC LED driver Add myself as a maintainer to support existing SoCs and push forward following MediaTek PMICs with LEDs to reuse the driver. Signed-off-by: Sean Wang Signed-off-by: Jacek Anaszewski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..22cb6cc91081 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8586,6 +8586,12 @@ M: Sean Wang S: Maintained F: drivers/media/rc/mtk-cir.c +MEDIATEK PMIC LED DRIVER +M: Sean Wang +S: Maintained +F: drivers/leds/leds-mt6323.c +F: Documentation/devicetree/bindings/leds/leds-mt6323.txt + MEDIATEK ETHERNET DRIVER M: Felix Fietkau M: John Crispin -- cgit From fac1c2040203363eab6c6e86ce883cb71390418f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:15 +0200 Subject: smp/hotplug: Add state diagram Add a state diagram to clarify when which states are ran where. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.661598270@infradead.org --- include/linux/cpuhotplug.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f24bfb2b9a2d..477b2e6f60f7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -3,6 +3,24 @@ #include +/* + * CPU-up CPU-down + * + * BP AP BP AP + * + * OFFLINE OFFLINE + * | ^ + * v | + * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) + * | AP_OFFLINE + * v (IRQ-off) ,---------------^ + * AP_ONLNE | (stop_machine) + * | TEARDOWN_CPU <- AP_ONLINE_IDLE + * | ^ + * v | + * AP_ACTIVE AP_ACTIVE + */ + enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, -- cgit From 96abb968549cdefd0964d1f7af0a79f4e6e7f897 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:16 +0200 Subject: smp/hotplug: Allow external multi-instance rollback Currently the rollback of multi-instance states is handled inside cpuhp_invoke_callback(). The problem is that when we want to allow an explicit state change for rollback, we need to return from the function without doing the rollback. Change cpuhp_invoke_callback() to optionally return the multi-instance state, such that rollback can be done from a subsequent call. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.720361181@infradead.org --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..323b71050b54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -123,13 +123,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked - * @step: The step in the state machine + * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked + * @node: For multi-instance, do a single entry callback for install/remove + * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup, struct hlist_node *node) + bool bringup, struct hlist_node *node, + struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); @@ -138,6 +141,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { + WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; @@ -152,6 +156,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* Single invocation for instance add/remove */ if (node) { + WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +166,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { + if (lastp && node == *lastp) + break; + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); - if (ret) - goto err; + if (ret) { + if (!lastp) + goto err; + + *lastp = node; + return ret; + } cnt++; } + if (lastp) + *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ @@ -323,7 +338,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -334,7 +349,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -350,7 +365,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } } @@ -362,7 +377,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -428,11 +443,13 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -472,7 +489,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup, node); + return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->cb_state = state; st->single = true; @@ -595,7 +612,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -776,7 +793,7 @@ void notify_cpu_starting(unsigned int cpu) rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -1307,9 +1324,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; -- cgit From 4dddfb5faa6118564b0c54a163353d13882299d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:17 +0200 Subject: smp/hotplug: Rewrite AP state machine core There is currently no explicit state change on rollback. That is, st->bringup, st->rollback and st->target are not consistent when doing the rollback. Rework the AP state handling to be more coherent. This does mean we have to do a second AP kick-and-wait for rollback, but since rollback is the slow path of a slowpath, this really should not matter. Take this opportunity to simplify the AP thread function to only run a single callback per invocation. This unifies the three single/up/down modes is supports. The looping it used to do for up/down are achieved by retaining should_run and relying on the main smpboot_thread_fn() loop. (I have most of a patch that does the same for the BP state handling, but that's not critical and gets a little complicated because CPUHP_BRINGUP_CPU does the AP handoff from a callback, which gets recursive @st usage, I still have de-fugly that.) [ tglx: Move cpuhp_down_callbacks() et al. into the HOTPLUG_CPU section to avoid gcc complaining about unused functions. Make the HOTPLUG_CPU one piece instead of having two consecutive ifdef sections of the same type. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.769658088@infradead.org --- kernel/cpu.c | 321 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 115 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 323b71050b54..1139063de5af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,6 +58,7 @@ struct cpuhp_cpu_state { bool single; bool bringup; struct hlist_node *node; + struct hlist_node *last; enum cpuhp_state cb_state; int result; struct completion done; @@ -112,6 +113,14 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -286,7 +295,72 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + + st->rollback = false; + st->last = NULL; + + st->target = target; + st->single = false; + st->bringup = st->state < target; + + return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ + st->rollback = true; + + /* + * If we have st->last we need to undo partial multi_instance of this + * state first. Otherwise start undo at the previous state. + */ + if (!st->last) { + if (st->bringup) + st->state--; + else + st->state++; + } + + st->target = prev_state; + st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ + if (!st->single && st->state == st->target) + return; + + st->result = 0; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state; + int ret; + + prev_state = cpuhp_set_state(st, target); + __cpuhp_kick_ap(st); + if ((ret = st->result)) { + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); + } + + return ret; +} static int bringup_wait_for_ap(unsigned int cpu) { @@ -301,12 +375,10 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) { - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - } - return st->result; + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -332,32 +404,6 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); - } -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - enum cpuhp_state target) -{ - enum cpuhp_state prev_state = st->state; - int ret = 0; - - for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } - return ret; -} static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { @@ -404,71 +450,90 @@ static int cpuhp_should_run(unsigned int cpu) return st->should_run; } -/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - - return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - return cpuhp_up_callbacks(cpu, st, st->target); -} - /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + * - single: runs st->cb_state + * - up: runs ++st->state, while st->state < st->target + * - down: runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - int ret = 0; + bool bringup = st->bringup; + enum cpuhp_state state; /* - * Paired with the mb() in cpuhp_kick_ap_work and - * cpuhp_invoke_ap_callback, so the work set is consistent visible. + * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures + * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (!st->should_run) - return; - st->should_run = false; + if (WARN_ON_ONCE(!st->should_run)) + return; lock_map_acquire(&cpuhp_state_lock_map); - /* Single callback invocation for [un]install ? */ + if (st->single) { - if (st->cb_state < CPUHP_AP_ONLINE) { - local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); - local_irq_enable(); + state = st->cb_state; + st->should_run = false; + } else { + if (bringup) { + st->state++; + state = st->state; + st->should_run = (st->state < st->target); + WARN_ON_ONCE(st->state > st->target); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); + state = st->state; + st->state--; + st->should_run = (st->state > st->target); + WARN_ON_ONCE(st->state < st->target); } - } else if (st->rollback) { - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + } + + WARN_ON_ONCE(!cpuhp_is_ap_state(state)); + + if (st->rollback) { + struct cpuhp_step *step = cpuhp_get_step(state); + if (step->skip_onerr) + goto next; + } + + if (cpuhp_is_atomic_state(state)) { + local_irq_disable(); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + local_irq_enable(); - undo_cpu_down(cpu, st); - st->rollback = false; + /* + * STARTING/DYING must not fail! + */ + WARN_ON_ONCE(st->result); } else { - /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - /* Regular hotplug work */ - if (st->state < st->target) - ret = cpuhp_ap_online(cpu, st); - else if (st->state > st->target) - ret = cpuhp_ap_offline(cpu, st); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + } + + if (st->result) { + /* + * If we fail on a rollback, we're up a creek without no + * paddle, no way forward, no way back. We loose, thanks for + * playing. + */ + WARN_ON_ONCE(st->rollback); + st->should_run = false; } + +next: lock_map_release(&cpuhp_state_lock_map); - st->result = ret; - complete(&st->done); + + if (!st->should_run) + complete(&st->done); } /* Invoke a single callback on a remote cpu */ @@ -477,6 +542,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; if (!cpu_online(cpu)) return 0; @@ -491,48 +557,43 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + st->rollback = false; + st->last = NULL; + + st->node = node; + st->bringup = bringup; st->cb_state = state; st->single = true; - st->bringup = bringup; - st->node = node; - /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() - */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); - wait_for_completion(&st->done); - return st->result; -} + __cpuhp_kick_ap(st); -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ - st->result = 0; - st->single = false; /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() + * If we failed and did a partial, do a rollback. */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); + if ((ret = st->result) && st->last) { + st->rollback = true; + st->bringup = !bringup; + + __cpuhp_kick_ap(st); + } + + return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; + enum cpuhp_state prev_state = st->state; + int ret; - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); lock_map_acquire(&cpuhp_state_lock_map); lock_map_release(&cpuhp_state_lock_map); - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - trace_cpuhp_exit(cpu, st->state, state, st->result); - return st->result; + + trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); + ret = cpuhp_kick_ap(st, st->target); + trace_cpuhp_exit(cpu, st->state, prev_state, ret); + + return ret; } static struct smp_hotplug_thread cpuhp_threads = { @@ -693,11 +754,32 @@ void cpuhp_report_idle_dead(void) cpuhp_complete_idle_dead, st, 0); } -#else -#define takedown_cpu NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + return ret; +} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -716,13 +798,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; - st->target = target; + prev_state = cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { + st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -737,6 +819,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; + + st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -744,9 +828,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { - st->target = prev_state; - st->rollback = true; - cpuhp_kick_ap_work(cpu); + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); } out: @@ -771,11 +854,15 @@ out: cpu_maps_update_done(); return err; } + int cpu_down(unsigned int cpu) { return do_cpu_down(cpu, CPUHP_OFFLINE); } EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** @@ -846,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - st->target = target; + cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1313,6 +1400,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; + /* + * If there's nothing to do, we done. + * Relies on the union for multi_instance. + */ if ((bringup && !sp->startup.single) || (!bringup && !sp->teardown.single)) return 0; -- cgit From 724a86881d03ee5794148e65142e24ed3621be66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:18 +0200 Subject: smp/hotplug: Callback vs state-machine consistency While the generic callback functions have an 'int' return and thus appear to be allowed to return error, this is not true for all states. Specifically, what used to be STARTING/DYING are ran with IRQs disabled from critical parts of CPU bringup/teardown and are not allowed to fail. Add WARNs to enforce this rule. But since some callbacks are indeed allowed to fail, we have the situation where a state-machine rollback encounters a failure, in this case we're stuck, we can't go forward and we can't go back. Also add a WARN for that case. AFAICT this is a fundamental 'problem' with no real obvious solution. We want the 'prepare' callbacks to allow failure on either up or down. Typically on prepare-up this would be things like -ENOMEM from resource allocations, and the typical usage in prepare-down would be something like -EBUSY to avoid CPUs being taken away. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.819539119@infradead.org --- kernel/cpu.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 1139063de5af..d6f1b8c36400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -202,7 +202,14 @@ err: hlist_for_each(node, &step->list) { if (!cnt--) break; - cbm(cpu, node); + + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + /* + * Rollback must not fail, + */ + WARN_ON_ONCE(ret); } return ret; } @@ -659,6 +666,7 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); + int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -672,8 +680,13 @@ static int take_cpu_down(void *_param) WARN_ON(st->state != CPUHP_TEARDOWN_CPU); st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + /* + * DYING must not fail! + */ + WARN_ON_ONCE(ret); + } /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -876,11 +889,16 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + /* + * STARTING must not fail! + */ + WARN_ON_ONCE(ret); } } -- cgit From 5f4b55e10645b7371322c800a5ec745cab487a6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:20 +0200 Subject: smp/hotplug: Differentiate the AP-work lockdep class between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. CPU0 CPU1 CPU2 cpuhp_up_callbacks: takedown_cpu: cpuhp_thread_fun: cpuhp_state irq_lock_sparse() irq_lock_sparse() wait_for_completion() cpuhp_state complete() Now that we have consistent AP state, we can trivially separate the AP-work class between up and down using st->bringup. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.922524234@infradead.org --- kernel/cpu.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d6f1b8c36400..d5c09985fbb6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -68,9 +68,26 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = - STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct lockdep_map cpuhp_state_up_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ + lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ + lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } + #endif /** @@ -486,7 +503,7 @@ static void cpuhp_thread_fun(unsigned int cpu) if (WARN_ON_ONCE(!st->should_run)) return; - lock_map_acquire(&cpuhp_state_lock_map); + cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; @@ -537,7 +554,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } next: - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_release(bringup); if (!st->should_run) complete(&st->done); @@ -554,8 +571,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls @@ -593,8 +613,11 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state prev_state = st->state; int ret; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(st, st->target); -- cgit From 5ebe7742fff8be5f1359bc50f5d43fb6ff7bd060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:19 +0200 Subject: smp/hotplug: Differentiate the AP completion between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. takedown_cpu() irq_lock_sparse() wait_for_completion(&st->done) cpuhp_thread_fun cpuhp_up_callback cpuhp_invoke_callback irq_affinity_online_cpu irq_local_spare() irq_unlock_sparse() complete(&st->done) Now that we have consistent AP state, we can trivially separate the AP completion between up and down using st->bringup. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.872472799@infradead.org --- kernel/cpu.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d5c09985fbb6..6bbe261b851f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -46,7 +46,8 @@ * @bringup: Single callback bringup or teardown selector * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation - * @done: Signal completion to the issuer of the task + * @done_up: Signal completion to the issuer of the task for cpu-up + * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; @@ -61,7 +62,8 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; - struct completion done; + struct completion done_up; + struct completion done_down; #endif }; @@ -130,14 +132,6 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -232,6 +226,26 @@ err: } #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -368,7 +382,7 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) smp_mb(); st->should_run = true; wake_up_process(st->thread); - wait_for_completion(&st->done); + wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) @@ -391,7 +405,7 @@ static int bringup_wait_for_ap(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; @@ -464,7 +478,8 @@ static void cpuhp_create(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - init_completion(&st->done); + init_completion(&st->done_up); + init_completion(&st->done_down); } static int cpuhp_should_run(unsigned int cpu) @@ -557,7 +572,7 @@ next: cpuhp_lock_release(bringup); if (!st->should_run) - complete(&st->done); + complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ @@ -753,7 +768,7 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -772,7 +787,7 @@ static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; - complete(&st->done); + complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) @@ -939,7 +954,7 @@ void cpuhp_online_idle(enum cpuhp_state state) return; st->state = CPUHP_AP_ONLINE_IDLE; - complete(&st->done); + complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ -- cgit From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- include/linux/cpuhotplug.h | 3 ++- kernel/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 477b2e6f60f7..6d508767e144 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,7 +22,8 @@ */ enum cpuhp_state { - CPUHP_OFFLINE, + CPUHP_INVALID = -1, + CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, diff --git a/kernel/cpu.c b/kernel/cpu.c index 6bbe261b851f..8de11a29e495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -67,7 +68,9 @@ struct cpuhp_cpu_state { #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = @@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; @@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. + */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; -- cgit From 910801809b2e40a4baedd080ef5d80b4a180e70e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:38 +0200 Subject: security/keys: properly zero out sensitive key material in big_key Error paths forgot to zero out sensitive material, so this patch changes some kfrees into a kzfrees. Signed-off-by: Jason A. Donenfeld Signed-off-by: David Howells Reviewed-by: Eric Biggers Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Cc: stable@vger.kernel.org --- security/keys/big_key.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 6acb00f6f22c..507d6fb86a4f 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -195,7 +195,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) *path = file->f_path; path_get(path); fput(file); - kfree(data); + kzfree(data); } else { /* Just store the data in a buffer */ void *data = kmalloc(datalen, GFP_KERNEL); @@ -211,9 +211,9 @@ int big_key_preparse(struct key_preparsed_payload *prep) err_fput: fput(file); err_enckey: - kfree(enckey); + kzfree(enckey); error: - kfree(data); + kzfree(data); return ret; } @@ -227,7 +227,7 @@ void big_key_free_preparse(struct key_preparsed_payload *prep) path_put(path); } - kfree(prep->payload.data[big_key_data]); + kzfree(prep->payload.data[big_key_data]); } /* @@ -259,7 +259,7 @@ void big_key_destroy(struct key *key) path->mnt = NULL; path->dentry = NULL; } - kfree(key->payload.data[big_key_data]); + kzfree(key->payload.data[big_key_data]); key->payload.data[big_key_data] = NULL; } @@ -328,7 +328,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) err_fput: fput(file); error: - kfree(data); + kzfree(data); } else { ret = datalen; if (copy_to_user(buffer, key->payload.data[big_key_data], -- cgit From 428490e38b2e352812e0b765d8bceafab0ec441d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:39 +0200 Subject: security/keys: rewrite all of big_key crypto This started out as just replacing the use of crypto/rng with get_random_bytes_wait, so that we wouldn't use bad randomness at boot time. But, upon looking further, it appears that there were even deeper underlying cryptographic problems, and that this seems to have been committed with very little crypto review. So, I rewrote the whole thing, trying to keep to the conventions introduced by the previous author, to fix these cryptographic flaws. It makes no sense to seed crypto/rng at boot time and then keep using it like this, when in fact there's already get_random_bytes_wait, which can ensure there's enough entropy and be a much more standard way of generating keys. Since this sensitive material is being stored untrusted, using ECB and no authentication is simply not okay at all. I find it surprising and a bit horrifying that this code even made it past basic crypto review, which perhaps points to some larger issues. This patch moves from using AES-ECB to using AES-GCM. Since keys are uniquely generated each time, we can set the nonce to zero. There was also a race condition in which the same key would be reused at the same time in different threads. A mutex fixes this issue now. So, to summarize, this commit fixes the following vulnerabilities: * Low entropy key generation, allowing an attacker to potentially guess or predict keys. * Unauthenticated encryption, allowing an attacker to modify the cipher text in particular ways in order to manipulate the plaintext, which is is even more frightening considering the next point. * Use of ECB mode, allowing an attacker to trivially swap blocks or compare identical plaintext blocks. * Key re-use. * Faulty memory zeroing. Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Biggers Signed-off-by: David Howells Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Cc: stable@vger.kernel.org --- security/keys/Kconfig | 4 +- security/keys/big_key.c | 127 ++++++++++++++++++++++-------------------------- 2 files changed, 60 insertions(+), 71 deletions(-) diff --git a/security/keys/Kconfig b/security/keys/Kconfig index a7a23b5541f8..91eafada3164 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -45,10 +45,8 @@ config BIG_KEYS bool "Large payload keys" depends on KEYS depends on TMPFS - depends on (CRYPTO_ANSI_CPRNG = y || CRYPTO_DRBG = y) select CRYPTO_AES - select CRYPTO_ECB - select CRYPTO_RNG + select CRYPTO_GCM help This option provides support for holding large keys within the kernel (for example Kerberos ticket caches). The data may be stored out to diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 507d6fb86a4f..e607830b6154 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -1,5 +1,6 @@ /* Large capacity key type * + * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * @@ -16,10 +17,10 @@ #include #include #include +#include #include #include -#include -#include +#include /* * Layout of key payload words. @@ -49,7 +50,12 @@ enum big_key_op { /* * Key size for big_key data encryption */ -#define ENC_KEY_SIZE 16 +#define ENC_KEY_SIZE 32 + +/* + * Authentication tag length + */ +#define ENC_AUTHTAG_SIZE 16 /* * big_key defined keys take an arbitrary string as the description and an @@ -64,57 +70,62 @@ struct key_type key_type_big_key = { .destroy = big_key_destroy, .describe = big_key_describe, .read = big_key_read, + /* no ->update(); don't add it without changing big_key_crypt() nonce */ }; /* - * Crypto names for big_key data encryption + * Crypto names for big_key data authenticated encryption */ -static const char big_key_rng_name[] = "stdrng"; -static const char big_key_alg_name[] = "ecb(aes)"; +static const char big_key_alg_name[] = "gcm(aes)"; /* - * Crypto algorithms for big_key data encryption + * Crypto algorithms for big_key data authenticated encryption */ -static struct crypto_rng *big_key_rng; -static struct crypto_skcipher *big_key_skcipher; +static struct crypto_aead *big_key_aead; /* - * Generate random key to encrypt big_key data + * Since changing the key affects the entire object, we need a mutex. */ -static inline int big_key_gen_enckey(u8 *key) -{ - return crypto_rng_get_bytes(big_key_rng, key, ENC_KEY_SIZE); -} +static DEFINE_MUTEX(big_key_aead_lock); /* * Encrypt/decrypt big_key data */ static int big_key_crypt(enum big_key_op op, u8 *data, size_t datalen, u8 *key) { - int ret = -EINVAL; + int ret; struct scatterlist sgio; - SKCIPHER_REQUEST_ON_STACK(req, big_key_skcipher); - - if (crypto_skcipher_setkey(big_key_skcipher, key, ENC_KEY_SIZE)) { + struct aead_request *aead_req; + /* We always use a zero nonce. The reason we can get away with this is + * because we're using a different randomly generated key for every + * different encryption. Notably, too, key_type_big_key doesn't define + * an .update function, so there's no chance we'll wind up reusing the + * key to encrypt updated data. Simply put: one key, one encryption. + */ + u8 zero_nonce[crypto_aead_ivsize(big_key_aead)]; + + aead_req = aead_request_alloc(big_key_aead, GFP_KERNEL); + if (!aead_req) + return -ENOMEM; + + memset(zero_nonce, 0, sizeof(zero_nonce)); + sg_init_one(&sgio, data, datalen + (op == BIG_KEY_ENC ? ENC_AUTHTAG_SIZE : 0)); + aead_request_set_crypt(aead_req, &sgio, &sgio, datalen, zero_nonce); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + aead_request_set_ad(aead_req, 0); + + mutex_lock(&big_key_aead_lock); + if (crypto_aead_setkey(big_key_aead, key, ENC_KEY_SIZE)) { ret = -EAGAIN; goto error; } - - skcipher_request_set_tfm(req, big_key_skcipher); - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - - sg_init_one(&sgio, data, datalen); - skcipher_request_set_crypt(req, &sgio, &sgio, datalen, NULL); - if (op == BIG_KEY_ENC) - ret = crypto_skcipher_encrypt(req); + ret = crypto_aead_encrypt(aead_req); else - ret = crypto_skcipher_decrypt(req); - - skcipher_request_zero(req); - + ret = crypto_aead_decrypt(aead_req); error: + mutex_unlock(&big_key_aead_lock); + aead_request_free(aead_req); return ret; } @@ -146,16 +157,13 @@ int big_key_preparse(struct key_preparsed_payload *prep) * * File content is stored encrypted with randomly generated key. */ - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; loff_t pos = 0; - /* prepare aligned data to encrypt */ data = kmalloc(enclen, GFP_KERNEL); if (!data) return -ENOMEM; - memcpy(data, prep->data, datalen); - memset(data + datalen, 0x00, enclen - datalen); /* generate random key */ enckey = kmalloc(ENC_KEY_SIZE, GFP_KERNEL); @@ -163,13 +171,12 @@ int big_key_preparse(struct key_preparsed_payload *prep) ret = -ENOMEM; goto error; } - - ret = big_key_gen_enckey(enckey); - if (ret) + ret = get_random_bytes_wait(enckey, ENC_KEY_SIZE); + if (unlikely(ret)) goto err_enckey; /* encrypt aligned data */ - ret = big_key_crypt(BIG_KEY_ENC, data, enclen, enckey); + ret = big_key_crypt(BIG_KEY_ENC, data, datalen, enckey); if (ret) goto err_enckey; @@ -295,7 +302,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) struct file *file; u8 *data; u8 *enckey = (u8 *)key->payload.data[big_key_data]; - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; loff_t pos = 0; data = kmalloc(enclen, GFP_KERNEL); @@ -344,47 +351,31 @@ error: */ static int __init big_key_init(void) { - struct crypto_skcipher *cipher; - struct crypto_rng *rng; int ret; - rng = crypto_alloc_rng(big_key_rng_name, 0, 0); - if (IS_ERR(rng)) { - pr_err("Can't alloc rng: %ld\n", PTR_ERR(rng)); - return PTR_ERR(rng); - } - - big_key_rng = rng; - - /* seed RNG */ - ret = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); - if (ret) { - pr_err("Can't reset rng: %d\n", ret); - goto error_rng; - } - /* init block cipher */ - cipher = crypto_alloc_skcipher(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(cipher)) { - ret = PTR_ERR(cipher); + big_key_aead = crypto_alloc_aead(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(big_key_aead)) { + ret = PTR_ERR(big_key_aead); pr_err("Can't alloc crypto: %d\n", ret); - goto error_rng; + return ret; + } + ret = crypto_aead_setauthsize(big_key_aead, ENC_AUTHTAG_SIZE); + if (ret < 0) { + pr_err("Can't set crypto auth tag len: %d\n", ret); + goto free_aead; } - - big_key_skcipher = cipher; ret = register_key_type(&key_type_big_key); if (ret < 0) { pr_err("Can't register type: %d\n", ret); - goto error_cipher; + goto free_aead; } return 0; -error_cipher: - crypto_free_skcipher(big_key_skcipher); -error_rng: - crypto_free_rng(big_key_rng); +free_aead: + crypto_free_aead(big_key_aead); return ret; } -- cgit From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Sep 2017 10:44:36 -0700 Subject: PM / OPP: Call notifier without holding opp_table->lock The notifier callbacks may want to call some OPP helper routines which may try to take the same opp_table->lock again and cause a deadlock. One such usecase was reported by Chanwoo Choi, where calling dev_pm_opp_disable() leads us to the devfreq's OPP notifier handler, which further calls dev_pm_opp_find_freq_floor() and it deadlocks. We don't really need the opp_table->lock to be held across the notifier call though, all we want to make sure is that the 'opp' doesn't get freed while being used from within the notifier chain. We can do it with help of dev_pm_opp_get/put() as well. Let's do it. Cc: 4.11+ # 4.11+ Fixes: 5b650b388844 "PM / OPP: Take kref from _find_opp_table()" Reported-by: Chanwoo Choi Tested-by: Chanwoo Choi Reviewed-by: Stephen Boyd Reviewed-by: Chanwoo Choi Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index a8cc14fd8ae4..a6de32530693 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, opp->available = availability_req; + dev_pm_opp_get(opp); + mutex_unlock(&opp_table->lock); + /* Notify the change of the OPP availability */ if (availability_req) blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE, @@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_DISABLE, opp); + dev_pm_opp_put(opp); + goto put_table; + unlock: mutex_unlock(&opp_table->lock); +put_table: dev_pm_opp_put_opp_table(opp_table); return r; } -- cgit From 675195d0be27391d48d8d23c7c62991505168528 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 20 Sep 2017 08:58:53 +0200 Subject: scsi: scsi_transport_fc: set scsi_target_id upon rescan When an rport is found in the bindings array there is no guarantee that it had been a target port, so we need to call fc_remote_port_rolechg() here to ensure the scsi_target_id is set correctly. Otherwise the port will never be scanned. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index ba9d70f8a6a1..e74fffc32c75 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2876,7 +2876,6 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, memcpy(&rport->port_name, &ids->port_name, sizeof(rport->port_name)); rport->port_id = ids->port_id; - rport->roles = ids->roles; rport->port_state = FC_PORTSTATE_ONLINE; rport->flags &= ~FC_RPORT_FAST_FAIL_TIMEDOUT; @@ -2885,15 +2884,7 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, fci->f->dd_fcrport_size); spin_unlock_irqrestore(shost->host_lock, flags); - if (ids->roles & FC_PORT_ROLE_FCP_TARGET) { - scsi_target_unblock(&rport->dev, SDEV_RUNNING); - - /* initiate a scan of the target */ - spin_lock_irqsave(shost->host_lock, flags); - rport->flags |= FC_RPORT_SCAN_PENDING; - scsi_queue_work(shost, &rport->scan_work); - spin_unlock_irqrestore(shost->host_lock, flags); - } + fc_remote_port_rolechg(rport, ids->roles); return rport; } } -- cgit From d477bf3af1e88fe27c893f84136647fe11963198 Mon Sep 17 00:00:00 2001 From: Suniel Mahesh Date: Thu, 21 Sep 2017 19:09:03 +0530 Subject: cpufreq: dt: Fix sysfs duplicate filename creation for platform-device ti-cpufreq and cpufreq-dt-platdev drivers are registering platform-device with same name "cpufreq-dt" using platform_device_register_*() routines. This is leading to build warnings appended below. Providing hardware information to OPP framework along with the platform- device creation should be done by ti-cpufreq driver before cpufreq-dt driver comes into place. This patch add's TI am33xx, am43 and dra7 platforms (which use opp-v2 property) to the blacklist of devices in cpufreq-dt-platform driver to avoid creating platform-device twice and remove build warnings. [ 2.370167] ------------[ cut here ]------------ [ 2.375087] WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x58/0x78 [ 2.383112] sysfs: cannot create duplicate filename '/devices/platform/cpufreq-dt' [ 2.391219] Modules linked in: [ 2.394506] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-next-20170912 #1 [ 2.402006] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.408437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.416568] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.424165] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.431488] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.439351] [] (warn_slowpath_fmt) from [] (sysfs_warn_dup+0x58/0x78) [ 2.447938] [] (sysfs_warn_dup) from [] (sysfs_create_dir_ns+0x80/0x98) [ 2.456719] [] (sysfs_create_dir_ns) from [] (kobject_add_internal+0x9c/0x2d4) [ 2.466124] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.474712] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.482489] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.491085] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.501305] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.511253] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.519838] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.528974] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.537565] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.545981] ---[ end trace 2fc00e213c13ab20 ]--- [ 2.551051] ------------[ cut here ]------------ [ 2.555931] WARNING: CPU: 0 PID: 1 at lib/kobject.c:240 kobject_add_internal+0x254/0x2d4 [ 2.564578] kobject_add_internal failed for cpufreq-dt with -EEXIST, don't try to register things with the same name in the same directory. [ 2.577977] Modules linked in: [ 2.581261] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.13.0-next-20170912 #1 [ 2.590013] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.596437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.604573] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.612172] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.619494] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.627362] [] (warn_slowpath_fmt) from [] (kobject_add_internal+0x254/0x2d4) [ 2.636666] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.645255] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.653027] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.661615] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.671833] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.681779] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.690377] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.699510] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.708106] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.716217] ---[ end trace 2fc00e213c13ab21 ]--- Fixes: edeec420de24 (cpufreq: dt-cpufreq: platdev Automatically create device with OPP v2) Signed-off-by: Suniel Mahesh Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 430edadca527..a753c50e9e41 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -118,6 +118,10 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "sigma,tango4", }, + { .compatible = "ti,am33xx", }, + { .compatible = "ti,am43", }, + { .compatible = "ti,dra7", }, + { } }; -- cgit From 9561475db680f7144d2223a409dd3d7e322aca03 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:40 +0200 Subject: PCI: Fix race condition with driver_override The driver_override implementation is susceptible to a race condition when different threads are reading vs. storing a different driver override. Add locking to avoid the race condition. This is in close analogy to commit 6265539776a0 ("driver core: platform: fix race condition with driver_override") from Adrian Salido. Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") Signed-off-by: Nicolai Stange Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v3.16+ --- drivers/pci/pci-sysfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 1eecfa301f7f..8e075ea2743e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -686,7 +686,7 @@ static ssize_t driver_override_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - char *driver_override, *old = pdev->driver_override, *cp; + char *driver_override, *old, *cp; /* We need to keep extra room for a newline */ if (count >= (PAGE_SIZE - 1)) @@ -700,12 +700,15 @@ static ssize_t driver_override_store(struct device *dev, if (cp) *cp = '\0'; + device_lock(dev); + old = pdev->driver_override; if (strlen(driver_override)) { pdev->driver_override = driver_override; } else { kfree(driver_override); pdev->driver_override = NULL; } + device_unlock(dev); kfree(old); @@ -716,8 +719,12 @@ static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + ssize_t len; - return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_unlock(dev); + return len; } static DEVICE_ATTR_RW(driver_override); -- cgit From b776e4b1a990045a7c70798f1f353c3160c26594 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Sep 2017 20:38:45 -0400 Subject: fix a typo in put_compat_shm_info() "uip" misspelled as "up"; unfortunately, the latter happens to be a function and gcc is happy to convert it to void *... Signed-off-by: Al Viro --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 1e2b1692ba2c..badac463e2c8 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1154,7 +1154,7 @@ static int put_compat_shm_info(struct shm_info *ip, info.shm_swp = ip->shm_swp; info.swap_attempts = ip->swap_attempts; info.swap_successes = ip->swap_successes; - return copy_to_user(up, &info, sizeof(info)); + return copy_to_user(uip, &info, sizeof(info)); } static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in, -- cgit From cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:16 -0700 Subject: xfs: don't unconditionally clear the reflink flag on zero-block files If we have speculative cow preallocations hanging around in the cow fork, don't let a truncate operation clear the reflink flag because if we do then there's a chance we'll forget to free those extents when we destroy the incore inode. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5599dda4727a..4ec5b7f45401 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1624,10 +1624,12 @@ xfs_itruncate_extents( goto out; /* - * Clear the reflink flag if we truncated everything. + * Clear the reflink flag if there are no data fork blocks and + * there are no extents staged in the cow fork. */ - if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { + if (ip->i_d.di_nblocks == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); } -- cgit From 3af423b03435c81036fa710623d3ae92fbe346a3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: xfs: evict CoW fork extents when performing finsert/fcollapse When we perform an finsert/fcollapse operation, cancel all the CoW extents for the affected file offset range so that they don't end up pointing to the wrong blocks. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index cd9a5400ba4f..bc6c6e10a969 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1459,7 +1459,19 @@ xfs_shift_file_space( return error; /* - * The extent shiting code works on extent granularity. So, if + * Clean out anything hanging around in the cow fork now that + * we've flushed all the dirty data out to disk to avoid having + * CoW extents at the wrong offsets. + */ + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, + true); + if (error) + return error; + } + + /* + * The extent shifting code works on extent granularity. So, if * stop_fsb is not the starting block of extent, we need to split * the extent at stop_fsb. */ -- cgit From e150dcd459e1b441eaf08f341a986f04e61bf3b8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Sep 2017 11:34:16 -0700 Subject: fs/xfs: Use %pS printk format for direct addresses Use the %pS instead of the %pF printk format specifier for printing symbols from direct addresses. This is needed for the ia64, ppc64 and parisc64 architectures. Signed-off-by: Helge Deller Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index bd786a9ac2c3..eaf86f55b7f2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -347,7 +347,7 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", __return_address, bp->b_ops->name, bp->b_bn); -- cgit From 64671bafbdd984535aa382bccadd91fbe7be0e80 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:38:58 -0700 Subject: xfs: kill meaningless variable 'zero' In xfs_file_aio_write_checks(), variable 'zero' is there only to satisfy xfs_zero_eof(), the result of it is ignored. Now, with iomap_zero_range() based xfs_zero_eof(), we can safely pass NULL as the last param of it and kill 'zero'. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebdd0bd2b261..261d83f1db76 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -377,8 +377,6 @@ restart: */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { - bool zero = false; - spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { @@ -399,7 +397,7 @@ restart: drained_dio = true; goto restart; } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else -- cgit From d20a5e3851969fa685f118a80e4df670255a4e8d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:39:23 -0700 Subject: xfs: report zeroed or not correctly in xfs_zero_range() The 'did_zero' param of xfs_zero_range() was not passed to iomap_zero_range() correctly. This was introduced by commit 7bb41db3ea16 ("xfs: handle 64-bit length in xfs_iozero"), and found by code inspection. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 261d83f1db76..350b6d43ba23 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -58,7 +58,7 @@ xfs_zero_range( xfs_off_t count, bool *did_zero) { - return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); + return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); } int -- cgit From 1e6fa688bffc0ff419a4c3e78dbaf7aabfb55183 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Mon, 18 Sep 2017 12:03:56 -0700 Subject: xfs: Output warning message when discard option was enabled even though the device does not support discard In order to using discard function, it is necessary that not only xfs is mounted with discard option, but also the discard function is supported by the device. Current code doesn't output any message when users mount with discard option on unsupported device, so it is difficult to notice that it was not enabled actually. This patch adds the warning message to notice that discard option is not enabled due to unsupported device when the filesystem is mounted. Changes in v2 (Suggested by Brian Foster): - Move the unsupported device check into xfs_fs_fill_super(). - Clear the discard flag when device is unsupported. Signed-off-by: Kenjiro Nakayama Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c996f4ae4a5f..584cf2d573ba 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1654,6 +1654,16 @@ xfs_fs_fill_super( "DAX and reflink have not been tested together!"); } + if (mp->m_flags & XFS_MOUNT_DISCARD) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + + if (!blk_queue_discard(q)) { + xfs_warn(mp, "mounting with \"discard\" option, but " + "the device does not support discard"); + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } + } + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, -- cgit From 60915f83cd1e021a66fc1503a446aef5c772553a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 13:38:46 -0700 Subject: xfs: remove redundant re-initialization of total_nr_pages Variable total_nr_pages is being initialized and then updated with the same value, this latter assignment is redundant and can be removed. Cleans up clang build warning: Value stored to 'total_nr_pages' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index da14658da310..2f97c12ca75e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1258,8 +1258,6 @@ xfs_buf_ioapply_map( int size; int offset; - total_nr_pages = bp->b_page_count; - /* skip the pages in the buffer before the start offset */ page_index = 0; offset = *buf_offset; -- cgit From f091fb8c344ce13cbf058d304c6cbb042be97058 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Sep 2017 13:47:23 +0200 Subject: scsi: scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add() During failover there is a small race window between fc_remote_port_add() and fc_timeout_deleted_rport(); the latter drops the lock after setting the port to NOTPRESENT, so if fc_remote_port_add() is called right at that time it will fail to detect the existing rport and happily adding a new structure, causing rports to get registered twice. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index e74fffc32c75..cbd4495d0ff9 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2739,7 +2739,8 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, list_for_each_entry(rport, &fc_host->rports, peers) { - if ((rport->port_state == FC_PORTSTATE_BLOCKED) && + if ((rport->port_state == FC_PORTSTATE_BLOCKED || + rport->port_state == FC_PORTSTATE_NOTPRESENT) && (rport->channel == channel)) { switch (fc_host->tgtid_bind_type) { -- cgit From 4618e90965f272fe522f2af2523a60d0d4bc78f3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:10 +0200 Subject: x86/fpu: Fix fpu__activate_fpstate_read() and update comments fpu__activate_fpstate_read() can be called for the current task when coredumping - or for stopped tasks when ptrace-ing. Implement this properly in the code and update the comments. This also fixes an incorrect (but harmless) warning introduced by one of the earlier patches. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-28-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 93103a909c47..afd3f2a5c64e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -254,18 +254,21 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); /* * This function must be called before we read a task's fpstate. * - * If the task has not used the FPU before then initialize its - * fpstate. + * There's two cases where this gets called: + * + * - for the current task (when coredumping), in which case we have + * to save the latest FPU registers into the fpstate, + * + * - or it's called for stopped tasks (ptrace), in which case the + * registers were already saved by the context-switch code when + * the task scheduled out - we only have to initialize the registers + * if they've never been initialized. * * If the task has used the FPU before then save it. */ void fpu__activate_fpstate_read(struct fpu *fpu) { - /* - * If fpregs are active (in the current CPU), then - * copy them to the fpstate: - */ - if (fpu->fpstate_active) { + if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { if (!fpu->fpstate_active) { -- cgit From 685c930d6e58e31e251ec354f9dca3958a4c5040 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:11 +0200 Subject: x86/fpu: Remove fpu__current_fpstate_write_begin/end() These functions are not used anymore, so remove them. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-29-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 -- arch/x86/kernel/fpu/core.c | 63 ------------------------------------- 2 files changed, 65 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index cf290d424e48..508e4181c4af 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,8 +26,6 @@ extern void fpu__activate_curr(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); -extern void fpu__current_fpstate_write_begin(void); -extern void fpu__current_fpstate_write_end(void); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index afd3f2a5c64e..b2cdeb3b1860 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -316,69 +316,6 @@ void fpu__activate_fpstate_write(struct fpu *fpu) } } -/* - * This function must be called before we write the current - * task's fpstate. - * - * This call gets the current FPU register state and moves - * it in to the 'fpstate'. Preemption is disabled so that - * no writes to the 'fpstate' can occur from context - * swiches. - * - * Must be followed by a fpu__current_fpstate_write_end(). - */ -void fpu__current_fpstate_write_begin(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * Ensure that the context-switching code does not write - * over the fpstate while we are doing our update. - */ - preempt_disable(); - - /* - * Move the fpregs in to the fpu's 'fpstate'. - */ - fpu__activate_fpstate_read(fpu); - - /* - * The caller is about to write to 'fpu'. Ensure that no - * CPU thinks that its fpregs match the fpstate. This - * ensures we will not be lazy and skip a XRSTOR in the - * future. - */ - __fpu_invalidate_fpregs_state(fpu); -} - -/* - * This function must be paired with fpu__current_fpstate_write_begin() - * - * This will ensure that the modified fpstate gets placed back in - * the fpregs if necessary. - * - * Note: This function may be called whether or not an _actual_ - * write to the fpstate occurred. - */ -void fpu__current_fpstate_write_end(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * 'fpu' now has an updated copy of the state, but the - * registers may still be out of date. Update them with - * an XRSTOR if they are active. - */ - if (fpu->fpstate_active) - copy_kernel_to_fpregs(&fpu->state); - - /* - * Our update is done and the fpregs/fpstate are in sync - * if necessary. Context switches can happen again. - */ - preempt_enable(); -} - /* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate -- cgit From e4a81bfcaae1ebbdc6efe74e8ea563144d90e9a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Sep 2017 09:43:36 +0200 Subject: x86/fpu: Rename fpu::fpstate_active to fpu::initialized The x86 FPU code used to have a complex state machine where both the FPU registers and the FPU state context could be 'active' (or inactive) independently of each other - which enabled features like lazy FPU restore. Much of this complexity is gone in the current code: now we basically can have FPU-less tasks (kernel threads) that don't use (and save/restore) FPU state at all, plus full FPU users that save/restore directly with no laziness whatsoever. But the fpu::fpstate_active still carries bits of the old complexity - meanwhile this flag has become a simple flag that shows whether the FPU context saving area in the thread struct is initialized and used, or not. Rename it to fpu::initialized to express this simplicity in the name as well. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-30-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/include/asm/fpu/types.h | 6 +++--- arch/x86/include/asm/trace/fpu.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 24 ++++++++++++------------ arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/regset.c | 6 +++--- arch/x86/kernel/fpu/signal.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/signal.c | 6 +++--- arch/x86/mm/pkeys.c | 2 +- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e0bb46c02857..0e2a5edbce00 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -231,7 +231,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->fpstate_active) { + if (fpu->initialized) { unsigned long fx_aligned, math_size; sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 508e4181c4af..b26ae05da18a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -527,7 +527,7 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpstate_active) { + if (old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -550,7 +550,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active; + new_fpu->initialized; if (preload) { if (!fpregs_state_valid(new_fpu, cpu)) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 71db45ca8870..a1520575d86b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -293,13 +293,13 @@ struct fpu { unsigned int last_cpu; /* - * @fpstate_active: + * @initialized: * - * This flag indicates whether this context is active: if the task + * This flag indicates whether this context is initialized: if the task * is not running then we can restore from this context, if the task * is running then we should save into this context. */ - unsigned char fpstate_active; + unsigned char initialized; /* * @state: diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index da565aae9fd2..39f7a27bef13 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,22 +12,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpstate_active) + __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->fpstate_active = fpu->fpstate_active; + __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpstate_active, + __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b2cdeb3b1860..c8d6032f04d0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpstate_active) { + if (fpu->initialized) { /* * Ignore return value -- we don't care if reg state * is clobbered. @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpstate_active) + if (fpu->initialized) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->fpstate_active) { + if (fpu->initialized) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -191,7 +191,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) + if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -240,13 +240,13 @@ void fpu__activate_curr(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } EXPORT_SYMBOL_GPL(fpu__activate_curr); @@ -271,13 +271,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu) if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } } @@ -303,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == ¤t->thread.fpu); - if (fpu->fpstate_active) { + if (fpu->initialized) { /* Invalidate any lazy state: */ __fpu_invalidate_fpregs_state(fpu); } else { @@ -312,7 +312,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } @@ -354,7 +354,7 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - if (fpu->fpstate_active) { + if (fpu->initialized) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -363,7 +363,7 @@ void fpu__drop(struct fpu *fpu) } } - fpu->fpstate_active = 0; + fpu->initialized = 0; trace_x86_fpu_dropped(fpu); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d5d44c452624..7affb7e3d9a5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - WARN_ON_FPU(current->thread.fpu.fpstate_active); + WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c764f7405322..19e82334e811 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -16,14 +16,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r { struct fpu *target_fpu = &target->thread.fpu; - return target_fpu->fpstate_active ? regset->n : 0; + return target_fpu->initialized ? regset->n : 0; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) return regset->n; else return 0; @@ -380,7 +380,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) struct fpu *fpu = &tsk->thread.fpu; int fpvalid; - fpvalid = fpu->fpstate_active; + fpvalid = fpu->initialized; if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index da68ea1c3a44..ab2dd24cfea4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->fpstate_active || using_compacted_format()) { + if (fpu->initialized || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; @@ -315,12 +315,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int err = 0; /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures + * Drop the current fpu which clears fpu->initialized. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. + * fpu->initialized is again set. */ fpu__drop(fpu); @@ -342,7 +342,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } - fpu->fpstate_active = 1; + fpu->initialized = 1; preempt_disable(); fpu__restore(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fda1109cc355..703e76d027ee 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -867,7 +867,7 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->fpstate_active) + if (!fpu->initialized) return NULL; /* * fpu__save() takes the CPU's xstate registers diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e04442345fc0..4e188fda5961 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -263,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->fpstate_active) { + if (fpu->initialized) { sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -279,7 +279,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->fpstate_active && + if (fpu->initialized && copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -755,7 +755,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->fpstate_active) + if (fpu->initialized) fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 4d24269c071f..d7bc0eea20a5 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -44,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.fpstate_active && + current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; -- cgit From 7f1487c59b7c6dcb20155f4302985da2659a2997 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:13 +0200 Subject: x86/fpu: Fix stale comments about lazy FPU logic We don't do any lazy restore anymore, what we have are two pieces of optimization: - no-FPU tasks that don't save/restore the FPU context (kernel threads are such) - cached FPU registers maintained via the fpu->last_cpu field. This means that if an FPU task context switches to a non-FPU task then we can maintain the FPU registers as an in-FPU copies (cache), and skip the restoration of them once we switch back to the original FPU-using task. Update all the comments that still referred to old 'lazy' and 'unlazy' concepts. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-31-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c8d6032f04d0..77668d91fdc1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -205,9 +205,6 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * In lazy mode, if the FPU context isn't loaded into - * fpregs, CR0.TS will be set and do_device_not_available - * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -285,13 +282,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu) /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then unlazy it. + * If the task has used the FPU before then invalidate any cached FPU registers. * If the task has not used the FPU before then initialize its fpstate. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we - * didn't clear its lazy status here then the lazy in-registers + * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ @@ -304,7 +301,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) WARN_ON_FPU(fpu == ¤t->thread.fpu); if (fpu->initialized) { - /* Invalidate any lazy state: */ + /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); -- cgit From e10078eba69859359ce8644dd423b4132a6a8913 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:14 +0200 Subject: x86/fpu: Simplify and speed up fpu__copy() fpu__copy() has a preempt_disable()/enable() pair, which it had to do to be able to atomically unlazy the current task when doing an FNSAVE. But we don't unlazy tasks anymore, we always do direct saves/restores of FPU context. So remove both the unnecessary critical section, and update the comments. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-32-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 77668d91fdc1..52122dd418ae 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -206,22 +206,13 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. * - * We have to do all this with preemption disabled, - * mostly because of the FNSAVE case, because in that - * case we must not allow preemption in the window - * between the FNSAVE and us marking the context lazy. - * - * It shouldn't be an issue as even FNSAVE is plenty - * fast in terms of critical section length. + * ( The function 'fails' in the FNSAVE case, which destroys + * register contents so we have to copy them back. ) */ - preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, - fpu_kernel_xstate_size); - + memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); copy_kernel_to_fpregs(&src_fpu->state); } - preempt_enable(); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); -- cgit From 2ce03d850b9a2f17d55596ecfa86e72b5687a627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:15 +0200 Subject: x86/fpu: Rename fpu__activate_curr() to fpu__initialize() Rename this function to better express that it's all about initializing the FPU state of a task which goes hand in hand with the fpu::initialized field. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-33-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b26ae05da18a..7c980aafb8aa 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -23,7 +23,7 @@ /* * High level FPU state handling functions: */ -extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__initialize(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 52122dd418ae..07db9d94b68b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -224,7 +224,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__activate_curr(struct fpu *fpu) +void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); @@ -237,7 +237,7 @@ void fpu__activate_curr(struct fpu *fpu) fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__activate_curr); +EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. @@ -316,7 +316,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ void fpu__restore(struct fpu *fpu) { - fpu__activate_curr(fpu); + fpu__initialize(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -392,7 +392,7 @@ void fpu__clear(struct fpu *fpu) */ if (static_cpu_has(X86_FEATURE_FPU)) { preempt_disable(); - fpu__activate_curr(fpu); + fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); preempt_enable(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ab2dd24cfea4..7fa3bdb331e9 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -280,7 +280,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd17b7d9a107..03869eb7fcd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7225,7 +7225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index d4a7df2205b8..220638a4cb94 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -114,7 +114,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - fpu__activate_curr(fpu); + fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { -- cgit From 369a036de206710ff27a66f9bffe78ef657648c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 13:37:45 +0200 Subject: x86/fpu: Rename fpu__activate_fpstate_read/write() to fpu__prepare_[read|write]() As per the new nomenclature we don't 'activate' the FPU state anymore, we initialize it. So drop the _activate_fpstate name from these functions, which were a bit of a mouthful anyway, and name them: fpu__prepare_read() fpu__prepare_write() Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/regset.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7c980aafb8aa..e3221ffa304e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -24,8 +24,8 @@ * High level FPU state handling functions: */ extern void fpu__initialize(struct fpu *fpu); -extern void fpu__activate_fpstate_read(struct fpu *fpu); -extern void fpu__activate_fpstate_write(struct fpu *fpu); +extern void fpu__prepare_read(struct fpu *fpu); +extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 07db9d94b68b..f92a6593de1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(fpu__initialize); * * If the task has used the FPU before then save it. */ -void fpu__activate_fpstate_read(struct fpu *fpu) +void fpu__prepare_read(struct fpu *fpu) { if (fpu == ¤t->thread.fpu) { fpu__save(fpu); @@ -283,7 +283,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu) * state pending on its former CPU could be restored, corrupting * the modifications. */ -void fpu__activate_fpstate_write(struct fpu *fpu) +void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 19e82334e811..ee8d2f049818 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -38,7 +38,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -55,7 +55,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -89,7 +89,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (using_compacted_format()) { if (kbuf) @@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) @@ -310,7 +310,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -340,7 +340,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) -- cgit From e63e5d5c15c6b1dba26f7cbd1b1089a1d6155db5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:04 +0200 Subject: x86/fpu: Introduce validate_xstate_header() Move validation of user-supplied xstate_header into a helper function, in preparation of calling it from both the ptrace and sigreturn syscall paths. The new function also considers it to be an error if *any* reserved bits are set, whereas before we were just clearing most of them silently. This should reduce the chance of bugs that fail to correctly validate user-supplied XSAVE areas. It also will expose any broken userspace programs that set the other reserved bits; this is desirable because such programs will lose compatibility with future CPUs and kernels if those bits are ever used for anything. (There shouldn't be any such programs, and in fact in the case where the compacted format is in use we were already validating xfeatures. But you never know...) Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-2-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++++ arch/x86/kernel/fpu/xstate.c | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 579ac2358e63..83fee2469eb7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -52,4 +52,8 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); + +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +extern int validate_xstate_header(const struct xstate_header *hdr); + #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 703e76d027ee..2427aeea33b5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -483,6 +483,30 @@ int using_compacted_format(void) return boot_cpu_has(X86_FEATURE_XSAVES); } +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +int validate_xstate_header(const struct xstate_header *hdr) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + static void __xstate_dump_leaves(void) { int i; -- cgit From cf9df81b139b6ebaec188d73758f02ca3b2110e4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:05 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in xstateregs_set() Tighten the checks in xstateregs_set(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ee8d2f049818..b831d5b9de99 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -141,27 +141,20 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - - /* xcomp_bv must be 0 when using uncompacted format */ - if (!ret && xsave->header.xcomp_bv) - ret = -EINVAL; + if (!ret) + ret = validate_xstate_header(&xsave->header); } - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); - /* * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xfeatures &= xfeatures_mask; + /* - * These bits must be zero. + * In case of failure, mark all states as init: */ - memset(&xsave->header.reserved, 0, 48); + if (ret) + fpstate_init(&fpu->state); return ret; } -- cgit From b11e2e18a7fc8eaa3d592c260d50c7129e094ded Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:06 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in __fpu__restore_sig() Tighten the checks in __fpu__restore_sig() and update comments. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/signal.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7fa3bdb331e9..fb639e70048f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -214,8 +214,11 @@ sanitize_restored_xstate(struct task_struct *tsk, struct xstate_header *header = &xsave->header; if (use_xsave()) { - /* These bits must be zero. */ - memset(header->reserved, 0, 48); + /* + * Note: we don't need to zero the reserved bits in the + * xstate_header here because we either didn't copy them at all, + * or we checked earlier that they aren't set. + */ /* * Init the state that is not present in the memory @@ -224,7 +227,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= (xfeatures_mask & xfeatures); + header->xfeatures &= xfeatures; } if (use_fxsr()) { @@ -308,7 +311,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) /* * For 32-bit frames with fxstate, copy the user state to the * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. + * header. Validate and sanitize the copied state. */ struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; @@ -329,9 +332,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - /* xcomp_bv must be 0 when using uncompacted format */ - if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) - err = -EINVAL; + if (!err && state_size > offsetof(struct xregs_state, header)) + err = validate_xstate_header(&fpu->state.xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { -- cgit From 80d8ae86b36791a545ca28ddc95133ea59bba6e0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:07 +0200 Subject: x86/fpu: Copy the full state_header in copy_kernel_to_xstate() This is in preparation to verify the full xstate header as supplied by user-space. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-5-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2427aeea33b5..02591b96bb25 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1148,11 +1148,13 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) int i; u64 xfeatures; u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - memcpy(&xfeatures, kbuf + offset, size); + memcpy(&hdr, kbuf + offset, size); + xfeatures = hdr.xfeatures; /* * Reject if the user sets any disabled or supervisor features: -- cgit From b89eda482d7849a1c146b6d0a42f4e76369bb08e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:08 +0200 Subject: x86/fpu: Eliminate the 'xfeatures' local variable in copy_kernel_to_xstate() We have this information in the xstate_header. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-6-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 02591b96bb25..c97c4a9db52a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1146,7 +1146,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 xfeatures; u64 allowed_features; struct xstate_header hdr; @@ -1154,20 +1153,19 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) size = sizeof(hdr); memcpy(&hdr, kbuf + offset, size); - xfeatures = hdr.xfeatures; /* * Reject if the user sets any disabled or supervisor features: */ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - if (xfeatures & ~allowed_features) + if (hdr.xfeatures & ~allowed_features) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; @@ -1177,7 +1175,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } - if (xfeatures_mxcsr_quirk(xfeatures)) { + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; memcpy(&xsave->i387.mxcsr, kbuf + offset, size); @@ -1192,7 +1190,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } -- cgit From af95774b3ca080b0e1e651c0fc7680f3444ddda7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:09 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_kernel_to_xstate() Tighten the checks in copy_kernel_to_xstate(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-7-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c97c4a9db52a..325db7850335 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1138,15 +1138,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * and copy to the target thread. This is called from xstateregs_set(). */ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 allowed_features; struct xstate_header hdr; offset = offsetof(struct xregs_state, header); @@ -1154,12 +1151,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - /* - * Reject if the user sets any disabled or supervisor features: - */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - - if (hdr.xfeatures & ~allowed_features) + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- cgit From af2c4322d986a08a6e793b74b83a62b325019c20 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:10 +0200 Subject: x86/fpu: Copy the full header in copy_user_to_xstate() This is in preparation to verify the full xstate header as supplied by user-space. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-8-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 325db7850335..0cd7b73c25e8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1199,13 +1199,16 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) int i; u64 xfeatures; u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - if (__copy_from_user(&xfeatures, ubuf + offset, size)) + if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; + xfeatures = hdr.xfeatures; + /* * Reject if the user sets any disabled or supervisor features: */ -- cgit From 3d703477bcfe8bb57079d97198cf1e342fe1fef9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:11 +0200 Subject: x86/fpu: Eliminate the 'xfeatures' local variable in copy_user_to_xstate() We now have this field in hdr.xfeatures. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-9-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0cd7b73c25e8..b6d78b78b5c2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1197,7 +1197,6 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; - u64 xfeatures; u64 allowed_features; struct xstate_header hdr; @@ -1207,20 +1206,18 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - xfeatures = hdr.xfeatures; - /* * Reject if the user sets any disabled or supervisor features: */ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - if (xfeatures & ~allowed_features) + if (hdr.xfeatures & ~allowed_features) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; @@ -1231,7 +1228,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } - if (xfeatures_mxcsr_quirk(xfeatures)) { + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) @@ -1247,7 +1244,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } -- cgit From 98c0fad9d60e8b2cd47e15b7bee7df343648f5bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:12 +0200 Subject: x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_user_to_xstate() Tighten the checks in copy_user_to_xstate(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-10-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b6d78b78b5c2..f1d5476c9022 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1188,16 +1188,15 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } /* - * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * Convert from a ptrace or sigreturn standard-format user-space buffer to + * kernel XSAVES format and copy to the target thread. This is called from + * xstateregs_set(), as well as potentially from the sigreturn() and + * rt_sigreturn() system calls. */ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; - u64 allowed_features; struct xstate_header hdr; offset = offsetof(struct xregs_state, header); @@ -1206,12 +1205,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - /* - * Reject if the user sets any disabled or supervisor features: - */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - - if (hdr.xfeatures & ~allowed_features) + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- cgit From 738f48cb5fdd5878d11934f1898aa2bcf1578289 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:13 +0200 Subject: x86/fpu: Use using_compacted_format() instead of open coded X86_FEATURE_XSAVES This is the canonical method to use. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-11-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b831d5b9de99..3ea151372389 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -134,7 +134,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__prepare_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (using_compacted_format()) { if (kbuf) ret = copy_kernel_to_xstate(xsave, kbuf); else -- cgit From a98c75fcd0ec02623f4f56d824d76e659410a52b Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Aug 2017 19:13:26 +0200 Subject: drm/tegra: trace: Fix path to include The TRACE_INCLUDE_FILE macro needs to specify the path relative to the define_trace.h header rather than relative to the file defining it. Reported-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20170823171326.23620-1-thierry.reding@gmail.com --- drivers/gpu/drm/tegra/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tegra/trace.h b/drivers/gpu/drm/tegra/trace.h index e9b7cdad5c4c..5a1ab4046e92 100644 --- a/drivers/gpu/drm/tegra/trace.h +++ b/drivers/gpu/drm/tegra/trace.h @@ -63,6 +63,6 @@ DEFINE_EVENT(register_access, sor_readl, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/tegra #define TRACE_INCLUDE_FILE trace #include -- cgit From ff40adf7fbdff96860b1153332c0b1c7bab6e0c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 24 Aug 2017 18:19:48 -0600 Subject: Btrfs: use the new helper wbc_to_write_flags This updates btrfs to use the helper wbc_to_write_flags which has been applied in ext4/xfs/f2fs/block. Please note that, with this, btrfs's dirty pages written by a writeback job will carry the flag REQ_BACKGROUND, which is currently used by writeback-throttle to determine whether it should go to get a request or wait. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d17783d70228..4ead6da5a645 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3471,8 +3471,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned int write_flags = 0; unsigned long nr_written = 0; - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = REQ_SYNC; + write_flags = wbc_to_write_flags(wbc); trace___extent_writepage(page, inode, wbc); @@ -3718,7 +3717,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); -- cgit From 5f14efd3d437205143dcffcf776e0122eae1755a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 23 Aug 2017 12:15:09 -0600 Subject: Btrfs: do not reset bio->bi_ops while writing bio flush_epd_write_bio() sets bio->bi_opf by itself to honor REQ_SYNC, but it's not needed at all since bio->bi_opf has set up properly in both __extent_writepage() and write_one_eb(), and in the case of write_one_eb(), it also sets REQ_META, which we will lose in flush_epd_write_bio(). This remove this unnecessary bio->bi_opf setting. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ead6da5a645..3738d245518c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4062,9 +4062,6 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? REQ_SYNC : 0); - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; -- cgit From bea7eafdbda3ba1d4b2ccb9cca829eefb7989bb9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 22 Aug 2017 23:46:00 -0700 Subject: Btrfs: fix incorrect {node,sector}size endianness from BTRFS_IOC_FS_INFO fs_info->super_copy->{node,sector}size are little-endian, but the ioctl should return the values in native endianness. Use the cached values in btrfs_fs_info instead. Found with sparse. Fixes: 80a773fbfc2d ("btrfs: retrieve more info from FS_INFO ioctl") Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ae8fbf9d3de2..cf1c2ee030dd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2769,9 +2769,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } mutex_unlock(&fs_devices->device_list_mutex); - fi_args->nodesize = fs_info->super_copy->nodesize; - fi_args->sectorsize = fs_info->super_copy->sectorsize; - fi_args->clone_alignment = fs_info->super_copy->sectorsize; + fi_args->nodesize = fs_info->nodesize; + fi_args->sectorsize = fs_info->sectorsize; + fi_args->clone_alignment = fs_info->sectorsize; if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; -- cgit From 63d71450c8d817649a79e37d685523f988b9cc98 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 1 Sep 2017 17:58:47 +0900 Subject: btrfs: clear ordered flag on cleaning up ordered extents Commit 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") introduced btrfs_cleanup_ordered_extents() to cleanup submitted ordered extents. However, it does not clear the ordered bit (Private2) of corresponding pages. Thus, the following BUG occurs from free_pages_check_bad() (on btrfs/125 with nospace_cache). BUG: Bad page state in process btrfs pfn:3fa787 page:ffffdf2acfe9e1c0 count:0 mapcount:0 mapping: (null) index:0xd flags: 0x8000000000002008(uptodate|private_2) raw: 8000000000002008 0000000000000000 000000000000000d 00000000ffffffff raw: ffffdf2acf5c1b20 ffffb443802238b0 0000000000000000 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x2000(private_2) This patch clears the flag same as other places calling btrfs_dec_test_ordered_pending() for every page in the specified range. Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Cc: # 4.12 Signed-off-by: Naohiro Aota Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d184a46e46c4..455c0f22fe2d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, const u64 offset, const u64 bytes) { + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + index++; + if (!page) + continue; + ClearPagePrivate2(page); + put_page(page); + } return __endio_write_update_ordered(inode, offset + PAGE_SIZE, bytes - PAGE_SIZE, false); } -- cgit From 67c003f90fd68062d92a7ffade36f9b2a9098bd8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 1 Sep 2017 17:59:07 +0900 Subject: btrfs: finish ordered extent cleaning if no progress is found __endio_write_update_ordered() repeats the search until it reaches the end of the specified range. This works well with direct IO path, because before the function is called, it's ensured that there are ordered extents filling whole the range. It's not the case, however, when it's called from run_delalloc_range(): it is possible to have error in the midle of the loop in e.g. run_delalloc_nocow(), so that there exisits the range not covered by any ordered extents. By cleaning such "uncomplete" range, __endio_write_update_ordered() stucks at offset where there're no ordered extents. Since the ordered extents are created from head to tail, we can stop the search if there are no offset progress. Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Cc: # 4.12 Signed-off-by: Naohiro Aota Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 455c0f22fe2d..f78c5640c6dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8396,6 +8396,7 @@ static void __endio_write_update_ordered(struct inode *inode, btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; + u64 last_offset; int ret; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -8407,6 +8408,7 @@ static void __endio_write_update_ordered(struct inode *inode, } again: + last_offset = ordered_offset; ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, @@ -8417,6 +8419,12 @@ again: btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered->work); out_test: + /* + * If btrfs_dec_test_ordered_pending does not find any ordered extent + * in the range, we can exit. + */ + if (ordered_offset == last_offset) + return; /* * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again -- cgit From bb166d7207432d3c7d10c45dc052f12ba3a2121d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 25 Aug 2017 14:15:14 +0900 Subject: btrfs: fix NULL pointer dereference from free_reloc_roots() __del_reloc_root should be called before freeing up reloc_root->node. If not, calling __del_reloc_root() dereference reloc_root->node, causing the system BUG. Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error") Cc: # 4.9 Signed-off-by: Naohiro Aota Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3a49a3c2fca4..9841faef08ea 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2400,11 +2400,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } -- cgit From ca6842bf01dc1ad41195eac1e343b4f08c496ba8 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Fri, 22 Jan 2016 09:13:25 +0900 Subject: Btrfs: send: fix error number for unknown inode types ENOTSUPP should not be returned to the user program. (cf. include/linux/errno.h) Therefore, EOPNOTSUPP is used instead of ENOTSUPP. Signed-off-by: Tsutomu Itoh Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f1d3d6e7087..43430e6c99aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2640,7 +2640,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) } else { btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; goto out; } -- cgit From 6d6d282932d1a609e60dc4467677e0e863682f57 Mon Sep 17 00:00:00 2001 From: satoru takeuchi Date: Tue, 12 Sep 2017 22:42:52 +0900 Subject: btrfs: prevent to set invalid default subvolid `btrfs sub set-default` succeeds to set an ID which isn't corresponding to any fs/file tree. If such the bad ID is set to a filesystem, we can't mount this filesystem without specifying `subvol` or `subvolid` mount options. Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol") Cc: Signed-off-by: Satoru Takeuchi Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cf1c2ee030dd..d4a77993e52f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4057,6 +4057,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { -- cgit From 78ad4ce014d025f41b8dde3a81876832ead643cf Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 8 Sep 2017 17:48:55 +0900 Subject: btrfs: propagate error to btrfs_cmp_data_prepare caller btrfs_cmp_data_prepare() (almost) always returns 0 i.e. ignoring errors from gather_extent_pages(). While the pages are freed by btrfs_cmp_data_free(), cmp->num_pages still has > 0. Then, btrfs_extent_same() try to access the already freed pages causing faults (or violates PageLocked assertion). This patch just return the error as is so that the caller stop the process. Signed-off-by: Naohiro Aota Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage") Cc: # 4.2 Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4a77993e52f..802df5755cd3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3028,7 +3028,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -- cgit From c2faff790ccd11ea5be8e3ca99713f116fcd6030 Mon Sep 17 00:00:00 2001 From: "Misono, Tomohiro" Date: Wed, 30 Aug 2017 16:33:16 +0900 Subject: btrfs: remove BTRFS_FS_QUOTA_DISABLING flag Currently, "btrfs quota enable" would fail after "btrfs quota disable" on the first time with syslog output "qgroup_rescan_init failed with -22", but it would succeed on the second time. When "quota disable" is called, BTRFS_FS_QUOTA_DISABLING flag bit will be set in fs_info->flags in btrfs_quota_disable(), but it will not be droppd in btrfs_run_qgroups() (which is called in btrfs_commit_transaction()) because quota_root has already been freed. If "quota enable" is called after that, both BTRFS_FS_QUOTA_DISABLING and BTRFS_FS_QUOTA_ENABLED flag would be dropped in the btrfs_run_qgroups() since quota_root is not NULL. This leads to the failure of "quota enable" on the first time. BTRFS_FS_QUOTA_DISABLING flag is not used outside of "quota disable" context and is equivalent to whether quota_root is NULL or not. btrfs_run_qgroups() checks whether quota_root is NULL or not in the first place. So, let's remove BTRFS_FS_QUOTA_DISABLING flag. Signed-off-by: Tomohiro Misono Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/qgroup.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2add002662f4..b7ccfcc01732 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -708,7 +708,6 @@ struct btrfs_delayed_root; #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 #define BTRFS_FS_QUOTA_ENABLING 7 -#define BTRFS_FS_QUOTA_DISABLING 8 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5c8b61c86e61..770f667269f5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -807,7 +807,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, } ret = 0; out: - set_bit(BTRFS_FS_QUOTA_DISABLING, &root->fs_info->flags); btrfs_free_path(path); return ret; } @@ -953,7 +952,6 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, if (!fs_info->quota_root) goto out; clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - set_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2086,8 +2084,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { -- cgit From fed3b381145e2e7c66b0b3f640851e1633ebd07f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 13 Sep 2017 12:25:21 -0600 Subject: Btrfs: do not backup tree roots when fsync It doesn't make sense to backup tree roots when doing fsync, since during fsync those tree roots have not been consistent on disk. Signed-off-by: Liu Bo Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 27d458640536..0f2271815eb6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3641,7 +3641,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); - backup_super_roots(fs_info); + + /* + * max_mirrors == 0 indicates we're from commit_transaction, + * not from fsync where the tree roots in fs_info have not + * been consistent on disk. + */ + if (max_mirrors == 0) + backup_super_roots(fs_info); sb = fs_info->super_for_commit; dev_item = &sb->dev_item; -- cgit From bd7d63c2ceaf737eeb21630a2b62fc5fe34dba29 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 19 Sep 2017 17:50:09 -0600 Subject: Btrfs: use btrfs_op instead of bio_op in __btrfs_map_block This seems to be a leftover of commit cf8cddd38bab ("btrfs: don't abuse REQ_OP_* flags for btrfs_map_block"). It should use btrfs_op() helper to provide one of 'enum btrfs_map_op' types. Fixes: cf8cddd38bab ("btrfs: don't abuse REQ_OP_* flags for btrfs_map_block") Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d024f1b07282..6a72f88f77b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6166,7 +6166,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, bio_op(bio), logical, + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); -- cgit From cf1167d5c1abf3bc42b2a1562bfa7937c05337e2 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Sep 2017 17:50:18 -0600 Subject: Btrfs: fix kernel oops while reading compressed data The kernel oops happens at kernel BUG at fs/btrfs/extent_io.c:2104! ... RIP: clean_io_failure+0x263/0x2a0 [btrfs] It's showing that read-repair code is using an improper mirror index. This is due to the fact that compression read's endio hasn't recorded the failed mirror index in %cb->orig_bio. With this, btrfs's read-repair can work properly on reading compressed data. Signed-off-by: Liu Bo Reported-by: Paul Jones Tested-by: Paul Jones Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 883ecc58fd0d..c6aa53c4c102 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -107,6 +107,7 @@ static void end_compressed_bio_read(struct bio *bio) struct inode *inode; struct page *page; unsigned long index; + unsigned int mirror = btrfs_io_bio(bio)->mirror_num; int ret; if (bio->bi_status) @@ -118,6 +119,14 @@ static void end_compressed_bio_read(struct bio *bio) if (!refcount_dec_and_test(&cb->pending_bios)) goto out; + /* + * Record the correct mirror_num in cb->orig_bio so that + * read-repair can work properly. + */ + ASSERT(btrfs_io_bio(cb->orig_bio)); + btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + cb->mirror_num = mirror; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); -- cgit From e6311f240c946788131ba2b97e14f37312688072 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Sep 2017 17:50:19 -0600 Subject: Btrfs: skip checksum when reading compressed data if some IO have failed Currently even if the underlying disk reports failure on IO, compressed read endio still gets to verify checksum and reports it as a checksum error. In fact, if some IO have failed during reading a compressed data extent , there's no way the checksum could match, therefore, we can skip that in order to return error quickly to the upper layer. Please note that we need to do this after recording the failed mirror index so that read-repair in the upper layer's endio can work properly. Signed-off-by: Liu Bo Tested-by: Paul Jones Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6aa53c4c102..fc31af98a41b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -108,7 +108,7 @@ static void end_compressed_bio_read(struct bio *bio) struct page *page; unsigned long index; unsigned int mirror = btrfs_io_bio(bio)->mirror_num; - int ret; + int ret = 0; if (bio->bi_status) cb->errors = 1; @@ -127,6 +127,13 @@ static void end_compressed_bio_read(struct bio *bio) btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; + /* + * Some IO in this cb have failed, just skip checksum as there + * is no way it could be correct. + */ + if (cb->errors == 1) + goto csum_failed; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); -- cgit From 36b96fdc6b2dc6f4a0fedc563fa7508c91b90a10 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sun, 17 Sep 2017 09:02:29 +0000 Subject: btrfs: Report error on removing qgroup if del_qgroup_item fails Previously, we were calling del_qgroup_item, and ignoring the return code resulting in a potential to have divergent in-memory state without an error. Perhaps, it makes sense to handle this error code, and put the filesystem into a read only, or similar state. This patch only adds reporting of the error if the error is fatal, (any error other than qgroup not found). Signed-off-by: Sargun Dhillon Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 770f667269f5..e172d4843eae 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1305,6 +1305,8 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, } } ret = del_qgroup_item(trans, quota_root, qgroupid); + if (ret && ret != -ENOENT) + goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, -- cgit From 99c4e3b96c797f047be4e6b7c03cfca01959f146 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Sep 2017 15:06:51 -0600 Subject: Btrfs: fix unexpected result when dio reading corrupted blocks commit 4246a0b63bd8 ("block: add a bi_error field to struct bio") changed the logic of how dio read endio reports errors. For single stripe dio read, %bio->bi_status reflects the error before verifying checksum, and now we're updating it when data block matches with its checksum, while in the mismatching case, %bio->bi_status is not updated to relfect that. When some blocks in a file have been corrupted on disk, reading such a file ends up with 1) checksum errors are reported in kernel log 2) read(2) returns successfully with some content being 0x01. In order to fix it, we need to report its checksum mismatch error to the upper layer (dio layer in this case) as well. Fixes: 4246a0b63bd8 ("block: add a bi_error field to struct bio") Signed-off-by: Liu Bo Reported-by: Goffredo Baroncelli Tested-by: Goffredo Baroncelli Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f78c5640c6dc..c242d0230db9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8366,11 +8366,8 @@ static void btrfs_endio_direct_read(struct bio *bio) struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); - if (!err) - bio->bi_status = 0; - } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8378,7 +8375,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_status = bio->bi_status; + dio_bio->bi_status = err; dio_end_io(dio_bio); if (io_bio->end_io) -- cgit From 8c6c592831a09a28428448e68fb08c6bbb8b9b8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Aug 2017 10:11:39 -0400 Subject: btrfs: log csums for all modified extents Amir reported a bug discovered by his cleaned up version of my dm-log-writes xfstests where we were missing csums at certain replay points. This is because fsx was doing an msync(), which essentially fsync()'s a specific range of a file. We will log all modified extents, but only search for the checksums in the range we are being asked to sync. We cannot simply log the extents in the range we're being asked because we are logging the inode item as it is currently, which if it has had a i_size update before the msync means we will miss extents when replaying. We could possibly get around this by marking the inode with the transaction that extended the i_size to see if we have this case, but this would be racy and we'd have to lock the whole range of the inode to make sure we didn't have an ordered extent outside of our range that was in the middle of completing. Fix this simply by keeping track of the modified extents range and logging the csums for the entire range of extents that we are logging. This makes the xfstest pass. Reported-by: Amir Goldstein Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ad7f4bab640b..c800d067fcbf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4181,6 +4181,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; + u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4190,10 +4191,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; + logged_start = start; + logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); - /* * Just an arbitrary number, this can be really CPU intensive * once we start getting a lot of extents, and really once we @@ -4208,6 +4210,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + + if (em->start < logged_start) + logged_start = em->start; + if ((em->start + em->len - 1) > logged_end) + logged_end = em->start + em->len - 1; + /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -4216,7 +4224,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, start, end); + btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* * Some ordered extents started by fsync might have completed * before we could collect them into the list logged_list, which -- cgit From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 18 Sep 2017 14:46:03 -0700 Subject: xfs: validate bdev support for DAX inode flag Currently only the blocksize is checked, but we should really be calling bdev_dax_supported() which also tests to make sure we can get a struct dax_device and that the dax_direct_access() path is working. This is the same check that we do for the "-o dax" mount option in xfs_fs_fill_super(). This does not fix the race issues that caused the XFS DAX inode option to be disabled, so that option will still be disabled. If/when we re-enable it, though, I think we will want this issue to have been fixed. I also do think that we want to fix this in stable kernels. Signed-off-by: Ross Zwisler CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5049e8ab6e30..aa75389be8cf 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate( int *join_flags) { struct inode *inode = VFS_I(ip); + struct super_block *sb = inode->i_sb; int error; *join_flags = 0; @@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + if (bdev_dax_supported(sb, sb->s_blocksize) < 0) return -EINVAL; } -- cgit From 546e7be8244dc050effef0555df5b8d94d10dafc Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Fri, 22 Sep 2017 11:47:33 -0700 Subject: iomap_dio_rw: Allocate AIO completion queue before submitting dio Executing xfs/104 test in a loop on Linux-v4.13 kernel on a ppc64 machine can cause the following NULL pointer dereference, .queue_work_on+0x4c/0x80 .iomap_dio_bio_end_io+0xbc/0x1f0 .bio_endio+0x118/0x1f0 .blk_update_request+0xd0/0x470 .blk_mq_end_request+0x24/0xc0 .lo_complete_rq+0x40/0xe0 .__blk_mq_complete_request_remote+0x28/0x40 .flush_smp_call_function_queue+0xc4/0x1e0 .smp_ipi_demux_relaxed+0x8c/0x100 .icp_hv_ipi_action+0x54/0xa0 .__handle_irq_event_percpu+0x84/0x2c0 .handle_irq_event_percpu+0x28/0x80 .handle_percpu_irq+0x78/0xc0 .generic_handle_irq+0x40/0x70 .__do_irq+0x88/0x200 .call_do_irq+0x14/0x24 .do_IRQ+0x84/0x130 This occurs due to the following sequence of events, 1. Allocate dio for Direct I/O write. 2. Invoke iomap_apply() until iov_iter_count() bytes have been submitted. - Assume that we have submitted atleast one bio. Hence iomap_dio->ref value will be >= 2. - If during the second iteration, iomap_apply() ends up returning -ENOSPC, we would break out of the loop and since the 'ret' value is a negative number we end up not allocating memory for super_block->s_dio_done_wq. 3. Meanwhile, iomap_dio_bio_end_io() is invoked for bios that have been submitted and here the code ends up dereferencing the NULL pointer stored at super_block->s_dio_done_wq. This commit fixes the bug by allocating memory for super_block->s_dio_done_wq before iomap_apply() is invoked. Reported-by: Eryu Guan Reviewed-by: Christoph Hellwig Tested-by: Eryu Guan Signed-off-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..d4f526a3f5b2 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -993,6 +993,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, WARN_ON_ONCE(ret); ret = 0; + if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; + } + inode_dio_begin(inode); blk_start_plug(&plug); @@ -1015,13 +1022,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); - if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && - !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - iomap_dio_set_error(dio, ret); - } - if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; -- cgit From ee70daaba82d70766d0723b743d9fdeb3b06102a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 21 Sep 2017 11:26:18 -0700 Subject: xfs: update i_size after unwritten conversion in dio completion Since commit d531d91d6990 ("xfs: always use unwritten extents for direct I/O writes"), we start allocating unwritten extents for all direct writes to allow appending aio in XFS. But for dio writes that could extend file size we update the in-core inode size first, then convert the unwritten extents to real allocations at dio completion time in xfs_dio_write_end_io(). Thus a racing direct read could see the new i_size and find the unwritten extents first and read zeros instead of actual data, if the direct writer also takes a shared iolock. Fix it by updating the in-core inode size after the unwritten extent conversion. To do this, introduce a new boolean argument to xfs_iomap_write_unwritten() to tell if we want to update in-core i_size or not. Suggested-by: Brian Foster Reviewed-by: Brian Foster Signed-off-by: Eryu Guan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 3 ++- fs/xfs/xfs_file.c | 33 +++++++++++++++++++-------------- fs/xfs/xfs_iomap.c | 7 +++++-- fs/xfs/xfs_iomap.h | 2 +- fs/xfs/xfs_pnfs.c | 2 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29172609f2a3..f18e5932aec4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -343,7 +343,8 @@ xfs_end_io( error = xfs_reflink_end_cow(ip, offset, size); break; case XFS_IO_UNWRITTEN: - error = xfs_iomap_write_unwritten(ip, offset, size); + /* writeback should never update isize */ + error = xfs_iomap_write_unwritten(ip, offset, size, false); break; default: ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 350b6d43ba23..309e26c9dddb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -434,7 +434,6 @@ xfs_dio_write_end_io( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; - bool update_size = false; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); @@ -445,6 +444,21 @@ xfs_dio_write_end_io( if (size <= 0) return size; + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & IOMAP_DIO_UNWRITTEN) + return xfs_iomap_write_unwritten(ip, offset, size, true); + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We @@ -459,20 +473,11 @@ xfs_dio_write_end_io( spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); - update_size = true; - } - spin_unlock(&ip->i_flags_lock); - - if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); - if (error) - return error; - } - - if (flags & IOMAP_DIO_UNWRITTEN) - error = xfs_iomap_write_unwritten(ip, offset, size); - else if (update_size) + spin_unlock(&ip->i_flags_lock); error = xfs_setfilesize(ip, offset, size); + } else { + spin_unlock(&ip->i_flags_lock); + } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a1909bc064e9..f179bdf1644d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -829,7 +829,8 @@ int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -840,6 +841,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct xfs_defer_ops dfops; + struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; uint resblks; int error; @@ -899,7 +901,8 @@ xfs_iomap_write_unwritten( i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; - + if (update_isize && i_size > i_size_read(inode)) + i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_d.di_size = i_size; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 00db3ecea084..ee535065c5d0 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 2f2dc3c09ad0..4246876df7b7 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -274,7 +274,7 @@ xfs_fs_commit_blocks( (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); - error = xfs_iomap_write_unwritten(ip, start, length); + error = xfs_iomap_write_unwritten(ip, start, length, false); if (error) goto out_drop_iolock; } -- cgit From 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:42:09 -0700 Subject: xfs: perag initialization should only touch m_ag_max_usable for AG 0 We call __xfs_ag_resv_init to make a per-AG reservation for each AG. This makes the reservation per-AG, not per-filesystem. Therefore, it is incorrect to adjust m_ag_max_usable for each AG. Adjust it only when we're reserving AG 0's blocks so that we only do it once per fs. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_ag_resv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index b008ff3250eb..df3e600835e8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -156,7 +156,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag->pag_agno == 0) + pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* * AGFL blocks are always considered "free", so whatever * was reserved at mount time must be given back at umount. @@ -216,7 +217,14 @@ __xfs_ag_resv_init( return error; } - mp->m_ag_max_usable -= ask; + /* + * Reduce the maximum per-AG allocation length by however much we're + * trying to reserve for an AG. Since this is a filesystem-wide + * counter, we only make the adjustment for AG 0. This assumes that + * there aren't any AGs hungrier for per-AG reservation than AG 0. + */ + if (pag->pag_agno == 0) + mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; -- cgit From 842f6e9f786226c58fcbd5ef80eadca72fdfe652 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 22 Sep 2017 11:47:46 -0700 Subject: xfs: Capture state of the right inode in xfs_iflush_done My previous patch: d3a304b6292168b83b45d624784f973fdc1ca674 check for XFS_LI_FAILED flag xfs_iflush done, so the failed item can be properly resubmitted. In the loop scanning other inodes being completed, it should check the current item for the XFS_LI_FAILED, and not the initial one. The state of the initial inode is checked after the loop ends Kudos to Eric for catching this. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6d0f74ec31e8..a705f34b58fa 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -745,7 +745,7 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + (blip->li_flags & XFS_LI_FAILED)) need_ail++; blip = next; -- cgit From 5e5c943c1f257c2b3424fc3f8a7b18570152dab3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: xfs: revert "xfs: factor rmap btree size into the indlen calculations" In commit fd26a88093ba we added a worst case estimate for rmapbt blocks needed to satisfy the block mapping request. Since then, we added the ability to reserve enough space in each AG such that we should never run out of blocks to grow the rmapbt, which makes this calculation unnecessary. Revert the commit because it makes the extra delalloc indlen accounting unnecessary and incorrect. Reported-by: Eryu Guan Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 459f4b4f08fe..044a363119be 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,7 +49,6 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_rmap_btree.h" #include "xfs_icache.h" @@ -192,12 +191,8 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ - xfs_filblks_t orig_len; mp = ip->i_mount; - - /* Calculate the worst-case size of the bmbt. */ - orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -205,20 +200,12 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) { - rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; - break; - } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } - - /* Calculate the worst-case size of the rmapbt. */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + - mp->m_rmap_maxlevels; - return rval; } -- cgit From fc46820b27a2d9a46f7e90c9ceb4a64a1bc5fab8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 25 Sep 2017 12:23:03 +0200 Subject: vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets In generic_file_llseek_size, return -ENXIO for negative offsets as well as offsets beyond EOF. This affects filesystems which don't implement SEEK_HOLE / SEEK_DATA internally, possibly because they don't support holes. Fixes xfstest generic/448. Signed-off-by: Andreas Gruenbacher Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index a2b9a47235c5..f0d4b16873e8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -112,7 +112,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -120,7 +120,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; -- cgit From ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 18 Sep 2017 23:00:59 +0300 Subject: platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not presnt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My Fujitsu-Siemens Lifebook S6120 doesn't have the FUJ02E3 device, but it does have FUJ02B1. That means we do register the backlight device (and it even seems to work), but the code will oops as soon as we try to set the backlight brightness because it's trying to call call_fext_func() with a NULL device. Let's just skip those function calls when the FUJ02E3 device is not present. Cc: Jonathan Woithe Cc: Andy Shevchenko Signed-off-by: Ville Syrjälä Cc: # 4.13.x Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/fujitsu-laptop.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 85de30f93a9c..56a8195096a2 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -254,10 +254,12 @@ static int bl_update_status(struct backlight_device *b) { struct acpi_device *device = bl_get_data(b); - if (b->props.power == FB_BLANK_POWERDOWN) - call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); - else - call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); + if (fext) { + if (b->props.power == FB_BLANK_POWERDOWN) + call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); + else + call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); + } return set_lcd_level(device, b->props.brightness); } -- cgit From 4c6bb69663b3a3f2db8f488356e96acb5460f25f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Sep 2017 10:36:05 +0200 Subject: quota: Fix quota corruption with generic/232 test Eric has reported that since commit d2faa415166b "quota: Do not acquire dqio_sem for dquot overwrites in v2 format" test generic/232 occasionally fails due to quota information being incorrect. Indeed that commit was too eager to remove dqio_sem completely from the path that just overwrites quota structure with updated information. Although that is innocent on its own, another process that inserts new quota structure to the same block can perform read-modify-write cycle of that block thus effectively discarding quota information update if they race in a wrong way. Fix the problem by acquiring dqio_sem for reading for overwrites of quota structure. Note that it *is* possible to completely avoid taking dqio_sem in the overwrite path however that will require modifying path inserting / deleting quota structures to avoid RMW cycles of the full block and for now it is not clear whether it is worth the hassle. Fixes: d2faa415166b2883428efa92f451774ef44373ac Reported-and-tested-by: Eric Whitney Signed-off-by: Jan Kara --- fs/quota/quota_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index c0187cda2c1e..a73e5b34db41 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -328,12 +328,16 @@ static int v2_write_dquot(struct dquot *dquot) if (!dquot->dq_off) { alloc = true; down_write(&dqopt->dqio_sem); + } else { + down_read(&dqopt->dqio_sem); } ret = qtree_write_dquot( sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); if (alloc) up_write(&dqopt->dqio_sem); + else + up_read(&dqopt->dqio_sem); return ret; } -- cgit From 5371513fb338fb9989c569dc071326d369d6ade8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Sep 2017 15:57:16 +0100 Subject: arm64: Make sure SPsel is always set When the kernel is entered at EL2 on an ARMv8.0 system, we construct the EL1 pstate and make sure this uses the the EL1 stack pointer (we perform an exception return to EL1h). But if the kernel is either entered at EL1 or stays at EL2 (because we're on a VHE-capable system), we fail to set SPsel, and use whatever stack selection the higher exception level has choosen for us. Let's not take any chance, and make sure that SPsel is set to one before we decide the mode we're going to run in. Cc: Acked-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7434ec0c7a27..0b243ecaf7ac 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -384,6 +384,7 @@ ENTRY(kimage_vaddr) * booted in EL1 or EL2 respectively. */ ENTRY(el2_setup) + msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f -- cgit From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:04 +0200 Subject: KVM: VMX: extract __pi_post_block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple code movement patch, preparing for the next one. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c83d28b0ab05..0002b14307ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11705,6 +11705,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -11798,44 +11835,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || !kvm_vcpu_apicv_active(vcpu)) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + __pi_post_block(vcpu); } static void vmx_post_block(struct kvm_vcpu *vcpu) -- cgit From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:05 +0200 Subject: KVM: VMX: avoid double list add with VT-d posted interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases, for example involving hot-unplug of assigned devices, pi_post_block can forget to remove the vCPU from the blocked_vcpu_list. When this happens, the next call to pi_pre_block corrupts the list. Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block and WARN instead of adding the element twice in the list. Second, always do the list removal in pi_post_block if vcpu->pre_pcpu is set (not -1). The new code keeps interrupts disabled for the whole duration of pi_pre_block/pi_post_block. This is not strictly necessary, but easier to follow. For the same reason, PI.ON is checked only after the cmpxchg, and to handle it we just call the post-block code. This removes duplication of the list removal code. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 62 ++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0002b14307ab..0bfe97e50a40 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11710,10 +11710,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; unsigned int dest; - unsigned long flags; do { old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); dest = cpu_physical_id(vcpu->cpu); @@ -11730,14 +11731,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } } @@ -11757,7 +11754,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11767,34 +11763,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11819,7 +11801,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11835,12 +11822,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; + WARN_ON(irqs_disabled()); + local_irq_disable(); __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) -- cgit From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:06 +0200 Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The simplify part: do not touch pi_desc.nv, we can set it when the VCPU is first created. Likewise, pi_desc.sn is only handled by vmx_vcpu_pi_load, do not touch it in __pi_post_block. The fix part: do not check kvm_arch_has_assigned_device, instead check the SN bit to figure out whether vmx_vcpu_pi_put ran before. This matches what the previous patch did in pi_post_block. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0bfe97e50a40..b9d2140eb212 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2202,43 +2202,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); return; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. - */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + dest = cpu_physical_id(cpu); - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ new.sn = 0; } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); @@ -9592,6 +9590,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -11723,9 +11728,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) else new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ - new.sn = 0; - /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; } while (cmpxchg(&pi_desc->control, old.control, -- cgit From 7e439681af82984045efc215437ebb2ca8d33a4c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 25 Sep 2017 10:19:57 +0200 Subject: mtd: Fix partition alignment check on multi-erasesize devices Commit 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") introduced a regression on heterogeneous erase region devices. Alignment of the partition was tested against the master eraseblock size which can be bigger than the slave one, thus leading to some partitions being marked as read-only. Update wr_alignment to match this slave erasesize after this erasesize has been determined by picking the biggest erasesize of all the regions embedded in the MTD partition. Reported-by: Mathias Thore Fixes: 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") Cc: Signed-off-by: Boris Brezillon Tested-by: Mathias Thore Reviewed-by: Mathias Thore --- drivers/mtd/mtdpart.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 5736b0c90b33..a308e707392d 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -581,6 +581,14 @@ static struct mtd_part *allocate_partition(struct mtd_info *parent, slave->mtd.erasesize = parent->erasesize; } + /* + * Slave erasesize might differ from the master one if the master + * exposes several regions with different erasesize. Adjust + * wr_alignment accordingly. + */ + if (!(slave->mtd.flags & MTD_NO_ERASE)) + wr_alignment = slave->mtd.erasesize; + tmp = slave->offset; remainder = do_div(tmp, wr_alignment); if ((slave->mtd.flags & MTD_WRITEABLE) && remainder) { -- cgit From 5c62c1c67903621cfa715d6f690548ee53301620 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 25 Sep 2017 17:28:47 +0800 Subject: iommu/io-pgtable-arm-v7s: Need dma-sync while there is no QUIRK_NO_DMA Fix the commit 81b3c2521844 ("iommu/io-pgtable: Introduce explicit coherency"). If there is no IO_PGTABLE_QUIRK_NO_DMA, we should call dma_sync_single_for_device for cache synchronization. Signed-off-by: Yong Wu Fixes: 81b3c2521844 ('iommu/io-pgtable: Introduce explicit coherency') Reviewed-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index d665d0dc16e8..6961fc393f0b 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -245,7 +245,7 @@ static void __arm_v7s_free_table(void *table, int lvl, static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries, struct io_pgtable_cfg *cfg) { - if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) + if (cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) return; dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep), -- cgit From 1ff9b17cedb39bc78f9e3f82485765f9b467177d Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 25 Sep 2017 18:15:26 +0800 Subject: iommu/mediatek: Limit the physical address in 32bit for v7s The ARM short descriptor has already limited the physical address to 32bit after the commit <76557391433c> ("iommu/io-pgtable: Sanitise map/unmap addresses"). But in MediaTek 4GB mode, the physical address is from 0x1_0000_0000 to 0x1_ffff_ffff. this will cause: WARNING: CPU: 4 PID: 3900 at xxx/drivers/iommu/io-pgtable-arm-v7s.c:482 arm_v7s_map+0x40/0xf8 Modules linked in: CPU: 4 PID: 3900 Comm: weston Tainted: G S W 4.9.44 #1 Hardware name: MediaTek MT2712m1v1 board (DT) task: ffffffc0eaa5b280 task.stack: ffffffc0e9858000 PC is at arm_v7s_map+0x40/0xf8 LR is at mtk_iommu_map+0x64/0x90 pc : [] lr : [] pstate: 000001c5 sp : ffffffc0e985b920 x29: ffffffc0e985b920 x28: 0000000127d00000 x27: 0000000000100000 x26: ffffff8008f9e000 x25: 0000000000000003 x24: 0000000000100000 x23: 0000000127d00000 x22: 00000000ff800000 x21: ffffffc0f7ec8ce0 x20: 0000000000000003 x19: 0000000000000003 x18: 0000000000000002 x17: 0000007f7e5d72c0 x16: ffffff80082b0f08 x15: 0000000000000001 x14: 000000000000003f x13: 0000000000000000 x12: 0000000000000028 x11: 0088000000000000 x10: 0000000000000000 x9 : ffffff80092fa000 x8 : ffffffc0e9858000 x7 : ffffff80085b29d8 x6 : 0000000000000000 x5 : ffffff80085b09a8 x4 : 0000000000000003 x3 : 0000000000100000 x2 : 0000000127d00000 x1 : 00000000ff800000 x0 : 0000000000000001 ... Call trace: [] arm_v7s_map+0x40/0xf8 [] mtk_iommu_map+0x64/0x90 [] iommu_map+0x100/0x3a0 [] default_iommu_map_sg+0x104/0x168 [] iommu_dma_alloc+0x238/0x3f8 [] __iommu_alloc_attrs+0xa8/0x260 [] mtk_drm_gem_create+0xac/0x180 [] mtk_drm_gem_dumb_create+0x54/0xc8 [] drm_mode_create_dumb_ioctl+0xa4/0xd8 [] drm_ioctl+0x1c0/0x490 In order to satify this, Limit the physical address to 32bit. Signed-off-by: Yong Wu Acked-by: Will Deacon Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index bd515be5b380..16d33ac19db0 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -371,7 +371,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, int ret; spin_lock_irqsave(&dom->pgtlock, flags); - ret = dom->iop->map(dom->iop, iova, paddr, size, prot); + ret = dom->iop->map(dom->iop, iova, paddr & DMA_BIT_MASK(32), + size, prot); spin_unlock_irqrestore(&dom->pgtlock, flags); return ret; -- cgit From 3c6bae62136ba5b24f0b113e68121b783457ca4b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 26 Sep 2017 13:07:46 +0530 Subject: iommu/amd: pr_err() strings should end with newlines pr_err() messages should end with a new-line to avoid other messages being concatenated. So replace '/n' with '\n'. Signed-off-by: Arvind Yadav Fixes: 45a01c42933b ('iommu/amd: Add function copy_dev_tables()') Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 382de42b8359..6fe2d0346073 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -874,7 +874,7 @@ static bool copy_device_table(void) hi = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET + 4); entry = (((u64) hi) << 32) + lo; if (last_entry && last_entry != entry) { - pr_err("IOMMU:%d should use the same dev table as others!/n", + pr_err("IOMMU:%d should use the same dev table as others!\n", iommu->index); return false; } @@ -882,7 +882,7 @@ static bool copy_device_table(void) old_devtb_size = ((entry & ~PAGE_MASK) + 1) << 12; if (old_devtb_size != dev_table_size) { - pr_err("The device table size of IOMMU:%d is not expected!/n", + pr_err("The device table size of IOMMU:%d is not expected!\n", iommu->index); return false; } @@ -890,7 +890,7 @@ static bool copy_device_table(void) old_devtb_phys = entry & PAGE_MASK; if (old_devtb_phys >= 0x100000000ULL) { - pr_err("The address of old device table is above 4G, not trustworthy!/n"); + pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); @@ -901,7 +901,7 @@ static bool copy_device_table(void) old_dev_tbl_cpy = (void *)__get_free_pages(gfp_flag, get_order(dev_table_size)); if (old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!/n"); + pr_err("Failed to allocate memory for copying old device table!\n"); return false; } -- cgit From 50ce6312f293e129eedf2affc7bd791c71d8287e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 26 Sep 2017 19:32:52 +0100 Subject: iommu: Fix comment for iommu_ops.map_sg The definition of map_sg was split during a recent addition to iommu_ops. Put it back together. Fixes: add02cfdc9bc ("iommu: Introduce Interface for IOMMU TLB Flushing") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a7f2ac689d29..41b8c5757859 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -167,11 +167,11 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @map_sg: map a scatter-gather list of physically contiguous memory chunks + * to an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain * @tlb_range_add: Add a given iova range to the flush queue for this domain * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue - * to an iommu domain * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping -- cgit From df5efdd97029f2cff7e5c91ea1c9f2b94d009b0f Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:05:57 -0700 Subject: IB/hfi1: Turn off AOC TX after offline substates Offline.quietDuration was added in the 8051 firmware, and the driver only turns off the AOC transmitters when offline.quiet is reached. However, the AOC transmitters need to be turned off at the new state. Therefore, turn off the AOC transmitters at any offline substates including offline.quiet and offline.quietDuration, then recheck we reached offline.quiet to support backwards compatibility. Reviewed-by: Jakub Byczkowski Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 85 +++++++++++++++++++++++++++++---------- drivers/infiniband/hw/hfi1/chip.h | 1 + 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b2ed4b9cda6e..1c810d65721a 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1066,6 +1066,8 @@ static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); static int thermal_init(struct hfi1_devdata *dd); static void update_statusp(struct hfi1_pportdata *ppd, u32 state); +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); @@ -10305,6 +10307,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; u32 previous_state; + int offline_state_ret; int ret; update_lcb_cache(dd); @@ -10326,28 +10329,11 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); - /* - * Wait for offline transition. It can take a while for - * the link to go down. - */ - ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 10000); - if (ret < 0) - return ret; - - /* - * Now in charge of LCB - must be after the physical state is - * offline.quiet and before host_link_state is changed. - */ - set_host_lcb_access(dd); - write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ - - /* make sure the logical state is also down */ - ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); - if (ret) - force_logical_link_state_down(ppd); - - ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); + if (offline_state_ret < 0) + return offline_state_ret; + /* Disabling AOC transmitters */ if (ppd->port_type == PORT_TYPE_QSFP && ppd->qsfp_info.limiting_active && qsfp_mod_present(ppd)) { @@ -10364,6 +10350,30 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) } } + /* + * Wait for the offline.Quiet transition if it hasn't happened yet. It + * can take a while for the link to go down. + */ + if (offline_state_ret != PLS_OFFLINE_QUIET) { + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); + if (ret < 0) + return ret; + } + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + /* make sure the logical state is also down */ + ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + if (ret) + force_logical_link_state_down(ppd); + + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + /* * The LNI has a mandatory wait time after the physical state * moves to Offline.Quiet. The wait time may be different @@ -12804,6 +12814,39 @@ static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, return 0; } +/* + * wait_phys_link_offline_quiet_substates - wait for any offline substate + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) == PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index b8345a60a0fb..461f937fe110 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -204,6 +204,7 @@ #define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 #define PLS_OFFLINE_REPORT_FAILURE 0x93 #define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_OFFLINE_QUIET_DURATION 0x95 #define PLS_POLLING 0x20 #define PLS_POLLING_QUIET 0x20 #define PLS_POLLING_ACTIVE 0x21 -- cgit From 30e10527bcce376114e627abb7fabfbe9bfee91e Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:06:03 -0700 Subject: IB/hfi1: Only reset QSFP after link up and turn off AOC TX QSFP reset enables AOC transmitters by default. They should be off before moving to high power mode to complete the setup. There is no need to reset the QSFP during LNI failure as it was reset at link down. Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 12 +++++++++++- drivers/infiniband/hw/hfi1/chip.h | 2 +- drivers/infiniband/hw/hfi1/platform.c | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 1c810d65721a..27b75a8f5097 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9415,7 +9415,7 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); } -void reset_qsfp(struct hfi1_pportdata *ppd) +int reset_qsfp(struct hfi1_pportdata *ppd) { struct hfi1_devdata *dd = ppd->dd; u64 mask, qsfp_mask; @@ -9445,6 +9445,13 @@ void reset_qsfp(struct hfi1_pportdata *ppd) * for alarms and warnings */ set_qsfp_int_n(ppd, 1); + + /* + * After the reset, AOC transmitters are enabled by default. They need + * to be turned off to complete the QSFP setup before they can be + * enabled again. + */ + return set_qsfp_tx(ppd, 0); } static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, @@ -10406,6 +10413,9 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */ check_lni_states(ppd); + + /* The QSFP doesn't need to be reset on LNI failure */ + ppd->qsfp_info.reset_needed = 0; } /* the active link width (downgrade) is 0 on link down */ diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 461f937fe110..50b8645d0b87 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -723,7 +723,7 @@ void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_start_link(struct work_struct *work); void handle_sma_message(struct work_struct *work); -void reset_qsfp(struct hfi1_pportdata *ppd); +int reset_qsfp(struct hfi1_pportdata *ppd); void qsfp_event(struct work_struct *work); void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); int send_idle_sma(struct hfi1_devdata *dd, u64 message); diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index a8af96d2b1b0..d486355880cb 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -790,7 +790,9 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, * reuse of stale settings established in our previous pass through. */ if (ppd->qsfp_info.reset_needed) { - reset_qsfp(ppd); + ret = reset_qsfp(ppd); + if (ret) + return ret; refresh_qsfp_cache(ppd, &ppd->qsfp_info); } else { ppd->qsfp_info.reset_needed = 1; -- cgit From 753b19afb19dd97d0767df5e8afb13faff605315 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Tue, 26 Sep 2017 06:06:09 -0700 Subject: IB/hfi1: Check eeprom config partition validity Relying on a trailing magic value is incorrect. There are instances where this is not present as trailing magic value has a specific purpose which is not partition validation. Instead use the header magic value which is present in all variants of the platform configuration and is intended for validation. This is also used in other locations in the driver. Fixes: bc5214ee2922 (IB/hfi1: Handle missing magic values in config file) Reviewed-by: Jakub Byczkowski Signed-off-by: Jan Sokolowski Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/eprom.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c index d46b17107901..1613af1c58d9 100644 --- a/drivers/infiniband/hw/hfi1/eprom.c +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -204,7 +204,10 @@ done_asic: return ret; } -/* magic character sequence that trails an image */ +/* magic character sequence that begins an image */ +#define IMAGE_START_MAGIC "APO=" + +/* magic character sequence that might trail an image */ #define IMAGE_TRAIL_MAGIC "egamiAPO" /* EPROM file types */ @@ -250,6 +253,7 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, { void *buffer; void *p; + u32 length; int ret; buffer = kmalloc(P1_SIZE, GFP_KERNEL); @@ -262,15 +266,21 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, return ret; } - /* scan for image magic that may trail the actual data */ - p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); - if (!p) { + /* config partition is valid only if it starts with IMAGE_START_MAGIC */ + if (memcmp(buffer, IMAGE_START_MAGIC, strlen(IMAGE_START_MAGIC))) { kfree(buffer); return -ENOENT; } + /* scan for image magic that may trail the actual data */ + p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); + if (p) + length = p - buffer; + else + length = P1_SIZE; + *data = buffer; - *size = p - buffer; + *size = length; return 0; } -- cgit From 09592af5fdd722615ebe435fb34308de26c74bcf Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Tue, 26 Sep 2017 06:06:15 -0700 Subject: IB/hfi1: Return correct value in general interrupt handler The general interrupt handler returns IRQ_HANDLED whether an IRQ was handled or not. Determine if an IRQ was handled and return the correct value. Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 27b75a8f5097..0be42787759f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8240,6 +8240,7 @@ static irqreturn_t general_interrupt(int irq, void *data) u64 regs[CCE_NUM_INT_CSRS]; u32 bit; int i; + irqreturn_t handled = IRQ_NONE; this_cpu_inc(*dd->int_counter); @@ -8260,9 +8261,10 @@ static irqreturn_t general_interrupt(int irq, void *data) for_each_set_bit(bit, (unsigned long *)®s[0], CCE_NUM_INT_CSRS * 64) { is_interrupt(dd, bit); + handled = IRQ_HANDLED; } - return IRQ_HANDLED; + return handled; } static irqreturn_t sdma_interrupt(int irq, void *data) -- cgit From 612601d0013f03de9dc134809f242ba6da9ca252 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Tue, 26 Sep 2017 06:06:22 -0700 Subject: Revert "IB/ipoib: Update broadcast object if PKey value was changed in index 0" commit 9a9b8112699d will cause core to fail UD QP from being destroyed on ipoib unload, therefore cause resources leakage. On pkey change event above patch modifies mgid before calling underlying driver to detach it from QP. Drivers' detach_mcast() will fail to find modified mgid it was never given to attach in a first place. Core qp->usecnt will never go down, so ib_destroy_qp() will fail. IPoIB driver actually does take care of new broadcast mgid based on new pkey by destroying an old mcast object in ipoib_mcast_dev_flush()) .... if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } ... then in restarted ipoib_macst_join_task() creating a new broadcast mcast object, sending join request and on completion tells the driver to attach to reinitialized QP: ... if (!priv->broadcast) { ... broadcast = ipoib_mcast_alloc(dev, 0); ... memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; ... Fixes: 9a9b8112699d ("IB/ipoib: Update broadcast object if PKey value was changed in index 0") Cc: stable@vger.kernel.org Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Feras Daoud Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 2e075377242e..6cd61638b441 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1000,19 +1000,6 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) */ priv->dev->broadcast[8] = priv->pkey >> 8; priv->dev->broadcast[9] = priv->pkey & 0xff; - - /* - * Update the broadcast address in the priv->broadcast object, - * in case it already exists, otherwise no one will do that. - */ - if (priv->broadcast) { - spin_lock_irq(&priv->lock); - memcpy(priv->broadcast->mcmember.mgid.raw, - priv->dev->broadcast + 4, - sizeof(union ib_gid)); - spin_unlock_irq(&priv->lock); - } - return 0; } -- cgit From b8f42738acaddf67731c34935c0994e09a588ca7 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 26 Sep 2017 06:06:28 -0700 Subject: IB/hfi1: On error, fix use after free during user context setup During base context setup, if setup_base_ctxt() fails, the context is deallocated. This is incorrect because the context is referenced on return, to notify any waiting subcontext. If there are no subcontexts the pointer will be invalid. Reorganize the error path so that deallocate_ctxt() is called after all the possible subcontexts have been notified. Reviewed-by: Ira Weiny Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 41 +++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2bc89260235a..d9a1e9893136 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -930,15 +930,8 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo) switch (ret) { case 0: ret = setup_base_ctxt(fd, uctxt); - if (uctxt->subctxt_cnt) { - /* - * Base context is done (successfully or not), notify - * anybody using a sub-context that is waiting for - * this completion. - */ - clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); - wake_up(&uctxt->wait); - } + if (ret) + deallocate_ctxt(uctxt); break; case 1: ret = complete_subctxt(fd); @@ -1305,25 +1298,25 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, /* Now allocate the RcvHdr queue and eager buffers. */ ret = hfi1_create_rcvhdrq(dd, uctxt); if (ret) - return ret; + goto done; ret = hfi1_setup_eagerbufs(uctxt); if (ret) - goto setup_failed; + goto done; /* If sub-contexts are enabled, do the appropriate setup */ if (uctxt->subctxt_cnt) ret = setup_subctxt(uctxt); if (ret) - goto setup_failed; + goto done; ret = hfi1_alloc_ctxt_rcv_groups(uctxt); if (ret) - goto setup_failed; + goto done; ret = init_user_ctxt(fd, uctxt); if (ret) - goto setup_failed; + goto done; user_init(uctxt); @@ -1331,12 +1324,22 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, fd->uctxt = uctxt; hfi1_rcd_get(uctxt); - return 0; +done: + if (uctxt->subctxt_cnt) { + /* + * On error, set the failed bit so sub-contexts will clean up + * correctly. + */ + if (ret) + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); -setup_failed: - /* Set the failed bit so sub-context init can do the right thing */ - set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); - deallocate_ctxt(uctxt); + /* + * Base context is done (successfully or not), notify anybody + * using a sub-context that is waiting for this completion. + */ + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } return ret; } -- cgit From 828bcbdc975fbcfb27946c33d4b1d1bfab70789b Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Tue, 26 Sep 2017 06:06:34 -0700 Subject: IB/hfi1: Unsuccessful PCIe caps tuning should not fail driver load Failure to tune PCIe capabilities should not fail driver load. This can cause the driver load to fail on systems with any of the following: 1. HFI's parent is not root. Example: HFI card is behind a PCIe bridge. 2. HFI's parent is not PCI Express capable. In these situations, failure to tune PCIe capabilities should be logged in the system message logs but not cause the driver load to fail. This patch also ensures pcie capability word DevCtl is written only after a successful read and the capability tuning process continues even if read/write of the pcie capability word DevCtl fails. Fixes: c53df62c7a9a ("IB/hfi1: Check return values from PCI config API calls") Fixes: bf70a7757736 ("staging/rdma/hfi1: Enable WFR PCIe extended tags from the driver") Reviewed-by: Michael J. Ruhl Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pcie.c | 50 ++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 82447b7cdda1..09e50fd2a08f 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -68,7 +68,7 @@ /* * Code to adjust PCIe capabilities. */ -static int tune_pcie_caps(struct hfi1_devdata *); +static void tune_pcie_caps(struct hfi1_devdata *); /* * Do all the common PCIe setup and initialization. @@ -351,7 +351,7 @@ int pcie_speeds(struct hfi1_devdata *dd) */ int request_msix(struct hfi1_devdata *dd, u32 msireq) { - int nvec, ret; + int nvec; nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, PCI_IRQ_MSIX | PCI_IRQ_LEGACY); @@ -360,12 +360,7 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq) return nvec; } - ret = tune_pcie_caps(dd); - if (ret) { - dd_dev_err(dd, "tune_pcie_caps() failed: %d\n", ret); - pci_free_irq_vectors(dd->pcidev); - return ret; - } + tune_pcie_caps(dd); /* check for legacy IRQ */ if (nvec == 1 && !dd->pcidev->msix_enabled) @@ -502,7 +497,7 @@ uint aspm_mode = ASPM_MODE_DISABLED; module_param_named(aspm, aspm_mode, uint, S_IRUGO); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static int tune_pcie_caps(struct hfi1_devdata *dd) +static void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; @@ -513,22 +508,14 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * Turn on extended tags in DevCtl in case the BIOS has turned it off * to improve WFR SDMA bandwidth */ - ret = pcie_capability_read_word(dd->pcidev, - PCI_EXP_DEVCTL, &ectl); - if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return ret; - } - - if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if ((!ret) && !(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { dd_dev_info(dd, "Enabling PCIe extended tags\n"); ectl |= PCI_EXP_DEVCTL_EXT_TAG; ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); - return ret; - } + if (ret) + dd_dev_info(dd, "Unable to write to PCI config\n"); } /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; @@ -536,15 +523,22 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * The driver cannot perform the tuning if it does not have * access to the upstream component. */ - if (!parent) - return -EINVAL; + if (!parent) { + dd_dev_info(dd, "Parent not found\n"); + return; + } if (!pci_is_root_bus(parent->bus)) { dd_dev_info(dd, "Parent not root\n"); - return -EINVAL; + return; + } + if (!pci_is_pcie(parent)) { + dd_dev_info(dd, "Parent is not PCI Express capable\n"); + return; + } + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_info(dd, "PCI device is not PCI Express capable\n"); + return; } - - if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - return -EINVAL; rc_mpss = parent->pcie_mpss; rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ @@ -590,8 +584,6 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) ep_mrrs = max_mrrs; pcie_set_readrq(dd->pcidev, ep_mrrs); } - - return 0; } /* End of PCIe capability tuning */ -- cgit From 36de80740008e6a4a55115b4a92e2059e47c1cba Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Wed, 27 Sep 2017 14:49:17 +0200 Subject: mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user When calculating the size needed by struct atmel_pmecc_user *user, the dmu and delta buffer sizes were forgotten. This lead to a memory corruption (especially with a large ecc_strength). Link: http://lkml.kernel.org/r/1506503157.3016.5.camel@gmail.com Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver") Cc: stable@vger.kernel.org Reported-by: Richard Genoud Pointed-at-by: Boris Brezillon Signed-off-by: Richard Genoud Reviewed-by: Nicolas Ferre Signed-off-by: Boris Brezillon --- drivers/mtd/nand/atmel/pmecc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/atmel/pmecc.c b/drivers/mtd/nand/atmel/pmecc.c index 146af8218314..8268636675ef 100644 --- a/drivers/mtd/nand/atmel/pmecc.c +++ b/drivers/mtd/nand/atmel/pmecc.c @@ -363,7 +363,7 @@ atmel_pmecc_create_user(struct atmel_pmecc *pmecc, size += (req->ecc.strength + 1) * sizeof(u16); /* Reserve space for mu, dmu and delta. */ size = ALIGN(size, sizeof(s32)); - size += (req->ecc.strength + 1) * sizeof(s32); + size += (req->ecc.strength + 1) * sizeof(s32) * 3; user = kzalloc(size, GFP_KERNEL); if (!user) -- cgit From aaf2c2fb0f51f91c699039440862b6ae9c25c10e Mon Sep 17 00:00:00 2001 From: Tyler Baicar Date: Mon, 28 Aug 2017 10:53:41 -0600 Subject: ACPI / APEI: clear error status before acknowledging the error Currently we acknowledge errors before clearing the error status. This could cause a new error to be populated by firmware in-between the error acknowledgment and the error status clearing which would cause the second error's status to be cleared without being handled. So, clear the error status before acknowledging the errors. Also, make sure to acknowledge the error if the error status read fails. Signed-off-by: Tyler Baicar Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 077f9bad6f44..3c3a37b8503b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -743,17 +743,19 @@ static int ghes_proc(struct ghes *ghes) } ghes_do_proc(ghes, ghes->estatus); +out: + ghes_clear_estatus(ghes); + + if (rc == -ENOENT) + return rc; + /* * GHESv2 type HEST entries introduce support for error acknowledgment, * so only acknowledge the error if this support is present. */ - if (is_hest_type_generic_v2(ghes)) { - rc = ghes_ack_error(ghes->generic_v2); - if (rc) - return rc; - } -out: - ghes_clear_estatus(ghes); + if (is_hest_type_generic_v2(ghes)) + return ghes_ack_error(ghes->generic_v2); + return rc; } -- cgit From 8aba2333904f9b1c1ea038df261bf7ae8fefb98e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 02:08:43 +0200 Subject: cpufreq: docs: Drop intel-pstate.txt from index.txt Commit 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) dropped the intel-pstate.txt file from Documentation/cpu-freq/, but it did not update the index.txt file in there accordingly, so do that now. Fixes: 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/index.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt index 03a7cee6ac73..c15e75386a05 100644 --- a/Documentation/cpu-freq/index.txt +++ b/Documentation/cpu-freq/index.txt @@ -32,8 +32,6 @@ cpufreq-stats.txt - General description of sysfs cpufreq stats. index.txt - File index, Mailing list and Links (this document) -intel-pstate.txt - Intel pstate cpufreq driver specific file. - pcc-cpufreq.txt - PCC cpufreq driver specific file. -- cgit From d1b490939d8c117a06dfc562c41d933f71d30289 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Tue, 19 Sep 2017 12:11:55 -0300 Subject: scsi: aacraid: Add a small delay after IOP reset Commit 0e9973ed3382 ("scsi: aacraid: Add periodic checks to see IOP reset status") changed the way driver checks if a reset succeeded. Now, after an IOP reset, aacraid immediately start polling a register to verify the reset is complete. This behavior cause regressions on the reset path in PowerPC (at least). Since the delay after the IOP reset was removed by the aforementioned patch, the fact driver just starts to read a register instantly after the reset was issued (by writing in another register) "corrupts" the reset procedure, which ends up failing all the time. The issue highly impacted kdump on PowerPC, since on kdump path we proactively issue a reset in adapter (through the reset_devices kernel parameter). This patch (re-)adds a delay right after IOP reset is issued. Empirically we measured that 3 seconds is enough, but for safety reasons we delay for 5s (and since it was 30s before, 5s is still a small amount). For reference, without this patch we observe the following messages on kdump kernel boot process: [ 76.294] aacraid 0003:01:00.0: IOP reset failed [ 76.294] aacraid 0003:01:00.0: ARC Reset attempt failed [ 86.524] aacraid 0003:01:00.0: adapter kernel panic'd ff. [ 86.524] aacraid 0003:01:00.0: Controller reset type is 3 [ 86.524] aacraid 0003:01:00.0: Issuing IOP reset [146.534] aacraid 0003:01:00.0: IOP reset failed [146.534] aacraid 0003:01:00.0: ARC Reset attempt failed Fixes: 0e9973ed3382 ("scsi: aacraid: Add periodic checks to see IOP reset status") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Guilherme G. Piccoli Acked-by: Dave Carroll Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/src.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c index 48c2b2b34b72..0c9361c87ec8 100644 --- a/drivers/scsi/aacraid/src.c +++ b/drivers/scsi/aacraid/src.c @@ -740,6 +740,8 @@ static void aac_send_iop_reset(struct aac_dev *dev) aac_set_intx_mode(dev); src_writel(dev, MUnit.IDR, IOP_SRC_RESET_MASK); + + msleep(5000); } static void aac_send_hardware_soft_reset(struct aac_dev *dev) -- cgit From d0b7a9095c0730b92a0a2eecaba2e6b77ed87339 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 27 Sep 2017 14:44:19 +0200 Subject: scsi: ILLEGAL REQUEST + ASC==27 => target failure ASC 0x27 is "WRITE PROTECTED". This error code is returned e.g. by Fujitsu ETERNUS systems under certain conditions for WRITE SAME 16 commands with UNMAP bit set. It should not be treated as a path error. In general, it makes sense to assume that being write protected is a target rather than a path property. Signed-off-by: Martin Wilck Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 38942050b265..dab876c65473 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -580,7 +580,8 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x24 || /* Invalid field in cdb */ - sshdr.asc == 0x26) { /* Parameter value invalid */ + sshdr.asc == 0x26 || /* Parameter value invalid */ + sshdr.asc == 0x27) { /* Write protected */ set_host_byte(scmd, DID_TARGET_FAILURE); } return SUCCESS; -- cgit From 393debc23c7820211d1c8253dd6a8408a7628fe7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 10:23:35 -0700 Subject: md: separate request handling With commit cc27b0c78c79, pers->make_request could bail out without handling the bio. If that happens, we should retry. The commit fixes md_make_request but not other call sites. Separate the request handling part, so other call sites can use it. Reported-by: Nate Dailey Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 58 ++++++++++++++++++++++++++++++++------------------------- drivers/md/md.h | 1 + 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 08fcaebc61bd..1db1a22ed835 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,6 +266,37 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +void md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} +EXPORT_SYMBOL(md_handle_request); + static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); @@ -285,23 +316,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } -check_suspended: - rcu_read_lock(); - if (mddev->suspended) { - DEFINE_WAIT(__wait); - for (;;) { - prepare_to_wait(&mddev->sb_wait, &__wait, - TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) - break; - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - } - finish_wait(&mddev->sb_wait, &__wait); - } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); /* * save the sectors now since our bio can @@ -310,20 +324,14 @@ check_suspended: sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); - goto check_suspended; - } + + md_handle_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); - return BLK_QC_T_NONE; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 561d22b9a9a8..d8287d3cd1bf 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -692,6 +692,7 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); +extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, -- cgit From 79bf31a3b2a7ca467cfec8ff97d359a77065d01f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 09:55:28 -0700 Subject: md: fix a race condition for flush request handling md_submit_flush_data calls pers->make_request, which missed the suspend check. Fix it with the new md_handle_request API. Reported-by: Nate Dailey Tested-by: Nate Dailey Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1db1a22ed835..0ff1bbf6c90e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -447,16 +447,22 @@ static void md_submit_flush_data(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); } - - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); } void md_flush_request(struct mddev *mddev, struct bio *bio) -- cgit From c4d6a1b8e8ea79c439a4871cba540443c9eb13b9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 10:29:22 -0700 Subject: dm-raid: fix a race condition in request handling raid_map calls pers->make_request, which missed the suspend check. Fix it with the new md_handle_request API. Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: Heinz Mauelshagen Cc: Mike Snitzer Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5bfe285ea9d1..1ac58c5651b7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3238,7 +3238,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); return DM_MAPIO_SUBMITTED; } -- cgit From 7d5d7b5058fbd638914e42504677141a69f43011 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 11:03:52 -0700 Subject: md/raid5: cap worker count static checker reports a potential integer overflow. Cap the worker count to avoid the overflow. Reported:-by: Dan Carpenter Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 076409455b60..928e24a07133 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6575,14 +6575,17 @@ static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf; - unsigned long new; + unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; - if (kstrtoul(page, 10, &new)) + if (kstrtouint(page, 10, &new)) + return -EINVAL; + /* 8192 should be big enough */ + if (new > 8192) return -EINVAL; err = mddev_lock(mddev); -- cgit From da541b20021c781f8b65492eeaee824e729599eb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Sep 2017 17:34:23 -0500 Subject: objtool: Skip unreachable warnings for GCC 4.4 and older The kbuild bot occasionally reports warnings like: drivers/scsi/pcmcia/aha152x_core.o: warning: objtool: seldo_run()+0x130: unreachable instruction These warnings are always with GCC 4.4. That version of GCC sometimes places unreachable instructions after calls to noreturn functions. The unreachable warnings aren't very important anyway. Just ignore them for old versions of GCC. Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bc89b807d965b98ec18a0bb94f96a594bd58f2f2.1506551639.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- scripts/Makefile.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 2e3a10e79ca9..061d0c3a420a 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -265,6 +265,8 @@ objtool_args += --no-fp endif ifdef CONFIG_GCOV_KERNEL objtool_args += --no-unreachable +else +objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable) endif # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory -- cgit From 607a4029d439cdfa258aff5da32bb9cd6ed1a66d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Sep 2017 10:36:38 -0500 Subject: objtool: Support unoptimized frame pointer setup Arnd Bergmann reported a bunch of warnings like: crypto/jitterentropy.o: warning: objtool: jent_fold_time()+0x3b: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_stuck()+0x1d: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_unbiased_bit()+0x15: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_read_entropy()+0x32: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_entropy_collector_free()+0x19: call without frame pointer save/setup and arch/x86/events/core.o: warning: objtool: collect_events uses BP as a scratch register arch/x86/events/core.o: warning: objtool: events_ht_sysfs_show()+0x22: call without frame pointer save/setup With certain rare configurations, GCC sometimes sets up the frame pointer with: lea (%rsp),%rbp instead of: mov %rsp,%rbp The instructions are equivalent, so treat the former like the latter. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a468af8b28a69b83fffc6d7668be9b6fcc873699.1506526584.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/arch/x86/decode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0f22768c0d4d..34a579f806e3 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -284,11 +284,16 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, case 0x8d: if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - /* lea disp(%rsp), reg */ *type = INSN_STACK; - op->src.type = OP_SRC_ADD; + if (!insn.displacement.value) { + /* lea (%rsp), reg */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%rsp), reg */ + op->src.type = OP_SRC_ADD; + op->src.offset = insn.displacement.value; + } op->src.reg = CFI_SP; - op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; -- cgit From 66a733ea6b611aecf0119514d2dddab5f9d6c01e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Cc: stable@vger.kernel.org # needs s/refcount_/atomic_/ for v4.12 and earlier Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c24579dfa7a1..bb3a38005b9c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + refcount_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - refcount_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -491,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -503,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { memset(info, 0, sizeof(*info)); @@ -1025,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: -- cgit From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Thu, 28 Sep 2017 12:37:31 +0800 Subject: irq/generic-chip: Don't replace domain's name When generic irq chips are allocated for an irq domain the domain name is set to the irq chip name. That was done to have named domains before the recent changes which enforce domain naming were done. Since then the overwrite causes a memory leak when the domain name is dynamically allocated and even worse it would cause the domain free code to free the wrong name pointer, which might point to a constant. Remove the name assignment to prevent this. Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } - d->name = name; return 0; } EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); -- cgit From 8c28ef3f1c1c57b6f468343d5959e5125b30334d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 25 Sep 2017 02:01:01 -0600 Subject: xen-pciback: relax BAR sizing write value check Just like done in d2bd05d88d ("xen-pciback: return proper values during BAR sizing") for the ROM BAR, ordinary ones also shouldn't compare the written value directly against ~0, but consider the r/o bits at the bottom (if any). Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xen-pciback/conf_space_header.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 5fbfd9cfb6d6..5b3d57fc82d3 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -169,6 +169,9 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) { struct pci_bar_info *bar = data; + unsigned int pos = (offset - PCI_BASE_ADDRESS_0) / 4; + const struct resource *res = dev->resource; + u32 mask; if (unlikely(!bar)) { pr_warn(DRV_NAME ": driver data not found for %s\n", @@ -179,7 +182,13 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) /* A write to obtain the length must happen as a 32-bit write. * This does not (yet) support writing individual bytes */ - if (value == ~0) + if (res[pos].flags & IORESOURCE_IO) + mask = ~PCI_BASE_ADDRESS_IO_MASK; + else if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) + mask = 0; + else + mask = ~PCI_BASE_ADDRESS_MEM_MASK; + if ((value | mask) == ~0U) bar->which = 1; else { u32 tmpval; -- cgit From 0d805ee70a69eabd38160dc199e183ac2f13fe4b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 27 Sep 2017 02:41:25 -0700 Subject: xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables mapping When bootup a PVM guest with large memory(Ex.240GB), XEN provided initial mapping overlaps with kernel module virtual space. When mapping in this space is cleared by xen_cleanhighmap(), in certain case there could be an 2MB mapping left. This is due to XEN initialize 4MB aligned mapping but xen_cleanhighmap() finish at 2MB boundary. When module loading is just on top of the 2MB space, got below warning: WARNING: at mm/vmalloc.c:106 vmap_pte_range+0x14e/0x190() Call Trace: [] warn_alloc_failed+0xf3/0x160 [] __vmalloc_area_node+0x182/0x1c0 [] ? module_alloc_update_bounds+0x1e/0x80 [] __vmalloc_node_range+0xa7/0x110 [] ? module_alloc_update_bounds+0x1e/0x80 [] module_alloc+0x64/0x70 [] ? module_alloc_update_bounds+0x1e/0x80 [] module_alloc_update_bounds+0x1e/0x80 [] move_module+0x27/0x150 [] layout_and_allocate+0x120/0x1b0 [] load_module+0x78/0x640 [] ? security_file_permission+0x8b/0x90 [] sys_init_module+0x62/0x1e0 [] system_call_fastpath+0x16/0x1b Then the mapping of 2MB is cleared, finally oops when the page in that space is accessed. BUG: unable to handle kernel paging request at ffff880022600000 IP: [] clear_page_c_e+0x7/0x10 PGD 1788067 PUD 178c067 PMD 22434067 PTE 0 Oops: 0002 [#1] SMP Call Trace: [] ? prep_new_page+0x127/0x1c0 [] get_page_from_freelist+0x1e2/0x550 [] ? ii_iovec_copy_to_user+0x90/0x140 [] __alloc_pages_nodemask+0x12d/0x230 [] alloc_pages_vma+0xc6/0x1a0 [] ? pte_mfn_to_pfn+0x7d/0x100 [] do_anonymous_page+0x16b/0x350 [] handle_pte_fault+0x1e4/0x200 [] ? xen_pmd_val+0xe/0x10 [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [] handle_mm_fault+0x15b/0x270 [] do_page_fault+0x140/0x470 [] page_fault+0x25/0x30 Call xen_cleanhighmap() with 4MB aligned for page tables mapping to fix it. The unnecessory call of xen_cleanhighmap() in DEBUG mode is also removed. -v2: add comment about XEN alignment from Juergen. References: https://lists.xen.org/archives/html/xen-devel/2012-07/msg01562.html Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross [boris: added 'xen/mmu' tag to commit subject] Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 509f560bd0c6..58b09fcadbaa 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1238,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void) * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if + * As Xen is aligning the memory end to a 4MB boundary, for good + * measure we also round up to PMD_SIZE * 2 - which means that if * anybody is using __ka address to the initial boot-stack - and try * to use it - they are going to crash. The xen_start_info has been * taken care of already in xen_setup_kernel_pagetable. */ addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + size = xen_start_info->nr_pt_frames * PAGE_SIZE; - xen_cleanhighmap(addr, addr + size); + xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2)); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. */ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif } #endif -- cgit From 686fef928bba6be13cabe639f154af7d72b63120 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Sep 2017 06:38:17 -0700 Subject: timer: Prepare to change timer callback argument type Modern kernel callback systems pass the structure associated with a given callback to the callback function. The timer callback remains one of the legacy cases where an arbitrary unsigned long argument continues to be passed as the callback argument. This has several problems: - This bloats the timer_list structure with a normally redundant .data field. - No type checking is being performed, forcing callbacks to do explicit type casts of the unsigned long argument into the object that was passed, rather than using container_of(), as done in most of the other callback infrastructure. - Neighboring buffer overflows can overwrite both the .function and the .data field, providing attackers with a way to elevate from a buffer overflow into a simplistic ROP-like mechanism that allows calling arbitrary functions with a controlled first argument. - For future Control Flow Integrity work, this creates a unique function prototype for timer callbacks, instead of allowing them to continue to be clustered with other void functions that take a single unsigned long argument. This adds a new timer initialization API, which will ultimately replace the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family, named timer_setup() (to mirror hrtimer_setup(), making instances of its use much easier to grep for). In order to support the migration of existing timers into the new callback arguments, timer_setup() casts its arguments to the existing legacy types, and explicitly passes the timer pointer as the legacy data argument. Once all setup_*timer() callers have been replaced with timer_setup(), the casts can be removed, and the data argument can be dropped with the timer expiration code changed to just pass the timer to the callback directly. Since the regular pattern of using container_of() during local variable declaration repeats the need for the variable type declaration to be included, this adds a helper modeled after other from_*() helpers that wrap container_of(), named from_timer(). This helper uses typeof(*variable), removing the type redundancy and minimizing the need for line wraps in forthcoming conversions from "unsigned data long" to "struct timer_list *" in the timer callbacks: -void callback(unsigned long data) +void callback(struct timer_list *t) { - struct some_data_structure *local = (struct some_data_structure *)data; + struct some_data_structure *local = from_timer(local, t, timer); Finally, in order to support the handful of timer users that perform open-coded assignments of the .function (and .data) fields, provide cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used temporarily. Once conversion has been completed, these can be globally trivially removed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/timer.h b/include/linux/timer.h index e6789b8757d5..6383c528b148 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -168,6 +168,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +static inline void timer_setup(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + /** * timer_pending - is a timer pending? * @timer: the timer in question -- cgit From c0a1666bcb2a33e84187a15eabdcd54056be9a97 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Sep 2017 17:58:41 +0200 Subject: KVM: VMX: use cmpxchg64 This fixes a compilation failure on 32-bit systems. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b9d2140eb212..7f62c94196d1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2238,8 +2238,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.ndst = (dest << 8) & 0xFF00; new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -11730,8 +11730,8 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); @@ -11800,8 +11800,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); /* We should not block the vCPU if an interrupt is posted for it. */ if (pi_test_on(pi_desc) == 1) -- cgit From b28503a3fe6400757817e4460090f96bc1b9d6e7 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 15 Sep 2017 09:14:03 +0200 Subject: perf test: Fix vmlinux failure on s390x On s390x perf test 1 failed. It turned out that commit 4a084ecfc821 ("perf report: Fix module symbol adjustment for s390x") was incorrect. The previous implementation in dso__load_sym() is also suitable for s390x. Therefore this patch undoes commit 4a084ecfc821. Signed-off-by: Thomas-Mich Richter Cc: Hendrik Brueckner Cc: Zvonko Kosic Fixes: 4a084ecfc821 ("perf report: Fix module symbol adjustment for s390x") LPU-Reference: 20170915071404.58398-1-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-5ani7ly57zji7s0hmzkx416l@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/sym-handling.c | 8 -------- tools/perf/util/symbol-elf.c | 8 +------- tools/perf/util/symbol.h | 3 --- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c index e103f6e46afe..581d4c5a896b 100644 --- a/tools/perf/arch/s390/util/sym-handling.c +++ b/tools/perf/arch/s390/util/sym-handling.c @@ -18,12 +18,4 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) return false; return ehdr.e_type == ET_REL || ehdr.e_type == ET_DYN; } - -void arch__adjust_sym_map_offset(GElf_Sym *sym, - GElf_Shdr *shdr __maybe_unused, - struct map *map) -{ - if (map->type == MAP__FUNCTION) - sym->st_value += map->start; -} #endif diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 5c39f420111e..9cf781f0d8a2 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -810,12 +810,6 @@ static u64 ref_reloc(struct kmap *kmap) void __weak arch__sym_update(struct symbol *s __maybe_unused, GElf_Sym *sym __maybe_unused) { } -void __weak arch__adjust_sym_map_offset(GElf_Sym *sym, GElf_Shdr *shdr, - struct map *map __maybe_unused) -{ - sym->st_value -= shdr->sh_addr - shdr->sh_offset; -} - int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, int kmodule) { @@ -996,7 +990,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, /* Adjust symbol to map to file offset */ if (adjust_kernel_syms) - arch__adjust_sym_map_offset(&sym, &shdr, map); + sym.st_value -= shdr.sh_addr - shdr.sh_offset; if (strcmp(section_name, (curr_dso->short_name + diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 2bd6a1f01a1c..aad99e7e179b 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -344,9 +344,6 @@ int setup_intlist(struct intlist **list, const char *list_str, #ifdef HAVE_LIBELF_SUPPORT bool elf__needs_adjust_symbols(GElf_Ehdr ehdr); void arch__sym_update(struct symbol *s, GElf_Sym *sym); -void arch__adjust_sym_map_offset(GElf_Sym *sym, - GElf_Shdr *shdr __maybe_unused, - struct map *map __maybe_unused); #endif #define SYMBOL_A 0 -- cgit From 5357413f5c067f60933e4b8d79d483fbe62b2bb5 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 15 Sep 2017 09:14:04 +0200 Subject: perf test: Fix vmlinux failure on s390x part 2 On s390x perf test 1 failed. It turned out that commit cf6383f73cf2 ("perf report: Fix kernel symbol adjustment for s390x") was incorrect. The previous implementation in dso__load_sym() is also suitable for s390x. Therefore this patch undoes commit cf6383f73cf2 Signed-off-by: Thomas-Mich Richter Cc: Zvonko Kosic Cc: Hendrik Brueckner Fixes: cf6383f73cf2 ("perf report: Fix kernel symbol adjustment for s390x") LPU-Reference: 20170915071404.58398-2-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-v101o8k25vuja2ogosgf15yy@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/Build | 1 - tools/perf/arch/s390/util/sym-handling.c | 21 --------------------- 2 files changed, 22 deletions(-) delete mode 100644 tools/perf/arch/s390/util/sym-handling.c diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build index bd518b623d7a..5bd7b9260cc0 100644 --- a/tools/perf/arch/s390/util/Build +++ b/tools/perf/arch/s390/util/Build @@ -1,5 +1,4 @@ libperf-y += header.o -libperf-y += sym-handling.o libperf-y += kvm-stat.o libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c deleted file mode 100644 index 581d4c5a896b..000000000000 --- a/tools/perf/arch/s390/util/sym-handling.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Architecture specific ELF symbol handling and relocation mapping. - * - * Copyright 2017 IBM Corp. - * Author(s): Thomas Richter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - */ - -#include "symbol.h" - -#ifdef HAVE_LIBELF_SUPPORT -bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) -{ - if (ehdr.e_type == ET_EXEC) - return false; - return ehdr.e_type == ET_REL || ehdr.e_type == ET_DYN; -} -#endif -- cgit From e49aa15ef6c179f69e5578a271801f31a09e9a3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Sep 2017 13:20:32 -0700 Subject: Revert "Bluetooth: Add option for disabling legacy ioctl interfaces" This reverts commit dbbccdc4ced015cdd4051299bd87fbe0254ad351. It turns out that the "legacy" users aren't so legacy at all, and that turning off the legacy ioctl will break the current Qt bluetooth stack for bluetooth LE devices that were released just a couple of months ago. So it's simply not true that this was a legacy interface that hasn't been needed and is only limited to old legacy BT devices. Because I actually read Kconfig help messages, and actively try to turn off features that I don't need, I turned the option off. Then I spent _way_ too much time debugging BLE issues until I realized that it wasn't the Qt and subsurface development that had broken one of my dive computer BLE downloads, but simply my broken kernel config. Maybe in a decade it will be true that this is a legacy interface. And maybe with a better help-text and correct dependencies, this kind of legacy removal might be acceptable. But as things are right now both the commit message and the Kconfig help text were misleading, and the Kconfig option had the wrong dependenencies. There's no reason to keep that broken Kconfig option in the tree. Cc: Marcel Holtmann Cc: Johan Hedberg Signed-off-by: Linus Torvalds --- net/bluetooth/Kconfig | 10 ---------- net/bluetooth/hci_sock.c | 6 ------ 2 files changed, 16 deletions(-) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. - source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit From 69b73e95982649a1f2dc63b8f08f2113d28f7fed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Sep 2017 10:07:44 +0200 Subject: um/time: Fixup namespace collision The new timer_setup() function for struct timer_list collides with a private um function. Rename it. Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type") Signed-off-by: Thomas Gleixner Cc: Richard Weinberger Cc: Jeff Dike Cc: user-mode-linux-devel@lists.sourceforge.net Cc: Kees Cook --- arch/um/kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 0b034ebbda2a..7f69d17de354 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -98,7 +98,7 @@ static struct clocksource timer_clocksource = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void __init timer_setup(void) +static void __init um_timer_setup(void) { int err; @@ -132,5 +132,5 @@ void read_persistent_clock(struct timespec *ts) void __init time_init(void) { timer_set_signal_handler(); - late_time_init = timer_setup; + late_time_init = um_timer_setup; } -- cgit From 1593baab910da72480d651ea7bf2ce6e3a25a484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:09:26 +0200 Subject: sched/debug: Implement consistent task-state printing Currently get_task_state() and task_state_to_char() report different states, create a number of common helpers and unify the reported state space. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 15 ++------------- include/linux/sched.h | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 525157ca25cb..01196d3ad452 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -130,19 +130,8 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - - /* - * Parked tasks do not run; they sit in __kthread_parkme(). - * Without this check, we would report them as running, which is - * clearly wrong, so we report them as sleeping instead. - */ - if (tsk->state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - - return task_state_array[fls(state)]; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + return task_state_array[__get_task_state(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 92fb8dd5a9e4..163a0b738908 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,17 +1243,29 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } -static inline char task_state_to_char(struct task_struct *task) +static inline unsigned int __get_task_state(struct task_struct *tsk) { - const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - unsigned long state = task->state; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; - state = state ? __ffs(state) + 1 : 0; + if (tsk_state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + return fls(state); +} + +static inline char __task_state_to_char(unsigned int state) +{ + static const char state_char[] = "RSDTtXZ"; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); - return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; + return state_char[state]; +} + +static inline char task_state_to_char(struct task_struct *tsk) +{ + return __task_state_to_char(__get_task_state(tsk)); } /** -- cgit From 92c4bc9f9cd92a8581e36bc5105f03b569f37e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:13:36 +0200 Subject: sched/debug: Convert TASK_state to hex Bit patterns are easier in hex. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 163a0b738908..69bed5339ffa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,23 +65,23 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 16 -#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_NOLOAD 1024 -#define TASK_NEW 2048 -#define TASK_STATE_MAX 4096 +#define TASK_DEAD 0x0040 +#define TASK_WAKEKILL 0x0080 +#define TASK_WAKING 0x0100 +#define TASK_PARKED 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 +#define TASK_STATE_MAX 0x1000 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" -- cgit From 65d5dc47fe8530e17e318722fb4df676536c2bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:14:08 +0200 Subject: sched/debug: Remove unused variable Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { -- cgit From efb40f588b4370ffaeffafbd50f6ff213d954254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:19:53 +0200 Subject: sched/tracing: Fix trace_sched_switch task-state printing Convert trace_sched_switch to use the common task-state helpers and fix the "X" and "Z" order, possibly they ended up in the wrong order because TASK_REPORT has them in the wrong order too. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- include/trace/events/sched.h | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 69bed5339ffa..a2fe636b6825 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,7 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ae1409ffe99a..c63e20c9ef24 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -114,7 +114,10 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ - return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; + if (preempt) + return TASK_STATE_MAX; + + return __get_task_state(p); } #endif /* CREATE_TRACE_POINTS */ @@ -152,12 +155,13 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->prev_state & (TASK_STATE_MAX-1) ? - __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", - { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, - { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "K" }, { 256, "W" }, { 512, "P" }, - { 1024, "N" }) : "R", + + (__entry->prev_state & TASK_REPORT) ? + __print_flags(__entry->prev_state & TASK_REPORT, "|", + { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + "R", + __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); -- cgit From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Thu, 7 Sep 2017 20:00:58 +0530 Subject: locking/rwsem-xadd: Fix missed wakeup due to reordering of load If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -612,6 +612,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); + /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize -- cgit From 5f6ad26ea353fdf3dad2328052cbee49e0b9c5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:23:31 +0200 Subject: sched/tracing: Use common task-state helpers Remove yet another task-state char instance. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/trace/trace_output.c | 21 ++++++--------------- kernel/trace/trace_sched_wakeup.c | 8 ++++---- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a2fe636b6825..bc7807933415 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -83,8 +83,6 @@ struct task_group; #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" - /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit From 06eb61844d841d0032a9950ce7f8e783ee49c0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:30:40 +0200 Subject: sched/debug: Add explicit TASK_IDLE printing Markus reported that kthreads that idle using TASK_IDLE instead of TASK_INTERRUPTIBLE are reported in as TASK_UNINTERRUPTIBLE and things like htop mark those red. This is undesirable, so add an explicit state for TASK_IDLE. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 21 +++++++++++++-------- include/linux/sched.h | 12 ++++++++++-- include/trace/events/sched.h | 7 ++++--- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 01196d3ad452..a120a4549d48 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -119,18 +119,23 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char * const task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "X (dead)", /* 16 */ - "Z (zombie)", /* 32 */ + + /* states in TASK_REPORT: */ + "R (running)", /* 0x00 */ + "S (sleeping)", /* 0x01 */ + "D (disk sleep)", /* 0x02 */ + "T (stopped)", /* 0x04 */ + "t (tracing stop)", /* 0x08 */ + "X (dead)", /* 0x10 */ + "Z (zombie)", /* 0x20 */ + + /* states beyond TASK_REPORT: */ + "I (idle)", /* 0x40 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); return task_state_array[__get_task_state(tsk)]; } diff --git a/include/linux/sched.h b/include/linux/sched.h index bc7807933415..286fc1117046 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,22 +1241,30 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + static inline unsigned int __get_task_state(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + if (tsk_state == TASK_PARKED) state = TASK_INTERRUPTIBLE; + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return fls(state); } static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZ"; + static const char state_char[] = "RSDTtXZI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c63e20c9ef24..b371ef8206e1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -156,10 +156,11 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - (__entry->prev_state & TASK_REPORT) ? - __print_flags(__entry->prev_state & TASK_REPORT, "|", + (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? + __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, - { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, + { 0x40, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", -- cgit From 5d68cc95fb24b2f58060cc5340dd7402a11f054e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:32:41 +0200 Subject: sched/debug: Ignore TASK_IDLE for SysRq-W Markus reported that tasks in TASK_IDLE state are reported by SysRq-W, which results in undesirable clutter. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p) put_task_stack(p); } +static inline bool +state_filter_match(unsigned long state_filter, struct task_struct *p) +{ + /* no filter, everything matches */ + if (!state_filter) + return true; + + /* filter, but doesn't match */ + if (!(p->state & state_filter)) + return false; + + /* + * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows + * TASK_KILLABLE). + */ + if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + return false; + + return true; +} + + void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + if (state_filter_match(state_filter, p)) sched_show_task(p); } -- cgit From 8ef9925b02c23e3838d5e593c5cf37984141150f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:37:28 +0200 Subject: sched/debug: Add explicit TASK_PARKED printing Currently TASK_PARKED is masqueraded as TASK_INTERRUPTIBLE, give it its own print state because it will not in fact get woken by regular wakeups and is a long-term state. This requires moving TASK_PARKED into the TASK_REPORT mask, and since that latter needs to be a contiguous bitmask, we need to shuffle the bits around a bit. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 3 ++- include/linux/sched.h | 16 +++++++--------- include/trace/events/sched.h | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index a120a4549d48..77a8eacbe032 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -128,9 +128,10 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 0x08 */ "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ + "P (parked)", /* 0x40 */ /* states beyond TASK_REPORT: */ - "I (idle)", /* 0x40 */ + "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 286fc1117046..26a7df4e558c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -75,10 +75,10 @@ struct task_group; #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 0x0040 -#define TASK_WAKEKILL 0x0080 -#define TASK_WAKING 0x0100 -#define TASK_PARKED 0x0200 +#define TASK_PARKED 0x0040 +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 @@ -97,7 +97,8 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) @@ -1251,9 +1252,6 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; @@ -1262,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZI"; + static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index b371ef8206e1..3c8b7f625670 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -160,7 +160,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, - { 0x40, "I" }) : + { 0x40, "P" }, { 0x80, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", -- cgit From 520a13c530aeb5f63e011d668c42db1af19ed349 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 28 Sep 2017 16:58:26 -0500 Subject: x86/asm: Fix inline asm call constraints for GCC 4.4 The kernel test bot (run by Xiaolong Ye) reported that the following commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") is causing double faults in a kernel compiled with GCC 4.4. Linus subsequently diagnosed the crash pattern and the buggy commit and found that the issue is with this code: register unsigned int __asm_call_sp asm("esp"); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) Even on a 64-bit kernel, it's using ESP instead of RSP. That causes GCC to produce the following bogus code: ffffffff8147461d: 89 e0 mov %esp,%eax ffffffff8147461f: 4c 89 f7 mov %r14,%rdi ffffffff81474622: 4c 89 fe mov %r15,%rsi ffffffff81474625: ba 20 00 00 00 mov $0x20,%edx ffffffff8147462a: 89 c4 mov %eax,%esp ffffffff8147462c: e8 bf 52 05 00 callq ffffffff814c98f0 Despite the absurdity of it backing up and restoring the stack pointer for no reason, the bug is actually the fact that it's only backing up and restoring the lower 32 bits of the stack pointer. The upper 32 bits are getting cleared out, corrupting the stack pointer. So change the '__asm_call_sp' register variable to be associated with the actual full-size stack pointer. This also requires changing the __ASM_SEL() macro to be based on the actual compiled arch size, rather than the CONFIG value, because CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso). Otherwise Clang fails to build the kernel because it complains about the use of a 64-bit register (RSP) in a 32-bit file. Reported-and-Bisected-and-Tested-by: kernel test robot Diagnosed-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: LKP Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index c1eadbaf1115..30c3c9ac784a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -11,10 +11,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif @@ -139,7 +141,7 @@ * gets set up by the containing function. If you forget to do this, objtool * may print a "call without frame pointer save/setup" warning. */ -register unsigned int __asm_call_sp asm("esp"); +register unsigned long __asm_call_sp asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) #endif -- cgit From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Mon, 4 Sep 2017 13:59:34 +0800 Subject: sched/sysctl: Check user input value of sysctl_sched_time_avg System will hang if user set sysctl_sched_time_avg to 0: [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0 Stack traceback for pid 0 0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3 ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000 0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8 ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08 Call Trace: [] ? update_group_capacity+0x110/0x200 [] ? update_sd_lb_stats+0x109/0x600 [] ? find_busiest_group+0x47/0x530 [] ? load_balance+0x194/0x900 [] ? update_rq_clock.part.83+0x1a/0xe0 [] ? rebalance_domains+0x152/0x290 [] ? run_rebalance_domains+0xdc/0x1d0 [] ? __do_softirq+0xfb/0x320 [] ? irq_exit+0x125/0x130 [] ? scheduler_ipi+0x97/0x160 [] ? smp_reschedule_interrupt+0x29/0x30 [] ? reschedule_interrupt+0x6e/0x80 [] ? cpuidle_enter_state+0xcc/0x230 [] ? cpuidle_enter_state+0x9c/0x230 [] ? cpuidle_enter+0x17/0x20 [] ? cpu_startup_entry+0x38c/0x420 [] ? start_secondary+0x173/0x1e0 Because divide-by-zero error happens in function: update_group_capacity() update_cpu_capacity() scale_rt_capacity() { ... total = sched_avg_period() + delta; used = div_u64(avg, total); ... } To fix this issue, check user input value of sysctl_sched_time_avg, keep it unchanged when hitting invalid input, and set the minimum limit of sysctl_sched_time_avg to 1 ms. Reported-by: James Puthukattukaran Signed-off-by: Ethan Zhao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: ethan.kernel@gmail.com Cc: keescook@chromium.org Cc: mcgrof@kernel.org Cc: Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..423554ad3610 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, #ifdef CONFIG_SCHEDSTATS { -- cgit From 305d0ab4764d36a02c8e7cddb67099aca65351ce Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 28 Sep 2017 18:16:44 -0700 Subject: KVM: nVMX: Fix nested #PF intends to break L1's vmlauch/vmresume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------[ cut here ]------------ WARNING: CPU: 4 PID: 5280 at /home/kernel/linux/arch/x86/kvm//vmx.c:11394 nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel] CPU: 4 PID: 5280 Comm: qemu-system-x86 Tainted: G W OE 4.13.0+ #17 RIP: 0010:nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel] Call Trace: ? emulator_read_emulated+0x15/0x20 [kvm] ? segmented_read+0xae/0xf0 [kvm] vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel] ? vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel] x86_emulate_instruction+0x733/0x810 [kvm] vmx_handle_exit+0x2f4/0xda0 [kvm_intel] ? kvm_arch_vcpu_ioctl_run+0xd2f/0x1c60 [kvm] kvm_arch_vcpu_ioctl_run+0xdab/0x1c60 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc2 A nested #PF is triggered during L0 emulating instruction for L2. However, it doesn't consider we should not break L1's vmlauch/vmresme. This patch fixes it by queuing the #PF exception instead ,requesting an immediate VM exit from L2 and keeping the exception for L1 pending for a subsequent nested VM exit. This should actually work all the time, making vmx_inject_page_fault_nested totally unnecessary. However, that's not working yet, so this patch can work around the issue in the meanwhile. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7f62c94196d1..5bfa353f6354 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9845,7 +9845,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { vmcs12->vm_exit_intr_error_code = fault->error_code; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | -- cgit From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 29 Sep 2017 19:01:45 +0800 Subject: kvm/x86: Handle async PF in RCU read-side critical sections Sasha Levin reported a WARNING: | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 ... | CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS | 1.10.1-1ubuntu1 04/01/2014 | Call Trace: ... | RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 | RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002 | RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000 | RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410 | RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001 | R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08 | R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040 | __schedule+0x201/0x2240 kernel/sched/core.c:3292 | schedule+0x113/0x460 kernel/sched/core.c:3421 | kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158 | do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271 | async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069 | RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996 | RSP: 0018:ffff88003b2df520 EFLAGS: 00010283 | RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670 | RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140 | RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000 | R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8 | R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000 | vsnprintf+0x173/0x1700 lib/vsprintf.c:2136 | sprintf+0xbe/0xf0 lib/vsprintf.c:2386 | proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23 | get_link fs/namei.c:1047 [inline] | link_path_walk+0x1041/0x1490 fs/namei.c:2127 ... This happened when the host hit a page fault, and delivered it as in an async page fault, while the guest was in an RCU read-side critical section. The guest then tries to reschedule in kvm_async_pf_task_wait(), but rcu_preempt_note_context_switch() would treat the reschedule as a sleep in RCU read-side critical section, which is not allowed (even in preemptible RCU). Thus the WARN. To cure this, make kvm_async_pf_task_wait() go to the halt path if the PF happens in a RCU read-side critical section. Reported-by: Sasha Levin Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Boqun Feng Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index aa60a08b65b1..e675704fa6f7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1 || + rcu_preempt_depth(); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); -- cgit From f069faba688701c4d56b6c3452a130f97bf02e95 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 11:29:55 +0100 Subject: arm64: mm: Use READ_ONCE when dereferencing pointer to pte table On kernels built with support for transparent huge pages, different CPUs can access the PMD concurrently due to e.g. fast GUP or page_vma_mapped_walk and they must take care to use READ_ONCE to avoid value tearing or caching of stale values by the compiler. Unfortunately, these functions call into our pgtable macros, which don't use READ_ONCE, and compiler caching has been observed to cause the following crash during ext4 writeback: PC is at check_pte+0x20/0x170 LR is at page_vma_mapped_walk+0x2e0/0x540 [...] Process doio (pid: 2463, stack limit = 0xffff00000f2e8000) Call trace: [] check_pte+0x20/0x170 [] page_vma_mapped_walk+0x2e0/0x540 [] page_mkclean_one+0xac/0x278 [] rmap_walk_file+0xf0/0x238 [] rmap_walk+0x64/0xa0 [] page_mkclean+0x90/0xa8 [] clear_page_dirty_for_io+0x84/0x2a8 [] mpage_submit_page+0x34/0x98 [] mpage_process_page_bufs+0x164/0x170 [] mpage_prepare_extent_to_map+0x134/0x2b8 [] ext4_writepages+0x484/0xe30 [] do_writepages+0x44/0xe8 [] __filemap_fdatawrite_range+0xbc/0x110 [] file_write_and_wait_range+0x48/0xd8 [] ext4_sync_file+0x80/0x4b8 [] vfs_fsync_range+0x64/0xc0 [] SyS_msync+0x194/0x1e8 This is because page_vma_mapped_walk loads the PMD twice before calling pte_offset_map: the first time without READ_ONCE (where it gets all zeroes due to a concurrent pmdp_invalidate) and the second time with READ_ONCE (where it sees a valid table pointer due to a concurrent pmd_populate). However, the compiler inlines everything and caches the first value in a register, which is subsequently used in pte_offset_phys which returns a junk pointer that is later dereferenced when attempting to access the relevant pte. This patch fixes the issue by using READ_ONCE in pte_offset_phys to ensure that a stale value is not used. Whilst this is a point fix for a known failure (and simple to backport), a full fix moving all of our page table accessors over to {READ,WRITE}_ONCE and consistently using READ_ONCE in page_vma_mapped_walk is in the works for a future kernel release. Cc: Jon Masters Cc: Timur Tabi Cc: Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()") Tested-by: Richard Ruigrok Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bc4e92337d16..b46e54c2399b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -401,7 +401,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) /* Find an entry in the third-level page table. */ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -- cgit From 760bfb47c36a07741a089bf6a28e854ffbee7dc9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 12:27:41 +0100 Subject: arm64: fault: Route pte translation faults via do_translation_fault We currently route pte translation faults via do_page_fault, which elides the address check against TASK_SIZE before invoking the mm fault handling code. However, this can cause issues with the path walking code in conjunction with our word-at-a-time implementation because load_unaligned_zeropad can end up faulting in kernel space if it reads across a page boundary and runs into a page fault (e.g. by attempting to read from a guard region). In the case of such a fault, load_unaligned_zeropad has registered a fixup to shift the valid data and pad with zeroes, however the abort is reported as a level 3 translation fault and we dispatch it straight to do_page_fault, despite it being a kernel address. This results in calling a sleeping function from atomic context: BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313 in_atomic(): 0, irqs_disabled(): 0, pid: 10290 Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [...] [] ___might_sleep+0x134/0x144 [] __might_sleep+0x7c/0x8c [] do_page_fault+0x140/0x330 [] do_mem_abort+0x54/0xb0 Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0) [...] [] el1_da+0x18/0x78 [] path_parentat+0x44/0x88 [] filename_parentat+0x5c/0xd8 [] filename_create+0x4c/0x128 [] SyS_mkdirat+0x50/0xc8 [] el0_svc_naked+0x24/0x28 Code: 36380080 d5384100 f9400800 9402566d (d4210000) ---[ end trace 2d01889f2bca9b9f ]--- Fix this by dispatching all translation faults to do_translation_faults, which avoids invoking the page fault logic for faults on kernel addresses. Cc: Reported-by: Ankit Jain Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 89993c4be1be..2069e9bc0fca 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -651,7 +651,7 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, - { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_bad, SIGBUS, 0, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, -- cgit From bc829ee36e0ec92383c9d9b88fe08f00d4d592f8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 29 Sep 2017 11:24:19 -0500 Subject: x86/mm: Disable branch profiling in mem_encrypt.c Some routines in mem_encrypt.c are called very early in the boot process, e.g. sme_encrypt_kernel(). When CONFIG_TRACE_BRANCH_PROFILING=y is defined the resulting branch profiling associated with the check to see if SME is active results in a kernel crash. Disable branch profiling for mem_encrypt.c by defining DISABLE_BRANCH_PROFILING before including any header files. Reported-by: kernel test robot Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929162419.6016.53390.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3fcc8e01683b..16c5f37933a2 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -10,6 +10,8 @@ * published by the Free Software Foundation. */ +#define DISABLE_BRANCH_PROFILING + #include #include #include -- cgit From 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2017 17:15:36 +0300 Subject: x86/asm: Use register variable to get stack pointer value Currently we use current_stack_pointer() function to get the value of the stack pointer register. Since commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") ... we have a stack register variable declared. It can be used instead of current_stack_pointer() function which allows to optimize away some excessive "mov %rsp, %" instructions: -mov %rsp,%rdx -sub %rdx,%rax -cmp $0x3fff,%rax -ja ffffffff810722fd +sub %rsp,%rax +cmp $0x3fff,%rax +ja ffffffff810722fa Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer and use it instead of the removed function. Signed-off-by: Andrey Ryabinin Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 4 ++-- arch/x86/include/asm/thread_info.h | 11 ----------- arch/x86/kernel/irq_32.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- arch/x86/mm/tlb.c | 2 +- 5 files changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 30c3c9ac784a..b0dc91f4bedc 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -141,8 +141,8 @@ * gets set up by the containing function. If you forget to do this, objtool * may print a "call without frame pointer save/setup" warning. */ -register unsigned long __asm_call_sp asm(_ASM_SP); -#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 5161da1a0fa0..89e7eeb5cec1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -158,17 +158,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - /* * Walks up the stack frames to make sure that the specified object is * entirely contained by a single stack frame. diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1f38d9a4d9de..d4eb450144fd 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -64,7 +64,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -139,7 +139,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 34ea3651362e..67db4f43309e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -142,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93fe97cce581..49d9778376d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -191,7 +191,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer()); + unsigned int index = pgd_index(current_stack_pointer); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) -- cgit From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2017 13:43:15 -0400 Subject: fix infoleak in waitid(2) kernel_waitid() can return a PID, an error or 0. rusage is filled in the first case and waitid(2) rusage should've been copied out exactly in that case, *not* whenever kernel_waitid() has not returned an error. Compat variant shares that braino; none of kernel_wait4() callers do, so the below ought to fix it. Reported-and-tested-by: Alexander Potapenko Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland") Cc: stable@vger.kernel.org # v4.13 Signed-off-by: Al Viro --- kernel/exit.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f2cd53e92147 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; + if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } @@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err && uru) { - /* kernel_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - err = copy_to_user(uru, &ru, sizeof(ru)); - else - err = put_compat_rusage(&ru, uru); - if (err) - return -EFAULT; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } } if (!infop) -- cgit From 9e66317d3c92ddaab330c125dfe9d06eee268aff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Oct 2017 14:54:54 -0700 Subject: Linux 4.14-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f0c5b21fadb2..cf007a31d575 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Fearless Coyote # *DOCUMENTATION* -- cgit From d522bb6a105ffabeb765ad890062199fd4d18245 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 17:53:36 -0700 Subject: ALSA: sh: aica: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This requires adding a pointer to hold the timer's target substream, as there won't be a way to pass this in the future. Signed-off-by: Kees Cook Signed-off-by: Takashi Iwai --- sound/sh/aica.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sound/sh/aica.c b/sound/sh/aica.c index fdc680ae8aa0..2b26311405a4 100644 --- a/sound/sh/aica.c +++ b/sound/sh/aica.c @@ -299,14 +299,14 @@ static void run_spu_dma(struct work_struct *work) } } -static void aica_period_elapsed(unsigned long timer_var) +static void aica_period_elapsed(struct timer_list *t) { + struct snd_card_aica *dreamcastcard = from_timer(dreamcastcard, + t, timer); + struct snd_pcm_substream *substream = dreamcastcard->timer_substream; /*timer function - so cannot sleep */ int play_period; struct snd_pcm_runtime *runtime; - struct snd_pcm_substream *substream; - struct snd_card_aica *dreamcastcard; - substream = (struct snd_pcm_substream *) timer_var; runtime = substream->runtime; dreamcastcard = substream->pcm->private_data; /* Have we played out an additional period? */ @@ -336,12 +336,12 @@ static void spu_begin_dma(struct snd_pcm_substream *substream) /*get the queue to do the work */ schedule_work(&(dreamcastcard->spu_dma_work)); /* Timer may already be running */ - if (unlikely(dreamcastcard->timer.data)) { + if (unlikely(dreamcastcard->timer_substream)) { mod_timer(&dreamcastcard->timer, jiffies + 4); return; } - setup_timer(&dreamcastcard->timer, aica_period_elapsed, - (unsigned long) substream); + timer_setup(&dreamcastcard->timer, aica_period_elapsed, 0); + dreamcastcard->timer_substream = substream; mod_timer(&dreamcastcard->timer, jiffies + 4); } @@ -379,7 +379,7 @@ static int snd_aicapcm_pcm_close(struct snd_pcm_substream { struct snd_card_aica *dreamcastcard = substream->pcm->private_data; flush_work(&(dreamcastcard->spu_dma_work)); - if (dreamcastcard->timer.data) + if (dreamcastcard->timer_substream) del_timer(&dreamcastcard->timer); kfree(dreamcastcard->channel); spu_disable(); @@ -600,7 +600,7 @@ static int snd_aica_probe(struct platform_device *devptr) { int err; struct snd_card_aica *dreamcastcard; - dreamcastcard = kmalloc(sizeof(struct snd_card_aica), GFP_KERNEL); + dreamcastcard = kzalloc(sizeof(struct snd_card_aica), GFP_KERNEL); if (unlikely(!dreamcastcard)) return -ENOMEM; err = snd_card_new(&devptr->dev, index, SND_AICA_DRIVER, @@ -619,8 +619,6 @@ static int snd_aica_probe(struct platform_device *devptr) err = snd_aicapcmchip(dreamcastcard, 0); if (unlikely(err < 0)) goto freedreamcast; - dreamcastcard->timer.data = 0; - dreamcastcard->channel = NULL; /* Add basic controls */ err = add_aicamixer_controls(dreamcastcard); if (unlikely(err < 0)) -- cgit From 38e9a80f66beb108d30f47dc856dd17b983c3dd6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 17:53:33 -0700 Subject: ALSA: timer: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This adds a pointer back to struct snd_timer. Signed-off-by: Kees Cook Signed-off-by: Takashi Iwai --- sound/core/timer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 6cdd04a45962..09acaf2b2e57 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1028,15 +1028,17 @@ EXPORT_SYMBOL(snd_timer_global_register); struct snd_timer_system_private { struct timer_list tlist; + struct snd_timer *snd_timer; unsigned long last_expires; unsigned long last_jiffies; unsigned long correction; }; -static void snd_timer_s_function(unsigned long data) +static void snd_timer_s_function(struct timer_list *t) { - struct snd_timer *timer = (struct snd_timer *)data; - struct snd_timer_system_private *priv = timer->private_data; + struct snd_timer_system_private *priv = from_timer(priv, t, + tlist); + struct snd_timer *timer = priv->snd_timer; unsigned long jiff = jiffies; if (time_after(jiff, priv->last_expires)) priv->correction += (long)jiff - (long)priv->last_expires; @@ -1118,7 +1120,8 @@ static int snd_timer_register_system(void) snd_timer_free(timer); return -ENOMEM; } - setup_timer(&priv->tlist, snd_timer_s_function, (unsigned long) timer); + priv->snd_timer = timer; + timer_setup(&priv->tlist, snd_timer_s_function, 0); timer->private_data = priv; timer->private_free = snd_timer_free_system; return snd_timer_global_register(timer); -- cgit From 394ca81cb4c1518e9463fe342fb1ae8a9f46a82d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Oct 2017 17:53:40 -0700 Subject: ALSA: asihpi: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Signed-off-by: Kees Cook Signed-off-by: Takashi Iwai --- sound/pci/asihpi/asihpi.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sound/pci/asihpi/asihpi.c b/sound/pci/asihpi/asihpi.c index 70d023a85bf5..614322171c79 100644 --- a/sound/pci/asihpi/asihpi.c +++ b/sound/pci/asihpi/asihpi.c @@ -749,9 +749,9 @@ static inline unsigned int modulo_min(unsigned int a, unsigned int b, /** Timer function, equivalent to interrupt service routine for cards */ -static void snd_card_asihpi_timer_function(unsigned long data) +static void snd_card_asihpi_timer_function(struct timer_list *t) { - struct snd_card_asihpi_pcm *dpcm = (struct snd_card_asihpi_pcm *)data; + struct snd_card_asihpi_pcm *dpcm = from_timer(dpcm, t, timer); struct snd_pcm_substream *substream = dpcm->substream; struct snd_card_asihpi *card = snd_pcm_substream_chip(substream); struct snd_pcm_runtime *runtime; @@ -948,7 +948,7 @@ static void snd_card_asihpi_int_task(unsigned long data) asihpi = (struct snd_card_asihpi *)a->snd_card->private_data; if (asihpi->llmode_streampriv) snd_card_asihpi_timer_function( - (unsigned long)asihpi->llmode_streampriv); + &asihpi->llmode_streampriv->timer); } static void snd_card_asihpi_isr(struct hpi_adapter *a) @@ -1059,8 +1059,7 @@ static int snd_card_asihpi_playback_open(struct snd_pcm_substream *substream) If internal and other stream playing, can't switch */ - setup_timer(&dpcm->timer, snd_card_asihpi_timer_function, - (unsigned long) dpcm); + timer_setup(&dpcm->timer, snd_card_asihpi_timer_function, 0); dpcm->substream = substream; runtime->private_data = dpcm; runtime->private_free = snd_card_asihpi_runtime_free; @@ -1240,8 +1239,7 @@ static int snd_card_asihpi_capture_open(struct snd_pcm_substream *substream) if (err) return -EIO; - setup_timer(&dpcm->timer, snd_card_asihpi_timer_function, - (unsigned long) dpcm); + timer_setup(&dpcm->timer, snd_card_asihpi_timer_function, 0); dpcm->substream = substream; runtime->private_data = dpcm; runtime->private_free = snd_card_asihpi_runtime_free; -- cgit