diff options
Diffstat (limited to 'kernel')
48 files changed, 980 insertions, 499 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index 87f31bf1f0a0..8c201f414226 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -880,7 +880,7 @@ main_queue: return 0; } -int audit_send_list(void *_dest) +int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; @@ -924,19 +924,30 @@ out_kfree_skb: return NULL; } +static void audit_free_reply(struct audit_reply *reply) +{ + if (!reply) + return; + + if (reply->skb) + kfree_skb(reply->skb); + if (reply->net) + put_net(reply->net); + kfree(reply); +} + static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct sock *sk = audit_get_sk(reply->net); audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(sk, reply->skb, reply->portid, 0); - put_net(reply->net); - kfree(reply); + netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); + reply->skb = NULL; + audit_free_reply(reply); return 0; } @@ -950,35 +961,32 @@ static int audit_send_reply_thread(void *arg) * @payload: payload data * @size: payload size * - * Allocates an skb, builds the netlink message, and sends it to the port id. - * No failure notifications. + * Allocates a skb, builds the netlink message, and sends it to the port id. 
*/ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - struct net *net = sock_net(NETLINK_CB(request_skb).sk); - struct sk_buff *skb; struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); + struct audit_reply *reply; + reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; - skb = audit_make_reply(seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->net = get_net(net); + reply->skb = audit_make_reply(seq, type, done, multi, payload, size); + if (!reply->skb) + goto err; + reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; - reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); + if (IS_ERR(tsk)) + goto err; + + return; + +err: + audit_free_reply(reply); } /* @@ -1525,20 +1533,60 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_unlock(); } +/* Log information about who is connecting to the audit multicast socket */ +static void audit_log_multicast(int group, const char *op, int err) +{ + const struct cred *cred; + struct tty_struct *tty; + char comm[sizeof(current->comm)]; + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); + if (!ab) + return; + + cred = current_cred(); + tty = audit_get_tty(); + audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", + task_pid_nr(current), + from_kuid(&init_user_ns, cred->uid), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + tty ? 
tty_name(tty) : "(none)", + audit_get_sessionid(current)); + audit_put_tty(tty); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); /* exe= */ + audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); + audit_log_end(ab); +} + /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(struct net *net, int group) +static int audit_multicast_bind(struct net *net, int group) { + int err = 0; + if (!capable(CAP_AUDIT_READ)) - return -EPERM; + err = -EPERM; + audit_log_multicast(group, "connect", err); + return err; +} - return 0; +static void audit_multicast_unbind(struct net *net, int group) +{ + audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, - .bind = audit_bind, + .bind = audit_multicast_bind, + .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; diff --git a/kernel/audit.h b/kernel/audit.h index 2eed4d231624..f0233dc40b17 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -229,7 +229,7 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *_dest); +int audit_send_list_thread(void *_dest); extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 026e34da4ace..a10e2997aa6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1161,11 +1161,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { - u32 portid = NETLINK_CB(request_skb).portid; - struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; - int err = 0; /* We can't just spew out the rules here because we might fill * the available socket 
buffer space and deadlock waiting for @@ -1173,25 +1170,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(net); - dest->portid = portid; + dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); + dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); + put_net(dest->net); kfree(dest); - err = PTR_ERR(tsk); + return PTR_ERR(tsk); } - return err; + return 0; } int audit_comparator(u32 left, u32 op, u32 right) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 814406a35db1..468a23390457 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -130,6 +130,17 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; +struct audit_nfcfgop_tab { + enum audit_nfcfgop op; + const char *s; +}; + +static const struct audit_nfcfgop_tab audit_nfcfgs[] = { + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_UNREGISTER, "unregister" }, +}; + static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -2542,6 +2553,26 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } +void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op) +{ + struct audit_buffer *ab; + char comm[sizeof(current->comm)]; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + if (!ab) + return; + audit_log_format(ab, "table=%s family=%u entries=%u op=%s", + 
name, af, nentries, audit_nfcfgs[op].s); + + audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(__audit_log_nfcfg); + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -262,10 +262,10 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA_DIGEST_WORDS]; - u32 ws[SHA_WORKSPACE_WORDS]; + u32 digest[SHA1_DIGEST_WORDS]; + u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; @@ -277,7 +277,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha_init(digest); + sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation @@ -308,8 +308,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; - bsize = round_up(psize, SHA_MESSAGE_BYTES); - blocks = bsize / SHA_MESSAGE_BYTES; + bsize = round_up(psize, 
SHA1_BLOCK_SIZE); + blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); @@ -320,12 +320,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(digest, todo, ws); - todo += SHA_MESSAGE_BYTES; + sha1_transform(digest, todo, ws); + todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; - for (i = 0; i < SHA_DIGEST_WORDS; i++) + for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2843bbba9ca1..42c7a42fc9c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> +#include <asm/pgtable.h> #include <linux/bpf_lsm.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -281,27 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. 
*/ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) @@ -623,9 +626,20 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) mutex_lock(&map->freeze_mutex); - if ((vma->vm_flags & VM_WRITE) && map->frozen) { - err = -EPERM; - goto out; + if (vma->vm_flags & VM_WRITE) { + if (map->frozen) { + err = -EPERM; + goto out; + } + /* map is meant to be read-only, so do not allow mapping as + * writable, because it's possible to leak a writable page + * reference and allows user-space to still modify it after + * freezing, while verifier will assume contents do not change + */ + if (map->map_flags & BPF_F_RDONLY_PROG) { + err = -EACCES; + goto out; + } } /* set default open/close callbacks */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d7ee40e2748..efe14cf24bc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1168,14 +1168,14 @@ static void 
__reg_assign_32_into_64(struct bpf_reg_state *reg) * but must be positive otherwise set to worse case bounds * and refine later from tnum. */ - if (reg->s32_min_value > 0) - reg->smin_value = reg->s32_min_value; - else - reg->smin_value = 0; - if (reg->s32_max_value > 0) + if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) reg->smax_value = reg->s32_max_value; else reg->smax_value = U32_MAX; + if (reg->s32_min_value >= 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) @@ -10428,22 +10428,13 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_verifier_env *env) +static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) { - struct bpf_prog *prog = env->prog; - unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; - - /* This is expected to be cleaned up in the future with the KRSI effort - * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. 
- */ if (within_error_injection_list(addr) || !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; - verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); - return -EINVAL; } @@ -10654,11 +10645,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) goto out; } } + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + ret = check_attach_modify_return(prog, addr); + if (ret) + verbose(env, "%s() is not modifiable\n", + prog->aux->attach_func_name); + } + + if (ret) + goto out; tr->func.addr = (void *)addr; prog->aux->trampoline = tr; - - if (prog->expected_attach_type == BPF_MODIFY_RETURN) - ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 6f87352f8219..41ca996568df 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) return; /* - * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we - * see NULL updated_next or they see our updated stat. - */ - smp_mb(); - - /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. @@ -134,13 +131,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - /* - * Paired with the one in cgroup_rstat_cpu_updated(). - * Either they see NULL updated_next or we see their - * updated stat. 
- */ - smp_mb(); - return pos; } diff --git a/kernel/compat.c b/kernel/compat.c index 843dd17e6078..b8d2800bb4b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); - user_access_end(); + user_read_access_end(); return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); - user_access_end(); + user_write_access_end(); return 0; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 244d30544377..6ff2578ecf17 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -433,7 +433,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each - * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. 
*/ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); @@ -1343,7 +1343,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1368,7 +1368,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; @@ -1392,8 +1392,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1401,15 +1401,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1421,7 +1421,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1434,7 +1434,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -6,12 
+6,6 @@ #include <linux/export.h> /* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - -/* * stores the physical address of elf header of crash image * * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..914ff5a58dd5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. * Cannot be used in non-sleeping contexts @@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if 
(!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..b1991043b7d8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; diff --git a/kernel/events/core.c b/kernel/events/core.c index 633b4ae72ed5..e296c5c59c6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -95,11 +95,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -112,11 +112,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - if (!ret) - ret = data.ret; - } while (ret == -EAGAIN); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); + ret = !ret ? 
data.ret : -EAGAIN; + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } @@ -9404,7 +9409,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9464,7 +9469,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -11511,7 +11516,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); diff --git a/kernel/exit.c b/kernel/exit.c index d56fe51bdf07..c81805a6e03b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1565,7 +1565,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1574,10 +1574,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1692,7 +1692,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if 
(!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1701,10 +1701,10 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif diff --git a/kernel/fork.c b/kernel/fork.c index c40478e749a7..be98e94cb3cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include <linux/thread_info.h> #include <linux/stackleak.h> #include <linux/kasan.h> +#include <linux/scs.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..b4b9f960b610 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. 
* * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20d501af4f2e..d63c324895ea 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -72,6 +72,7 @@ config IRQ_DOMAIN config IRQ_SIM bool select IRQ_WORK + select IRQ_DOMAIN # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b992f88c5613..48006608baf0 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl> + * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com> */ -#include <linux/slab.h> -#include <linux/irq_sim.h> #include <linux/irq.h> +#include <linux/irq_sim.h> +#include <linux/irq_work.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +struct irq_sim_work_ctx { + struct irq_work work; + int irq_base; + unsigned int irq_count; + unsigned long *pending; + struct irq_domain *domain; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; + struct irq_sim_work_ctx *work_ctx; +}; struct irq_sim_devres { - struct irq_sim *sim; + struct irq_domain *domain; }; static void irq_sim_irqmask(struct irq_data 
*data) @@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type) return 0; } +static int irq_sim_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) + *state = test_bit(hwirq, irq_ctx->work_ctx->pending); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int irq_sim_set_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) { + assign_bit(hwirq, irq_ctx->work_ctx->pending, state); + if (state) + irq_work_queue(&irq_ctx->work_ctx->work); + } + break; + default: + return -EINVAL; + } + + return 0; +} + static struct irq_chip irq_sim_irqchip = { - .name = "irq_sim", - .irq_mask = irq_sim_irqmask, - .irq_unmask = irq_sim_irqunmask, - .irq_set_type = irq_sim_set_type, + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, + .irq_get_irqchip_state = irq_sim_get_irqchip_state, + .irq_set_irqchip_state = irq_sim_set_irqchip_state, }; static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; unsigned int offset = 0; - struct irq_sim *sim; int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - sim = container_of(work_ctx, struct irq_sim, work_ctx); - while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) { offset = find_next_bit(work_ctx->pending, - sim->irq_count, offset); + work_ctx->irq_count, offset); clear_bit(offset, work_ctx->pending); - irqnum = irq_sim_irqnum(sim, offset); + irqnum = 
irq_find_mapping(work_ctx->domain, offset); handle_simple_irq(irq_to_desc(irqnum)); } } +static int irq_sim_domain_map(struct irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct irq_sim_work_ctx *work_ctx = domain->host_data; + struct irq_sim_irq_ctx *irq_ctx; + + irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL); + if (!irq_ctx) + return -ENOMEM; + + irq_set_chip(virq, &irq_sim_irqchip); + irq_set_chip_data(virq, irq_ctx); + irq_set_handler(virq, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + irq_ctx->work_ctx = work_ctx; + + return 0; +} + +static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_sim_irq_ctx *irq_ctx; + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + irq_ctx = irq_data_get_irq_chip_data(irqd); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(irqd); + kfree(irq_ctx); +} + +static const struct irq_domain_ops irq_sim_domain_ops = { + .map = irq_sim_domain_map, + .unmap = irq_sim_domain_unmap, +}; + /** - * irq_sim_init - Initialize the interrupt simulator: allocate a range of - * dummy interrupts. + * irq_domain_create_sim - Create a new interrupt simulator irq_domain and + * allocate a range of dummy interrupts. * - * @sim: The interrupt simulator object to initialize. - * @num_irqs: Number of interrupts to allocate + * @fnode: struct fwnode_handle to be associated with this domain. + * @num_irqs: Number of interrupts to allocate. * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). 
*/ -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs) { - int i; + struct irq_sim_work_ctx *work_ctx; - sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); - if (!sim->irqs) - return -ENOMEM; + work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); + if (!work_ctx) + goto err_out; - sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); - if (sim->irq_base < 0) { - kfree(sim->irqs); - return sim->irq_base; - } + work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!work_ctx->pending) + goto err_free_work_ctx; - sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!sim->work_ctx.pending) { - kfree(sim->irqs); - irq_free_descs(sim->irq_base, num_irqs); - return -ENOMEM; - } + work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, + &irq_sim_domain_ops, + work_ctx); + if (!work_ctx->domain) + goto err_free_bitmap; - for (i = 0; i < num_irqs; i++) { - sim->irqs[i].irqnum = sim->irq_base + i; - sim->irqs[i].enabled = false; - irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); - irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); - irq_set_handler(sim->irq_base + i, &handle_simple_irq); - irq_modify_status(sim->irq_base + i, - IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); - } + work_ctx->irq_count = num_irqs; + init_irq_work(&work_ctx->work, irq_sim_handle_irq); - init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); - sim->irq_count = num_irqs; + return work_ctx->domain; - return sim->irq_base; +err_free_bitmap: + bitmap_free(work_ctx->pending); +err_free_work_ctx: + kfree(work_ctx); +err_out: + return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(irq_sim_init); +EXPORT_SYMBOL_GPL(irq_domain_create_sim); /** - * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt - * descriptors and allocated memory. 
+ * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free + * the interrupt descriptors and allocated memory. * - * @sim: The interrupt simulator to tear down. + * @domain: The interrupt simulator domain to tear down. */ -void irq_sim_fini(struct irq_sim *sim) +void irq_domain_remove_sim(struct irq_domain *domain) { - irq_work_sync(&sim->work_ctx.work); - bitmap_free(sim->work_ctx.pending); - irq_free_descs(sim->irq_base, sim->irq_count); - kfree(sim->irqs); + struct irq_sim_work_ctx *work_ctx = domain->host_data; + + irq_work_sync(&work_ctx->work); + bitmap_free(work_ctx->pending); + kfree(work_ctx); + + irq_domain_remove(domain); } -EXPORT_SYMBOL_GPL(irq_sim_fini); +EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_sim_release(struct device *dev, void *res) +static void devm_irq_domain_release_sim(struct device *dev, void *res) { struct irq_sim_devres *this = res; - irq_sim_fini(this->sim); + irq_domain_remove_sim(this->domain); } /** - * irq_sim_init - Initialize the interrupt simulator for a managed device. + * devm_irq_domain_create_sim - Create a new interrupt simulator for + * a managed device. * * @dev: Device to initialize the simulator object for. - * @sim: The interrupt simulator object to initialize. + * @fwnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR().
*/ -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs) +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs) { struct irq_sim_devres *dr; - int rv; - dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_irq_domain_release_sim, + sizeof(*dr), GFP_KERNEL); if (!dr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rv = irq_sim_init(sim, num_irqs); - if (rv < 0) { + dr->domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(dr->domain)) { devres_free(dr); - return rv; + return dr->domain; } - dr->sim = sim; devres_add(dev, dr); - - return rv; -} -EXPORT_SYMBOL_GPL(devm_irq_sim_init); - -/** - * irq_sim_fire - Enqueue an interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt which should be fired. - */ -void irq_sim_fire(struct irq_sim *sim, unsigned int offset) -{ - if (sim->irqs[offset].enabled) { - set_bit(offset, sim->work_ctx.pending); - irq_work_queue(&sim->work_ctx.work); - } -} -EXPORT_SYMBOL_GPL(irq_sim_fire); - -/** - * irq_sim_irqnum - Get the allocated number of a dummy interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt for which to retrieve - * the number. 
- */ -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) -{ - return sim->irqs[offset].irqnum; + return dr->domain; } -EXPORT_SYMBOL_GPL(irq_sim_irqnum); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 35b8d97c3a1d..a4c2c915511d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -132,14 +132,13 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { - struct device_node *of_node = to_of_node(fwnode); struct irqchip_fwid *fwid; struct irq_domain *domain; static atomic_t unknown_domains; domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), - GFP_KERNEL, of_node_to_nid(of_node)); + GFP_KERNEL, of_node_to_nid(to_of_node(fwnode))); if (!domain) return NULL; @@ -162,30 +161,16 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } -#ifdef CONFIG_ACPI - } else if (is_acpi_device_node(fwnode)) { - struct acpi_buffer buf = { - .length = ACPI_ALLOCATE_BUFFER, - }; - acpi_handle handle; - - handle = acpi_device_handle(to_acpi_device_node(fwnode)); - if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { - domain->name = buf.pointer; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - - domain->fwnode = fwnode; -#endif - } else if (of_node) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || + is_software_node(fwnode)) { char *name; /* - * DT paths contain '/', which debugfs is legitimately + * fwnode paths contain '/', which debugfs is legitimately * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... 
*/ - name = kasprintf(GFP_KERNEL, "%pOF", of_node); + name = kasprintf(GFP_KERNEL, "%pfw", fwnode); if (!name) { kfree(domain); return NULL; @@ -210,7 +195,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; } - of_node_get(of_node); + fwnode_handle_get(fwnode); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -259,7 +244,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(irq_domain_get_of_node(domain)); + fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); @@ -1047,6 +1032,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, return virq; } +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); + #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy @@ -1248,18 +1245,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, EXPORT_SYMBOL(irq_domain_set_info); /** - * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data - * @irq_data: The pointer to irq_data - */ -void irq_domain_reset_irq_data(struct irq_data *irq_data) -{ - irq_data->hwirq = 0; - irq_data->chip = &no_irq_chip; - irq_data->chip_data = NULL; -} -EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); - -/** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match * @virq: IRQ number to start with diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 453a8a0f4804..761911168438 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2619,6 
+2619,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2696,6 +2698,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..3f310df4a693 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2179,6 +2179,24 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2211,10 +2229,62 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? 
: arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2295,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2255,6 +2330,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } @@ -2420,6 +2497,7 @@ static const struct file_operations debugfs_kprobes_operations = { /* 
kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } @@ -2446,10 +2524,15 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) return 0; } +static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) +{ + mutex_unlock(&kprobe_mutex); +} + static const struct seq_operations kprobe_blacklist_seq_ops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, - .stop = kprobe_seq_stop, /* Reuse void function */ + .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6f1c8cba09c6..dd3cc0854c32 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -470,7 +470,7 @@ struct lock_trace { struct hlist_node hash_entry; u32 hash; u32 nr_entries; - unsigned long entries[0] __aligned(sizeof(unsigned long)); + unsigned long entries[] __aligned(sizeof(unsigned long)); }; #define LOCK_TRACE_SIZE_IN_LONGS \ (sizeof(struct lock_trace) / sizeof(unsigned long)) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9f090d64f00..cfdd5b93264d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,7 +141,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. 
*/ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) @@ -202,7 +201,6 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..a0f201d2e184 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2400,7 +2400,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2433,7 +2433,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2768,6 +2768,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + bool __weak module_exit_section(const char *name) { return strstarts(name, ".exit"); @@ -2946,8 +2951,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. 
*/ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; @@ -3150,6 +3154,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), @@ -3194,6 +3201,13 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ei_funcs), &mod->num_ei_funcs); #endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/padata.c b/kernel/padata.c index a6afa12fb75e..aae789896616 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -703,7 +703,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -718,7 +718,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, 
node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -734,8 +734,9 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, + &pinst->cpu_dead_node); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); #endif WARN_ON(!list_empty(&pinst->pslist)); @@ -939,9 +940,10 @@ static struct padata_instance *padata_alloc(const char *name, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, + &pinst->cpu_online_node); cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->node); + &pinst->cpu_dead_node); #endif put_online_cpus(); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c208566c844b..4d0e6e815a2b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -80,6 +80,18 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.rst>. +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + ---help--- + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. 
+ config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index e7e47d9be1e5..5899260a8bef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30bd28d1d418..02ec716a4927 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -67,6 +67,18 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); @@ -704,7 +716,7 @@ int hibernate(void) lock_system_sleep(); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } @@ -775,7 +787,7 @@ int hibernate(void) Exit: __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); - atomic_inc(&snapshot_device_available); + hibernate_release(); Unlock: unlock_system_sleep(); pr_info("hibernation exit\n"); @@ -880,7 +892,7 @@ static int software_resume(void) goto Unlock; /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if 
(!hibernate_acquire()) { error = -EBUSY; swsusp_close(FMODE_READ); goto Unlock; @@ -911,7 +923,7 @@ static int software_resume(void) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); pr_info("resume failed (%d)\n", error); - atomic_inc(&snapshot_device_available); + hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7cdc64dc2373..ba2094db6294 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -154,8 +154,8 @@ extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -/* If unset, the snapshot device cannot be open. */ -extern atomic_t snapshot_device_available; +extern bool hibernate_acquire(void); +extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7959449765d9..d5eedc2baa2a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,9 +35,13 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; + struct inode *bd_inode; } snapshot_state; -atomic_t snapshot_device_available = ATOMIC_INIT(1); +int is_hibernate_resume_dev(const struct inode *bd_inode) +{ + return hibernation_available() && snapshot_state.bd_inode == bd_inode; +} static int snapshot_open(struct inode *inode, struct file *filp) { @@ -49,13 +53,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) lock_system_sleep(); - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); + hibernate_release(); error = -ENOSYS; goto Unlock; } @@ -92,11 
+96,12 @@ static int snapshot_open(struct inode *inode, struct file *filp) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) - atomic_inc(&snapshot_device_available); + hibernate_release(); data->frozen = false; data->ready = false; data->platform_support = false; + data->bd_inode = NULL; Unlock: unlock_system_sleep(); @@ -112,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; + data->bd_inode = NULL; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -122,7 +128,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); + hibernate_release(); unlock_system_sleep(); @@ -204,6 +210,7 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { + struct block_device *bdev; sector_t offset; dev_t swdev; @@ -234,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data, data->swap = -1; return -EINVAL; } - data->swap = swap_type_of(swdev, offset, NULL); + data->swap = swap_type_of(swdev, offset, &bdev); if (data->swap < 0) return -ENODEV; + + data->bd_inode = bdev->bd_inode; + bdput(bdev); return 0; } diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 11f19c466af5..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -6,6 +6,7 @@ struct console_cmdline { char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ + bool user_specified; /* Specified by command line vs. 
platform */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE char *brl_options; /* Options for braille driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..9fdd6a42ad6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,6 +280,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; +static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -974,6 +975,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) user->idx = log_next_idx; user->seq = log_next_seq; break; + case SEEK_CUR: + /* + * It isn't supported due to the record nature of this + * interface: _SET _DATA and _END point to very specific + * record positions, while _CUR would be more useful in case + * of a byte-based log. Because of that, return the default + * errno value for invalid seek operation. + */ + ret = -ESPIPE; + break; default: ret = -EINVAL; } @@ -2140,7 +2151,7 @@ asmlinkage __visible void early_printk(const char *fmt, ...) 
#endif static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) + char *brl_options, bool user_specified) { struct console_cmdline *c; int i; @@ -2155,6 +2166,8 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; + if (user_specified) + c->user_specified = true; return 0; } } @@ -2164,6 +2177,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; + c->user_specified = user_specified; braille_set_options(c, brl_options); c->index = idx; @@ -2190,6 +2204,9 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; + if (str[0] == 0) + return 1; + if (_braille_console_setup(&str, &brl_options)) return 1; @@ -2218,7 +2235,7 @@ static int __init console_setup(char *str) idx = simple_strtoul(s, NULL, 10); *s = 0; - __add_preferred_console(buf, idx, options, brl_options); + __add_preferred_console(buf, idx, options, brl_options, true); console_set_on_cmdline = 1; return 1; } @@ -2239,7 +2256,7 @@ __setup("console=", console_setup); */ int add_preferred_console(char *name, int idx, char *options) { - return __add_preferred_console(name, idx, options, NULL); + return __add_preferred_console(name, idx, options, NULL, false); } bool console_suspend_enabled = true; @@ -2438,9 +2455,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; @@ -2652,6 +2669,63 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); /* + * 
This is called by register_console() to try to match + * the newly registered console with any of the ones selected + * by either the command line or add_preferred_console() and + * setup/enable it. + * + * Care needs to be taken with consoles that are statically + * enabled such as netconsole + */ +static int try_enable_new_console(struct console *newcon, bool user_specified) +{ + struct console_cmdline *c; + int i; + + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (c->user_specified != user_specified) + continue; + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; + + if (_braille_register_console(newcon, c)) + return 0; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + return -EIO; + } + newcon->flags |= CON_ENABLED; + if (i == preferred_console) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } + return 0; + } + + /* + * Some consoles, such as pstore and netconsole, can be enabled even + * without matching. Accept the pre-enabled consoles only when match() + * and setup() had a chance to be called.
+ */ + if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) + return 0; + + return -ENOENT; +} + +/* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the @@ -2672,11 +2746,9 @@ early_param("keep_bootcon", keep_bootcon_setup); */ void register_console(struct console *newcon) { - int i; unsigned long flags; struct console *bcon = NULL; - struct console_cmdline *c; - static bool has_preferred; + int err; for_each_console(bcon) { if (WARN(bcon == newcon, "console '%s%d' already registered\n", @@ -2701,15 +2773,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred || bcon || !console_drivers) - has_preferred = preferred_console >= 0; + if (!has_preferred_console || bcon || !console_drivers) + has_preferred_console = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (!has_preferred) { + if (!has_preferred_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2717,47 +2789,20 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred = true; + has_preferred_console = true; } } } - /* - * See if this console matches one we selected on - * the command line. 
- */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { - if (!newcon->match || - newcon->match(newcon, c->name, c->index, c->options) != 0) { - /* default matching */ - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; - - if (_braille_register_console(newcon, c)) - return; - - if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - break; - } + /* See if this console matches one we selected on the command line */ + err = try_enable_new_console(newcon, true); - newcon->flags |= CON_ENABLED; - if (i == preferred_console) { - newcon->flags |= CON_CONSDEV; - has_preferred = true; - } - break; - } + /* If not, try to match against the platform default(s) */ + if (err == -ENOENT) + err = try_enable_new_console(newcon, false); - if (!(newcon->flags & CON_ENABLED)) + /* printk() messages are not printed to the Braille console. 
*/ + if (err || newcon->flags & CON_BRL) return; /* @@ -2779,6 +2824,8 @@ void register_console(struct console *newcon) console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; + /* Ensure this flag is always set for the head of the list */ + newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; @@ -3144,6 +3191,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -3157,12 +3221,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? 
KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ @@ -3360,7 +3431,7 @@ out: EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and @@ -3378,7 +3449,7 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) } /** - * kmsg_dump_rewind - reset the interator + * kmsg_dump_rewind - reset the iterator * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f51385b86ea3..c716eadc7617 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -778,6 +778,24 @@ void rcu_irq_exit_preempt(void) "RCU in extended quiescent state!"); } +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) +{ + lockdep_assert_irqs_disabled(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} +#endif /* #ifdef CONFIG_PROVE_RCU */ + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * @@ -861,6 +879,67 @@ void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. 
Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, including RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. + * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. 
+ // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** @@ -907,26 +986,7 @@ noinstr void rcu_nmi_enter(void) incby = 1; } else if (!in_nmi()) { instrumentation_begin(); - if (tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - // We get here only if we had already exited the - // extended quiescent state and this was an - // interrupt (not an NMI). Therefore, (1) RCU is - // already watching and (2) The fact that we are in - // an interrupt handler and that the rcu_node lock - // is an irq-disabled lock prevents self-deadlock. - // So we can safely recheck under the lock. - raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - // A nohz_full CPU is in the kernel and RCU - // needs a quiescent state. Turn on the tick! 
- WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); - } + rcu_irq_enter_check_tick(); instrumentation_end(); } instrumentation_begin(); diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..491f1347bf43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..d0c9c287680a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. 
* * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43ba2d4a8eca..d7669027aede 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include <linux/nospec.h> #include <linux/kcov.h> +#include <linux/scs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -3940,6 +3941,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -6110,6 +6114,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..222a7a9ad543 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include <linux/kasan.h> +#include <linux/mm.h> +#include <linux/scs.h> +#include <linux/slab.h> +#include <linux/vmstat.h> + +static struct kmem_cache *scs_cache; + +static void __scs_account(void *s, int account) +{ + struct page *scs_page = virt_to_page(s); + + mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / SZ_1K)); +} + +static void *scs_alloc(int node) +{ + void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + + if (!s) + return NULL; + + *__scs_magic(s) = SCS_END_MAGIC; + + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. 
+ */ + kasan_poison_object_data(scs_cache, s); + __scs_account(s, 1); + return s; +} + +static void scs_free(void *s) +{ + __scs_account(s, -1); + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = task_scs_sp(tsk) = s; + return 0; +} + +static void scs_check_usage(struct task_struct *tsk) +{ + static unsigned long highest; + + unsigned long *p, prev, curr = highest, used = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) + return; + + for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + if (!READ_ONCE_NOCHECK(*p)) + break; + used++; + } + + while (used > curr) { + prev = cmpxchg_relaxed(&highest, curr, used); + + if (prev == curr) { + pr_info("%s (%d): highest shadow stack usage: %lu bytes\n", + tsk->comm, task_pid_nr(tsk), used); + break; + } + + curr = prev; + } +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); + scs_check_usage(tsk); + scs_free(s); +} diff --git a/kernel/signal.c b/kernel/signal.c index 284fc1600063..5ca48cc5da76 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: 
compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overridden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + 
to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; -#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } diff --git a/kernel/smp.c b/kernel/smp.c index c80486a7e3b8..472c2b274c65 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -709,7 +709,7 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. 
Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -void on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..891667a49bb7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) @@ -2634,6 +2634,7 @@ struct compat_sysinfo { COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; + struct compat_sysinfo s_32; do_sysinfo(&s); @@ -2658,23 +2659,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(info, sizeof(struct compat_sysinfo)) || - __put_user(s.uptime, &info->uptime) || - __put_user(s.loads[0], &info->loads[0]) || - __put_user(s.loads[1], &info->loads[1]) || - __put_user(s.loads[2], &info->loads[2]) || - __put_user(s.totalram, &info->totalram) || - __put_user(s.freeram, &info->freeram) || - __put_user(s.sharedram, &info->sharedram) || - __put_user(s.bufferram, &info->bufferram) || - __put_user(s.totalswap, &info->totalswap) || - __put_user(s.freeswap, &info->freeswap) || - __put_user(s.procs, &info->procs) || - __put_user(s.totalhigh, &info->totalhigh) || - __put_user(s.freehigh, &info->freehigh) || - __put_user(s.mem_unit, &info->mem_unit)) + memset(&s_32, 0, sizeof(s_32)); + s_32.uptime = s.uptime; + s_32.loads[0] = s.loads[0]; + s_32.loads[1] = s.loads[1]; + s_32.loads[2] = s.loads[2]; + s_32.totalram = s.totalram; + s_32.freeram = s.freeram; + s_32.sharedram = s.sharedram; + s_32.bufferram = s.bufferram; + s_32.totalswap = s.totalswap; + 
s_32.freeswap = s.freeswap; + s_32.procs = s.procs; + s_32.totalhigh = s.totalhigh; + s_32.freehigh = s.freehigh; + s_32.mem_unit = s.mem_unit; + if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; - return 0; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ca39dc3230cb..ea47f2084087 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -170,10 +170,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a010edc37ee0..92ba69b716dc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1500,7 +1500,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..f12e99b387b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8527,18 +8527,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. 
Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } |