Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/inode.c                 |   4
-rw-r--r--  kernel/kexec_handover.c            | 361
-rw-r--r--  kernel/trace/ring_buffer.c         |   2
-rw-r--r--  kernel/trace/trace.c               | 278
-rw-r--r--  kernel/trace/trace_irqsoff.c       |  23
-rw-r--r--  kernel/trace/trace_osnoise.c       |  11
-rw-r--r--  kernel/trace/trace_sched_wakeup.c  |  16
7 files changed, 568 insertions(+), 127 deletions(-)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f90bdcc0a047..81780bcf8d25 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -775,7 +775,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
     return 0;
 }
 
-static void bpf_free_inode(struct inode *inode)
+static void bpf_destroy_inode(struct inode *inode)
 {
     enum bpf_type type;
 
@@ -790,7 +790,7 @@ const struct super_operations bpf_super_ops = {
     .statfs         = simple_statfs,
     .drop_inode     = inode_just_drop,
     .show_options   = bpf_show_options,
-    .free_inode     = bpf_free_inode,
+    .destroy_inode  = bpf_destroy_inode,
 };
 
 enum {
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 5083c68c3a4e..76f0940fb485 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -18,6 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
 
 #include <asm/early_ioremap.h>
 
@@ -107,6 +108,29 @@ struct kho_serialization {
     struct khoser_mem_chunk *preserved_mem_map;
 };
 
+struct kho_out {
+    struct blocking_notifier_head chain_head;
+
+    struct dentry *dir;
+
+    struct mutex lock; /* protects KHO FDT finalization */
+
+    struct kho_serialization ser;
+    bool finalized;
+};
+
+static struct kho_out kho_out = {
+    .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
+    .lock = __MUTEX_INITIALIZER(kho_out.lock),
+    .ser = {
+        .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
+        .track = {
+            .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
+        },
+    },
+    .finalized = false,
+};
+
 static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
 {
     void *elm, *res;
@@ -165,6 +189,9 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
 
     might_sleep();
 
+    if (kho_out.finalized)
+        return -EBUSY;
+
     physxa = xa_load(&track->orders, order);
     if (!physxa) {
         int err;
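With struct kho_out defined ahead of __kho_preserve_order(), the finalization check lives in one place and every preservation entry point inherits it. A hypothetical caller would see the failure like this (illustrative sketch only, not part of the patch; example_preserve() is an invented name):

    /* Sketch: preservation attempted after KHO finalization fails fast. */
    static int example_preserve(struct folio *folio)
    {
        int err = kho_preserve_folio(folio);

        if (err == -EBUSY)
            pr_warn("KHO already finalized, nothing more can be preserved\n");
        return err;
    }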
@@ -248,6 +275,37 @@ struct folio *kho_restore_folio(phys_addr_t phys)
 }
 EXPORT_SYMBOL_GPL(kho_restore_folio);
 
+/**
+ * kho_restore_pages - restore list of contiguous order 0 pages.
+ * @phys: physical address of the first page.
+ * @nr_pages: number of pages.
+ *
+ * Restore a contiguous list of order 0 pages that was preserved with
+ * kho_preserve_pages().
+ *
+ * Return: pointer to the first struct page on success, NULL on failure.
+ */
+struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
+{
+    const unsigned long start_pfn = PHYS_PFN(phys);
+    const unsigned long end_pfn = start_pfn + nr_pages;
+    unsigned long pfn = start_pfn;
+
+    while (pfn < end_pfn) {
+        const unsigned int order =
+            min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+        struct page *page = kho_restore_page(PFN_PHYS(pfn));
+
+        if (!page)
+            return NULL;
+        split_page(page, order);
+        pfn += 1 << order;
+    }
+
+    return pfn_to_page(start_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_restore_pages);
+
 /* Serialize and deserialize struct kho_mem_phys across kexec
  *
  * Record all the bitmaps in a linked list of pages for the next kernel to
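kho_restore_pages() pairs with kho_preserve_pages(), which this patch introduces further down. A rough sketch of the intended round trip across kexec (the function names and the way the physical address travels to the next kernel, e.g. via a KHO FDT node, are assumptions for illustration):

    /* Old kernel: preserve a physically contiguous run of order 0 pages. */
    static int example_save(struct page *first, unsigned int nr_pages,
                            phys_addr_t *out_phys)
    {
        int err = kho_preserve_pages(first, nr_pages);

        if (err)
            return err;
        /* @out_phys travels to the next kernel, e.g. in a KHO FDT node. */
        *out_phys = page_to_phys(first);
        return 0;
    }

    /* New kernel: the same run comes back as individual order 0 pages. */
    static struct page *example_restore(phys_addr_t phys, unsigned int nr_pages)
    {
        return kho_restore_pages(phys, nr_pages);
    }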
@@ -667,29 +725,6 @@ int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
 }
 EXPORT_SYMBOL_GPL(kho_add_subtree);
 
-struct kho_out {
-    struct blocking_notifier_head chain_head;
-
-    struct dentry *dir;
-
-    struct mutex lock; /* protects KHO FDT finalization */
-
-    struct kho_serialization ser;
-    bool finalized;
-};
-
-static struct kho_out kho_out = {
-    .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
-    .lock = __MUTEX_INITIALIZER(kho_out.lock),
-    .ser = {
-        .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
-        .track = {
-            .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
-        },
-    },
-    .finalized = false,
-};
-
 int register_kho_notifier(struct notifier_block *nb)
 {
     return blocking_notifier_chain_register(&kho_out.chain_head, nb);
@@ -717,37 +752,28 @@ int kho_preserve_folio(struct folio *folio)
     const unsigned int order = folio_order(folio);
     struct kho_mem_track *track = &kho_out.ser.track;
 
-    if (kho_out.finalized)
-        return -EBUSY;
-
     return __kho_preserve_order(track, pfn, order);
 }
 EXPORT_SYMBOL_GPL(kho_preserve_folio);
 
 /**
- * kho_preserve_phys - preserve a physically contiguous range across kexec.
- * @phys: physical address of the range.
- * @size: size of the range.
+ * kho_preserve_pages - preserve contiguous pages across kexec
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
  *
- * Instructs KHO to preserve the memory range from @phys to @phys + @size
- * across kexec.
+ * Preserve a contiguous list of order 0 pages. Must be restored using
+ * kho_restore_pages() to ensure the pages are restored properly as order 0.
  *
  * Return: 0 on success, error code on failure
  */
-int kho_preserve_phys(phys_addr_t phys, size_t size)
+int kho_preserve_pages(struct page *page, unsigned int nr_pages)
 {
-    unsigned long pfn = PHYS_PFN(phys);
+    struct kho_mem_track *track = &kho_out.ser.track;
+    const unsigned long start_pfn = page_to_pfn(page);
+    const unsigned long end_pfn = start_pfn + nr_pages;
+    unsigned long pfn = start_pfn;
     unsigned long failed_pfn = 0;
-    const unsigned long start_pfn = pfn;
-    const unsigned long end_pfn = PHYS_PFN(phys + size);
     int err = 0;
-    struct kho_mem_track *track = &kho_out.ser.track;
-
-    if (kho_out.finalized)
-        return -EBUSY;
-
-    if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
-        return -EINVAL;
 
     while (pfn < end_pfn) {
         const unsigned int order =
@@ -767,7 +793,256 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
 
     return err;
 }
-EXPORT_SYMBOL_GPL(kho_preserve_phys);
+EXPORT_SYMBOL_GPL(kho_preserve_pages);
+
+struct kho_vmalloc_hdr {
+    DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+};
+
+#define KHO_VMALLOC_SIZE \
+    ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+     sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+    struct kho_vmalloc_hdr hdr;
+    phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+/* vmalloc flags KHO supports */
+#define KHO_VMALLOC_SUPPORTED_FLAGS    (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+/* KHO internal flags for vmalloc preservations */
+#define KHO_VMALLOC_ALLOC      0x0001
+#define KHO_VMALLOC_HUGE_VMAP  0x0002
+
+static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
+{
+    unsigned short kho_flags = 0;
+
+    if (vm_flags & VM_ALLOC)
+        kho_flags |= KHO_VMALLOC_ALLOC;
+    if (vm_flags & VM_ALLOW_HUGE_VMAP)
+        kho_flags |= KHO_VMALLOC_HUGE_VMAP;
+
+    return kho_flags;
+}
+
+static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
+{
+    unsigned int vm_flags = 0;
+
+    if (kho_flags & KHO_VMALLOC_ALLOC)
+        vm_flags |= VM_ALLOC;
+    if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
+        vm_flags |= VM_ALLOW_HUGE_VMAP;
+
+    return vm_flags;
+}
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+    struct kho_vmalloc_chunk *chunk;
+    int err;
+
+    chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
+    if (!chunk)
+        return NULL;
+
+    err = kho_preserve_pages(virt_to_page(chunk), 1);
+    if (err)
+        goto err_free;
+    if (cur)
+        KHOSER_STORE_PTR(cur->hdr.next, chunk);
+    return chunk;
+
+err_free:
+    free_page((unsigned long)chunk);
+    return NULL;
+}
+
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
+{
+    struct kho_mem_track *track = &kho_out.ser.track;
+    unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+
+    __kho_unpreserve(track, pfn, pfn + 1);
+
+    /* a zero entry terminates a partially filled chunk */
+    for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+        pfn = PHYS_PFN(chunk->phys[i]);
+        __kho_unpreserve(track, pfn, pfn + 1);
+    }
+}
+
+static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
+{
+    struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
+
+    while (chunk) {
+        struct kho_vmalloc_chunk *tmp = chunk;
+
+        kho_vmalloc_unpreserve_chunk(chunk);
+
+        chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+        free_page((unsigned long)tmp);
+    }
+}
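Each chunk is itself a preserved page: a small header linking to the next chunk, followed by as many physical addresses as fit in the rest of the page, with a zero entry terminating a partially filled chunk. A sketch of a walker over this format (illustrative only; example_count_pages() is an invented helper, and the loop is bounded so a completely full chunk is handled safely):

    /* Sketch: count the data pages recorded in a preservation's chunk list. */
    static unsigned long example_count_pages(const struct kho_vmalloc *pres)
    {
        struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(pres->first);
        unsigned long nr = 0;

        while (chunk) {
            /* A zero entry terminates a partially filled chunk. */
            for (int i = 0; i < KHO_VMALLOC_SIZE && chunk->phys[i]; i++)
                nr += 1UL << pres->order;
            chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
        }
        return nr;
    }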
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: placeholder for preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: memory allocated with the vmalloc_node() variants cannot be reliably
+ * restored on the same node.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
+{
+    struct kho_vmalloc_chunk *chunk;
+    struct vm_struct *vm = find_vm_area(ptr);
+    unsigned int order, flags, nr_contig_pages;
+    unsigned int idx = 0;
+    int err;
+
+    if (!vm)
+        return -EINVAL;
+
+    if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+        return -EOPNOTSUPP;
+
+    flags = vmalloc_flags_to_kho(vm->flags);
+    order = get_vm_area_page_order(vm);
+
+    chunk = new_vmalloc_chunk(NULL);
+    if (!chunk)
+        return -ENOMEM;
+    KHOSER_STORE_PTR(preservation->first, chunk);
+
+    nr_contig_pages = (1 << order);
+    for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
+        phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+        err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
+        if (err)
+            goto err_free;
+
+        chunk->phys[idx++] = phys;
+        if (idx == ARRAY_SIZE(chunk->phys)) {
+            chunk = new_vmalloc_chunk(chunk);
+            if (!chunk) {
+                err = -ENOMEM;
+                goto err_free;
+            }
+            idx = 0;
+        }
+    }
+
+    preservation->total_pages = vm->nr_pages;
+    preservation->flags = flags;
+    preservation->order = order;
+
+    return 0;
+
+err_free:
+    kho_vmalloc_free_chunks(preservation);
+    return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+    struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+    unsigned int align, order, shift, vm_flags;
+    unsigned long total_pages, contig_pages;
+    unsigned long addr, size;
+    struct vm_struct *area;
+    struct page **pages;
+    unsigned int idx = 0;
+    int err;
+
+    vm_flags = kho_flags_to_vmalloc(preservation->flags);
+    if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+        return NULL;
+
+    total_pages = preservation->total_pages;
+    pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
+    if (!pages)
+        return NULL;
+    order = preservation->order;
+    contig_pages = (1 << order);
+    shift = PAGE_SHIFT + order;
+    align = 1 << shift;
+
+    while (chunk) {
+        struct page *page;
+
+        for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+            phys_addr_t phys = chunk->phys[i];
+
+            if (idx + contig_pages > total_pages)
+                goto err_free_pages_array;
+
+            page = kho_restore_pages(phys, contig_pages);
+            if (!page)
+                goto err_free_pages_array;
+
+            for (int j = 0; j < contig_pages; j++)
+                pages[idx++] = page + j;
+        }
+
+        page = kho_restore_pages(virt_to_phys(chunk), 1);
+        if (!page)
+            goto err_free_pages_array;
+        chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+        __free_page(page);
+    }
+
+    if (idx != total_pages)
+        goto err_free_pages_array;
+
+    area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
+                              vm_flags, VMALLOC_START, VMALLOC_END,
+                              NUMA_NO_NODE, GFP_KERNEL,
+                              __builtin_return_address(0));
+    if (!area)
+        goto err_free_pages_array;
+
+    addr = (unsigned long)area->addr;
+    size = get_vm_area_size(area);
+    err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+    if (err)
+        goto err_free_vm_area;
+
+    area->nr_pages = total_pages;
+    area->pages = pages;
+
+    return area->addr;
+
+err_free_vm_area:
+    free_vm_area(area);
+err_free_pages_array:
+    kvfree(pages);
+    return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
 
 /* Handling for debug/kho/out */
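Taken together, the two functions let a subsystem carry a vmalloc() allocation across kexec: the old kernel records the backing pages in the chunk list, and the new kernel rebuilds an equivalent mapping from it. A rough usage sketch (the plumbing that moves struct kho_vmalloc between kernels is assumed and not shown; the function names are invented):

    /* Old kernel: preserve a vmalloc() buffer and hand over its metadata. */
    static int example_handover(void *buf, struct kho_vmalloc *meta)
    {
        int err = kho_preserve_vmalloc(buf, meta);

        if (err)
            return err;
        /* *meta would now be published, e.g. in a KHO FDT property. */
        return 0;
    }

    /* New kernel: rebuild the mapping from the preserved metadata. */
    static void *example_recover(const struct kho_vmalloc *meta)
    {
        return kho_restore_vmalloc(meta);
    }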
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 43460949ad3f..1244d2c5c384 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7273,7 +7273,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
         atomic_dec(&cpu_buffer->resize_disabled);
     }
 
-    return 0;
+    return err;
 }
 
 int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 156e7e0bf559..d1e527cf2aae 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4791,12 +4791,6 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
     return single_release(inode, filp);
 }
 
-static int tracing_mark_open(struct inode *inode, struct file *filp)
-{
-    stream_open(inode, filp);
-    return tracing_open_generic_tr(inode, filp);
-}
-
 static int tracing_release(struct inode *inode, struct file *file)
 {
     struct trace_array *tr = inode->i_private;
@@ -7163,7 +7157,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
 
 #define TRACE_MARKER_MAX_SIZE 4096
 
-static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf,
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf,
                                       size_t cnt, unsigned long ip)
 {
     struct ring_buffer_event *event;
@@ -7173,20 +7167,11 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
     int meta_size;
     ssize_t written;
     size_t size;
-    int len;
-
-/* Used in tracing_mark_raw_write() as well */
-#define FAULTED_STR "<faulted>"
-#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
 
     meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
  again:
     size = cnt + meta_size;
 
-    /* If less than "<faulted>", then make sure we can still add that */
-    if (cnt < FAULTED_SIZE)
-        size += FAULTED_SIZE - cnt;
-
     buffer = tr->array_buffer.buffer;
     event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
                                         tracing_gen_ctx());
@@ -7196,9 +7181,6 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
      * make it smaller and try again.
      */
     if (size > ring_buffer_max_event_size(buffer)) {
-        /* cnt < FAULTED size should never be bigger than max */
-        if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
-            return -EBADF;
         cnt = ring_buffer_max_event_size(buffer) - meta_size;
         /* The above should only happen once */
         if (WARN_ON_ONCE(cnt + meta_size == size))
@@ -7212,14 +7194,8 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
 
     entry = ring_buffer_event_data(event);
     entry->ip = ip;
-
-    len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
-    if (len) {
-        memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-        cnt = FAULTED_SIZE;
-        written = -EFAULT;
-    } else
-        written = cnt;
+    memcpy(&entry->buf, buf, cnt);
+    written = cnt;
 
     if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
         /* do not add \n before testing triggers, but add \0 */
@@ -7243,6 +7219,169 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
     return written;
 }
 
+struct trace_user_buf {
+    char    *buf;   /* TRACE_MARKER_MAX_SIZE bytes, one buffer per CPU */
+};
+
+struct trace_user_buf_info {
+    struct trace_user_buf __percpu  *tbuf;
+    int                             ref;    /* count of open trace_marker files */
+};
+
+static DEFINE_MUTEX(trace_user_buffer_mutex);
+static struct trace_user_buf_info *trace_user_buffer;
+
+static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo)
+{
+    char *buf;
+    int cpu;
+
+    for_each_possible_cpu(cpu) {
+        buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+        kfree(buf);
+    }
+    free_percpu(tinfo->tbuf);
+    kfree(tinfo);
+}
+
+static int trace_user_fault_buffer_enable(void)
+{
+    struct trace_user_buf_info *tinfo;
+    char *buf;
+    int cpu;
+
+    guard(mutex)(&trace_user_buffer_mutex);
+
+    if (trace_user_buffer) {
+        trace_user_buffer->ref++;
+        return 0;
+    }
+
+    tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+    if (!tinfo)
+        return -ENOMEM;
+
+    tinfo->tbuf = alloc_percpu(struct trace_user_buf);
+    if (!tinfo->tbuf) {
+        kfree(tinfo);
+        return -ENOMEM;
+    }
+
+    tinfo->ref = 1;
+
+    /* Clear each buffer in case of error */
+    for_each_possible_cpu(cpu) {
+        per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL;
+    }
+
+    for_each_possible_cpu(cpu) {
+        buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL,
+                           cpu_to_node(cpu));
+        if (!buf) {
+            trace_user_fault_buffer_free(tinfo);
+            return -ENOMEM;
+        }
+        per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf;
+    }
+
+    trace_user_buffer = tinfo;
+
+    return 0;
+}
+
+static void trace_user_fault_buffer_disable(void)
+{
+    struct trace_user_buf_info *tinfo;
+
+    guard(mutex)(&trace_user_buffer_mutex);
+
+    tinfo = trace_user_buffer;
+
+    if (WARN_ON_ONCE(!tinfo))
+        return;
+
+    if (--tinfo->ref)
+        return;
+
+    trace_user_fault_buffer_free(tinfo);
+    trace_user_buffer = NULL;
+}
+
+/* Must be called with preemption disabled */
+static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+                                   const char __user *ptr, size_t size,
+                                   size_t *read_size)
+{
+    int cpu = smp_processor_id();
+    char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+    unsigned int cnt;
+    int trys = 0;
+    int ret;
+
+    if (size > TRACE_MARKER_MAX_SIZE)
+        size = TRACE_MARKER_MAX_SIZE;
+    *read_size = 0;
+
+    /*
+     * This acts similar to a seqcount. The per CPU context switch count
+     * is recorded, migration is disabled and preemption is enabled. The
+     * user space memory is copied into the per CPU buffer. Preemption is
+     * disabled again, and if the per CPU context switch count is still
+     * the same, it means the buffer has not been corrupted. If the count
+     * is different, it is assumed the buffer is corrupted and the read
+     * must be tried again.
+     */
+    do {
+        /*
+         * If for some reason, copy_from_user() always causes a context
+         * switch, this would then cause an infinite loop.
+         * If this task is preempted by another user space task, it
+         * will cause this task to try again. But just in case something
+         * changes where the copying from user space causes another task
+         * to run, prevent this from going into an infinite loop.
+         * 100 tries should be plenty.
+         */
+        if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space"))
+            return NULL;
+
+        /* Read the current CPU context switch counter */
+        cnt = nr_context_switches_cpu(cpu);
+
+        /*
+         * Preemption is going to be enabled, but this task must
+         * remain on this CPU.
+         */
+        migrate_disable();
+
+        /*
+         * Now preemption is being enabled and another task can come in
+         * and use the same buffer and corrupt our data.
+         */
+        preempt_enable_notrace();
+
+        ret = __copy_from_user(buffer, ptr, size);
+
+        preempt_disable_notrace();
+        migrate_enable();
+
+        /* If it faulted, no need to test if the buffer was corrupted */
+        if (ret)
+            return NULL;
+
+        /*
+         * Preemption is disabled again, now check the per CPU context
+         * switch counter. If it doesn't match, then another user space
+         * process may have scheduled in and corrupted our buffer. In
+         * that case the copying must be retried.
+         */
+    } while (nr_context_switches_cpu(cpu) != cnt);
+
+    *read_size = size;
+    return buffer;
+}
+
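The loop above is the core of the change: it lets a faultable __copy_from_user() run with preemption enabled while still guaranteeing that the per CPU buffer was not reused by another task on the same CPU. Distilled to its skeleton (a sketch of the pattern only; like trace_user_fault_read() it must be entered with preemption disabled, and the retry cap from the real code is omitted for brevity):

    /*
     * Sketch: copy from user space into a per CPU scratch buffer, using the
     * CPU's context switch count as a seqcount-like stability check.
     */
    static char *example_stable_copy(char *buffer, const char __user *ptr,
                                     size_t size)
    {
        int cpu = smp_processor_id();
        unsigned int cnt;

        do {
            cnt = nr_context_switches_cpu(cpu);  /* sample "sequence" */
            migrate_disable();                   /* stay on this CPU */
            preempt_enable_notrace();            /* the copy may fault */
            if (__copy_from_user(buffer, ptr, size)) {
                preempt_disable_notrace();
                migrate_enable();
                return NULL;                     /* faulted: give up */
            }
            preempt_disable_notrace();
            migrate_enable();
        } while (nr_context_switches_cpu(cpu) != cnt);  /* raced: retry */

        return buffer;
    }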
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
                    size_t cnt, loff_t *fpos)
@@ -7250,6 +7389,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
     struct trace_array *tr = filp->private_data;
     ssize_t written = -ENODEV;
     unsigned long ip;
+    size_t size;
+    char *buf;
 
     if (tracing_disabled)
         return -EINVAL;
@@ -7263,6 +7404,16 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
     if (cnt > TRACE_MARKER_MAX_SIZE)
         cnt = TRACE_MARKER_MAX_SIZE;
 
+    /* Must have preemption disabled while having access to the buffer */
+    guard(preempt_notrace)();
+
+    buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+    if (!buf)
+        return -EFAULT;
+
+    if (cnt > size)
+        cnt = size;
+
     /* The selftests expect this function to be the IP address */
     ip = _THIS_IP_;
 
@@ -7270,32 +7421,28 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
     if (tr == &global_trace) {
         guard(rcu)();
         list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
-            written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+            written = write_marker_to_buffer(tr, buf, cnt, ip);
             if (written < 0)
                 break;
         }
     } else {
-        written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+        written = write_marker_to_buffer(tr, buf, cnt, ip);
     }
 
     return written;
 }
 
 static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
-                                          const char __user *ubuf, size_t cnt)
+                                          const char *buf, size_t cnt)
 {
     struct ring_buffer_event *event;
     struct trace_buffer *buffer;
     struct raw_data_entry *entry;
     ssize_t written;
-    int size;
-    int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+    size_t size;
 
-    size = sizeof(*entry) + cnt;
-    if (cnt < FAULT_SIZE_ID)
-        size += FAULT_SIZE_ID - cnt;
+    /* cnt includes both the entry->id and the data behind it. */
+    size = struct_size(entry, buf, cnt - sizeof(entry->id));
 
     buffer = tr->array_buffer.buffer;
 
@@ -7309,14 +7456,11 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
         return -EBADF;
 
     entry = ring_buffer_event_data(event);
-
-    len = copy_from_user_nofault(&entry->id, ubuf, cnt);
-    if (len) {
-        entry->id = -1;
-        memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-        written = -EFAULT;
-    } else
-        written = cnt;
+    unsafe_memcpy(&entry->id, buf, cnt,
+                  "id and content already reserved on ring buffer. "
+                  "'buf' includes the 'id' and the data. "
+                  "'entry' was allocated with cnt from 'id'.");
+    written = cnt;
 
     __buffer_unlock_commit(buffer, event);
 
@@ -7329,8 +7473,8 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 {
     struct trace_array *tr = filp->private_data;
     ssize_t written = -ENODEV;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+    size_t size;
+    char *buf;
 
     if (tracing_disabled)
         return -EINVAL;
@@ -7342,21 +7486,53 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
     if (cnt < sizeof(unsigned int))
         return -EINVAL;
 
+    /* Must have preemption disabled while having access to the buffer */
+    guard(preempt_notrace)();
+
+    buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+    if (!buf)
+        return -EFAULT;
+
+    /* raw write is all or nothing */
+    if (cnt > size)
+        return -EINVAL;
+
     /* The global trace_marker_raw can go to multiple instances */
     if (tr == &global_trace) {
         guard(rcu)();
         list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
-            written = write_raw_marker_to_buffer(tr, ubuf, cnt);
+            written = write_raw_marker_to_buffer(tr, buf, cnt);
             if (written < 0)
                 break;
         }
     } else {
-        written = write_raw_marker_to_buffer(tr, ubuf, cnt);
+        written = write_raw_marker_to_buffer(tr, buf, cnt);
     }
 
     return written;
 }
 
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+    int ret;
+
+    ret = trace_user_fault_buffer_enable();
+    if (ret < 0)
+        return ret;
+
+    stream_open(inode, filp);
+    ret = tracing_open_generic_tr(inode, filp);
+    if (ret < 0)
+        trace_user_fault_buffer_disable();
+    return ret;
+}
+
+static int tracing_mark_release(struct inode *inode, struct file *file)
+{
+    trace_user_fault_buffer_disable();
+    return tracing_release_generic_tr(inode, file);
+}
+
 static int tracing_clock_show(struct seq_file *m, void *v)
 {
     struct trace_array *tr = m->private;
@@ -7764,13 +7940,13 @@ static const struct file_operations tracing_free_buffer_fops = {
 static const struct file_operations tracing_mark_fops = {
     .open       = tracing_mark_open,
     .write      = tracing_mark_write,
-    .release    = tracing_release_generic_tr,
+    .release    = tracing_mark_release,
 };
 
 static const struct file_operations tracing_mark_raw_fops = {
     .open       = tracing_mark_open,
     .write      = tracing_mark_raw_write,
-    .release    = tracing_release_generic_tr,
+    .release    = tracing_mark_release,
 };
 
 static const struct file_operations trace_clock_fops = {
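The per CPU buffers only exist while a trace_marker file is open, which is why the release hooks above must drop the reference taken in tracing_mark_open(). From user space the interface is unchanged; a minimal writer might look like this (user-space sketch, assuming tracefs is mounted at /sys/kernel/tracing):

    /* User-space sketch: emit a message into the trace buffer. */
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);

        if (fd < 0)
            return 1;
        write(fd, "hello from user space", 21);
        close(fd);
        return 0;
    }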
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5496758b6c76..4c45c49b06c8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -184,7 +184,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
     unsigned long flags;
     unsigned int trace_ctx;
     u64 *calltime;
-    int ret;
+    int ret = 0;
 
     if (ftrace_graph_ignore_func(gops, trace))
         return 0;
@@ -202,13 +202,11 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
         return 0;
 
     calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
-    if (!calltime)
-        return 0;
-
-    *calltime = trace_clock_local();
-
-    trace_ctx = tracing_gen_ctx_flags(flags);
-    ret = __trace_graph_entry(tr, trace, trace_ctx);
+    if (calltime) {
+        *calltime = trace_clock_local();
+        trace_ctx = tracing_gen_ctx_flags(flags);
+        ret = __trace_graph_entry(tr, trace, trace_ctx);
+    }
     local_dec(&data->disabled);
 
     return ret;
@@ -233,11 +231,10 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
 
     rettime = trace_clock_local();
     calltime = fgraph_retrieve_data(gops->idx, &size);
-    if (!calltime)
-        return;
-
-    trace_ctx = tracing_gen_ctx_flags(flags);
-    __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
+    if (calltime) {
+        trace_ctx = tracing_gen_ctx_flags(flags);
+        __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
+    }
     local_dec(&data->disabled);
 }
 
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 12ee346820da..a9962d4497e8 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -24,6 +24,7 @@
 #include <linux/sched/clock.h>
 #include <uapi/linux/sched/types.h>
 #include <linux/sched.h>
+#include <linux/string.h>
 #include "trace.h"
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -2325,13 +2326,9 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
     if (count < 1)
         return 0;
 
-    buf = kmalloc(count + 1, GFP_KERNEL);
-    if (!buf)
-        return -ENOMEM;
-
-    if (copy_from_user(buf, ubuf, count))
-        return -EFAULT;
-    buf[count] = '\0';
+    buf = memdup_user_nul(ubuf, count);
+    if (IS_ERR(buf))
+        return PTR_ERR(buf);
 
     if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
         return -ENOMEM;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index bf1cb80742ae..e3f2e4f56faa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -138,12 +138,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
         return 0;
 
     calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
-    if (!calltime)
-        return 0;
-
-    *calltime = trace_clock_local();
-
-    ret = __trace_graph_entry(tr, trace, trace_ctx);
+    if (calltime) {
+        *calltime = trace_clock_local();
+        ret = __trace_graph_entry(tr, trace, trace_ctx);
+    }
     local_dec(&data->disabled);
     preempt_enable_notrace();
 
@@ -169,12 +167,10 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace,
 
     rettime = trace_clock_local();
     calltime = fgraph_retrieve_data(gops->idx, &size);
-    if (!calltime)
-        return;
+    if (calltime)
+        __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
 
-    __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
     local_dec(&data->disabled);
-    preempt_enable_notrace();
 
     return;
 }