From 958de668197651bbf2b4b9528f204ab5a0f1af65 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Oct 2019 21:07:31 +0200 Subject: module: Remove set_all_modules_text_*() Now that there are no users of set_all_modules_text_*() left, remove it. While it appears nds32 uses it, it does not have STRICT_MODULE_RWX and therefore ends up with the NOP stubs. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Greentime Hu Cc: H. Peter Anvin Cc: Jessica Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Chen Link: https://lkml.kernel.org/r/20191111132458.284298307@infradead.org Signed-off-by: Ingo Molnar --- kernel/module.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index acf7962936c4..5cd9bed595f4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2029,49 +2029,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -/* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - frob_text(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - } - mutex_unlock(&module_mutex); -} - -/* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro(void) -{ - struct module *mod; - - if (!rodata_enabled) - return; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - /* - * Ignore going modules since it's possible that ro - * protection has already been disabled, otherwise we'll - * run into protection faults at module deallocation. - */ - if (mod->state == MODULE_STATE_UNFORMED || - mod->state == MODULE_STATE_GOING) - continue; - - frob_text(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - } - mutex_unlock(&module_mutex); -} #else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } #endif /* CONFIG_STRICT_MODULE_RWX */ -- cgit From 04ae87a52074e2d448fc66143f1bd2c7d694d2b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Oct 2019 22:26:59 +0200 Subject: ftrace: Rework event_create_dir() Rework event_create_dir() to use an array of static data instead of function pointers where possible. The problem is that it would call the function pointer on module load before parse_args(), possibly even before jump_labels were initialized. Luckily the generated functions don't use jump_labels but it still seems fragile. It also gets in the way of changing when we make the module map executable. The generated function are basically calling trace_define_field() with a bunch of static arguments. So instead of a function, capture these arguments in a static array, avoiding the function call. Now there are a number of cases where the fields are dynamic (syscall arguments, kprobes and uprobes), in which case a static array does not work, for these we preserve the function call. Luckily all these cases are not related to modules and so we can retain the function call for them. Also fix up all broken tracepoint definitions that now generate a compile error. Tested-by: Alexei Starovoitov Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111132458.342979914@infradead.org Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 31 ++++++------ kernel/trace/trace_entries.h | 66 +++++++----------------- kernel/trace/trace_events.c | 20 +++++++- kernel/trace/trace_events_hist.c | 8 ++- kernel/trace/trace_export.c | 106 ++++++++++++++------------------------- kernel/trace/trace_kprobe.c | 16 +++++- kernel/trace/trace_syscalls.c | 50 ++++++++---------- kernel/trace/trace_uprobe.c | 9 +++- 8 files changed, 138 insertions(+), 168 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..298a7cacf146 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -49,6 +49,9 @@ enum trace_type { #undef __field #define __field(type, item) type item; +#undef __field_fn +#define __field_fn(type, item) type item; + #undef __field_struct #define __field_struct(type, item) __field(type, item) @@ -68,26 +71,22 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct struct_name { \ struct trace_entry ent; \ tstruct \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ - filter, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED -#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ - filter) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) __packed +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed #include "trace_entries.h" @@ -1899,17 +1898,15 @@ extern void tracing_log_err(struct trace_array *tr, #define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED -#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fc8e97328e54..3e9d81608284 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,15 +61,13 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) ), F_printk(" %ps <-- %ps", (void *)__entry->ip, (void *)__entry->parent_ip), - FILTER_TRACE_FN, - perf_ftrace_event_register ); @@ -84,9 +82,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), - - FILTER_OTHER + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) ); /* Function return entry */ @@ -97,18 +93,16 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long, ret, overrun ) __field_desc( unsigned long long, ret, calltime) __field_desc( unsigned long long, ret, rettime ) - __field_desc( unsigned long, ret, overrun ) __field_desc( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth), - - FILTER_OTHER + __entry->depth) ); /* @@ -137,9 +131,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu), - - FILTER_OTHER + __entry->next_cpu) ); /* @@ -157,9 +149,7 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu), - - FILTER_OTHER + __entry->next_cpu) ); /* @@ -183,9 +173,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, (void *)__entry->caller[0], (void *)__entry->caller[1], (void *)__entry->caller[2], (void *)__entry->caller[3], (void *)__entry->caller[4], (void *)__entry->caller[5], - (void *)__entry->caller[6], (void *)__entry->caller[7]), - - FILTER_OTHER + (void *)__entry->caller[6], (void *)__entry->caller[7]) ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -203,9 +191,7 @@ FTRACE_ENTRY(user_stack, userstack_entry, (void *)__entry->caller[0], (void *)__entry->caller[1], (void *)__entry->caller[2], (void *)__entry->caller[3], (void *)__entry->caller[4], (void *)__entry->caller[5], - (void *)__entry->caller[6], (void *)__entry->caller[7]), - - FILTER_OTHER + (void *)__entry->caller[6], (void *)__entry->caller[7]) ); /* @@ -222,9 +208,7 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%ps: %s", - (void *)__entry->ip, __entry->fmt), - - FILTER_OTHER + (void *)__entry->ip, __entry->fmt) ); FTRACE_ENTRY_REG(print, print_entry, @@ -239,8 +223,6 @@ FTRACE_ENTRY_REG(print, print_entry, F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), - FILTER_OTHER, - ftrace_event_register ); @@ -254,9 +236,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry, ), F_printk("id:%04x %08x", - __entry->id, (int)__entry->buf[0]), - - FILTER_OTHER + __entry->id, (int)__entry->buf[0]) ); FTRACE_ENTRY(bputs, bputs_entry, @@ -269,9 +249,7 @@ FTRACE_ENTRY(bputs, bputs_entry, ), F_printk("%ps: %s", - (void *)__entry->ip, __entry->str), - - FILTER_OTHER + (void *)__entry->ip, __entry->str) ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -283,16 +261,14 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, __field_desc( resource_size_t, rw, phys ) __field_desc( unsigned long, rw, value ) __field_desc( unsigned long, rw, pc ) - __field_desc( int, rw, map_id ) + __field_desc( int, rw, map_id ) __field_desc( unsigned char, rw, opcode ) __field_desc( unsigned char, rw, width ) ), F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width), - - FILTER_OTHER + __entry->map_id, __entry->opcode, __entry->width) ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -304,15 +280,13 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __field_desc( resource_size_t, map, phys ) __field_desc( unsigned long, map, virt ) __field_desc( unsigned long, map, len ) - __field_desc( int, map, map_id ) + __field_desc( int, map, map_id ) __field_desc( unsigned char, map, opcode ) ), F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode), - - FILTER_OTHER + __entry->map_id, __entry->opcode) ); @@ -334,9 +308,7 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)%s", __entry->line, __entry->func, __entry->file, __entry->correct, - __entry->constant ? " CONSTANT" : ""), - - FILTER_OTHER + __entry->constant ? " CONSTANT" : "") ); @@ -362,7 +334,5 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->duration, __entry->outer_duration, __entry->nmi_total_ts, - __entry->nmi_count), - - FILTER_OTHER + __entry->nmi_count) ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..5ab10c3dce78 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,6 +24,7 @@ #include #include +#include #include @@ -1990,7 +1991,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) */ head = trace_get_fields(call); if (list_empty(head)) { - ret = call->class->define_fields(call); + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + ret = field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type); + if (ret) + break; + + offset += field->size; + } if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..3bec92c020f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1135,6 +1135,12 @@ static struct synth_event *find_synth_event(const char *name) return NULL; } +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -1156,7 +1162,7 @@ static int register_synth_event(struct synth_event *event) INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &synth_event_funcs; - call->class->define_fields = synth_event_define_fields; + call->class->fields_array = synth_event_fields_array; ret = register_trace_event(&call->event); if (!ret) { diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..6d64c1c19fd5 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -29,10 +29,8 @@ static int ftrace_event_register(struct trace_event_call *call, * function and thus become accesible via perf. */ #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ - filter, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) /* not needed for this file */ #undef __field_struct @@ -41,6 +39,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field #define __field(type, item) type item; +#undef __field_fn +#define __field_fn(type, item) type item; + #undef __field_desc #define __field_desc(type, container, item) type item; @@ -60,7 +61,7 @@ static int ftrace_event_register(struct trace_event_call *call, #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct ____ftrace_##name { \ tstruct \ }; \ @@ -73,76 +74,46 @@ static void __always_unused ____ftrace_check_##name(void) \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ - filter) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +#undef __field_ext +#define __field_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = _filter_type }, + #undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) + +#undef __field_fn +#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) #undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) #undef __array -#define __array(type, item, len) \ - do { \ - char *type_str = #type"["__stringify(len)"]"; \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, type_str, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; \ - } while (0); +#define __array(_type, _item, _len) { \ + .type = #_type"["__stringify(_len)"]", .name = #_item, \ + .size = sizeof(_type[_len]), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef __array_desc -#define __array_desc(type, container, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), filter_type); \ - if (ret) \ - return ret; +#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) #undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_define_field(event_call, #type "[]", #item, \ - offsetof(typeof(field), item), \ - 0, is_signed_type(type), filter_type);\ - if (ret) \ - return ret; +#define __dynamic_array(_type, _item) { \ + .type = #_type "[]", .name = #_item, \ + .size = 0, .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ -static int __init \ -ftrace_define_fields_##name(struct trace_event_call *event_call) \ -{ \ - struct struct_name field; \ - int ret; \ - int filter_type = filter; \ - \ - tstruct; \ - \ - return ret; \ -} +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +static struct trace_event_fields ftrace_event_fields_##name[] = { \ + tstruct \ + {} }; #include "trace_entries.h" @@ -152,6 +123,9 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #undef __field #define __field(type, item) +#undef __field_fn +#define __field_fn(type, item) + #undef __field_desc #define __field_desc(type, container, item) @@ -168,12 +142,10 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ - regfn) \ - \ +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \ struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ - .define_fields = ftrace_define_fields_##call, \ + .fields_array = ftrace_event_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ .reg = regfn, \ }; \ @@ -191,9 +163,9 @@ struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ - PARAMS(tstruct), PARAMS(print), filter, NULL) + PARAMS(tstruct), PARAMS(print), NULL) bool ftrace_event_is_function(struct trace_event_call *call) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..66e0a8ff1c01 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1534,16 +1534,28 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; +static struct trace_event_fields kretprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kretprobe_event_define_fields }, + {} +}; + +static struct trace_event_fields kprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kprobe_event_define_fields }, + {} +}; + static inline void init_trace_event_call(struct trace_kprobe *tk) { struct trace_event_call *call = trace_probe_event_call(&tk->tp); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; - call->class->define_fields = kretprobe_event_define_fields; + call->class->fields_array = kretprobe_fields_array; } else { call->event.funcs = &kprobe_funcs; - call->class->define_fields = kprobe_event_define_fields; + call->class->fields_array = kprobe_fields_array; } call->flags = TRACE_EVENT_FL_KPROBE; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..53935259f701 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -198,11 +198,10 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, field, name) \ - sizeof(type) != sizeof(trace.field) ? \ - __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), field), \ - sizeof(trace.field), is_signed_type(type) +#define SYSCALL_FIELD(_type, _name) { \ + .type = #_type, .name = #_name, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -269,42 +268,22 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; - int ret; - int i; int offset = offsetof(typeof(trace), args); - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), - FILTER_OTHER); - if (ret) - return ret; + int ret, i; for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, sizeof(unsigned long), 0, FILTER_OTHER); + if (ret) + break; offset += sizeof(unsigned long); } return ret; } -static int __init syscall_exit_define_fields(struct trace_event_call *call) -{ - struct syscall_trace_exit trace; - int ret; - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), - FILTER_OTHER); - if (ret) - return ret; - - ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), - FILTER_OTHER); - - return ret; -} - static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; @@ -502,6 +481,13 @@ static int __init init_syscall_trace(struct trace_event_call *call) return id; } +static struct trace_event_fields __refdata syscall_enter_fields_array[] = { + SYSCALL_FIELD(int, __syscall_nr), + { .type = TRACE_FUNCTION_TYPE, + .define_fields = syscall_enter_define_fields }, + {} +}; + struct trace_event_functions enter_syscall_print_funcs = { .trace = print_syscall_enter, }; @@ -513,7 +499,7 @@ struct trace_event_functions exit_syscall_print_funcs = { struct trace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, + .fields_array = syscall_enter_fields_array, .get_fields = syscall_get_enter_fields, .raw_init = init_syscall_trace, }; @@ -521,7 +507,11 @@ struct trace_event_class __refdata event_class_syscall_enter = { struct trace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, + .fields_array = (struct trace_event_fields[]){ + SYSCALL_FIELD(int, __syscall_nr), + SYSCALL_FIELD(long, ret), + {} + }, .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), .raw_init = init_syscall_trace, }; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..476a382f1f1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1507,12 +1507,17 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; +static struct trace_event_fields uprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = uprobe_event_define_fields }, + {} +}; + static inline void init_trace_event_call(struct trace_uprobe *tu) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); - call->event.funcs = &uprobe_funcs; - call->class->define_fields = uprobe_event_define_fields; + call->class->fields_array = uprobe_fields_array; call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; -- cgit From f66c0447cca1281116224d474cdb37d6a18e4b5b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 27 Nov 2019 14:57:04 +0900 Subject: kprobes: Set unoptimized flag after unoptimizing code Set the unoptimized flag after confirming the code is completely unoptimized. Without this fix, when a kprobe hits the intermediate modified instruction (the first byte is replaced by an INT3, but later bytes can still be a jump address operand) while unoptimizing, it can return to the middle byte of the modified code, which causes an invalid instruction exception in the kernel. Usually, this is a rare case, but if we put a probe on the function call while text patching, it always causes a kernel panic as below: # echo p text_poke+5 > kprobe_events # echo 1 > events/kprobes/enable # echo 0 > events/kprobes/enable invalid opcode: 0000 [#1] PREEMPT SMP PTI RIP: 0010:text_poke+0x9/0x50 Call Trace: arch_unoptimize_kprobe+0x22/0x28 arch_unoptimize_kprobes+0x39/0x87 kprobe_optimizer+0x6e/0x290 process_one_work+0x2a0/0x610 worker_thread+0x28/0x3d0 ? process_one_work+0x610/0x610 kthread+0x10d/0x130 ? kthread_park+0x80/0x80 ret_from_fork+0x3a/0x50 text_poke() is used for patching the code in optprobes. This can happen even if we blacklist text_poke() and other functions, because there is a small time window during which we show the intermediate code to other CPUs. [ mingo: Edited the changelog. ] Tested-by: Alexei Starovoitov Signed-off-by: Masami Hiramatsu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: bristot@redhat.com Fixes: 6274de4984a6 ("kprobes: Support delayed unoptimizing") Link: https://lkml.kernel.org/r/157483422375.25881.13508326028469515760.stgit@devnote2 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 53534aa258a6..34e28b236d68 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -510,6 +510,8 @@ static void do_unoptimize_kprobes(void) arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { + /* Switching from detour code to origin */ + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -649,6 +651,7 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -676,7 +679,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) return; } - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; if (!list_empty(&op->list)) { /* Dequeue from the optimization queue */ list_del_init(&op->list); -- cgit From c5105d764e0214bcc4c6d40d7ba231d01b2e9dda Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 27 Nov 2019 16:37:28 +0800 Subject: sched/clock: Use static_branch_likely() with sched_clock_running sched_clock_running is enabled early at bootup stage and never disabled. So hint that to the compiler by using static_branch_likely() rather than static_branch_unlikely(). The branch probability mis-annotation was introduced in the original commit that converted the plain sched_clock_running flag to a static key: 46457ea464f5 ("sched/clock: Use static key for sched_clock_running") Steve further notes: | Looks like the confusion was the moving of the "!": | | - if (unlikely(!sched_clock_running)) | + if (!static_branch_unlikely(&sched_clock_running)) | | Where, it was unlikely that !sched_clock_running would be true, but | because the "!" was moved outside the "unlikely()" it makes the test | "likely()". That is, if we added an intermediate step, it would have | been: | | if (!likely(sched_clock_running)) | | which would have prevented the mistake that this patch fixes. [ mingo: Edited the changelog. ] Signed-off-by: Zhenzhong Duan Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/1574843848-26825-1-git-send-email-zhenzhong.duan@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1152259a4ca0..12bca64dff73 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -370,7 +370,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -393,7 +393,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -460,7 +460,7 @@ void __init sched_clock_init(void) u64 sched_clock_cpu(int cpu) { - if (!static_branch_unlikely(&sched_clock_running)) + if (!static_branch_likely(&sched_clock_running)) return 0; return sched_clock(); -- cgit From 1b40cd56f3bcffcfedb43bc30bd431b52240fb3b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:18 +0200 Subject: sched/rt, locking: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the Kconfig dependency to use CONFIG_PREEMPTION. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20191015191821.11479-32-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index e0852dc333ac..3de8fd11873b 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -101,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK # unlock and unlock_irq functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y # or -# - DEBUG_SPINLOCK=n and PREEMPT=n +# - DEBUG_SPINLOCK=n and PREEMPTION=n # # unlock_bh and unlock_irqrestore functions are inlined when: # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y @@ -139,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -168,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE config INLINE_READ_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK config INLINE_READ_UNLOCK_BH def_bool y @@ -176,7 +176,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -205,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE config INLINE_WRITE_UNLOCK def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK config INLINE_WRITE_UNLOCK_BH def_bool y @@ -213,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ + depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y -- cgit From 025f50f3866486a5278afa91f0d3b6b780141050 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:21 +0200 Subject: sched/rt, workqueue: Use PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Update the comment to use PREEMPTION because it is true for both preemption models. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Link: https://lore.kernel.org/r/20191015191821.11479-35-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc88fd939f4e..bf57dc717b38 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2280,7 +2280,7 @@ __acquires(&pool->lock) } /* - * The following prevents a kworker from hogging CPU on !PREEMPT + * The following prevents a kworker from hogging CPU on !PREEMPTION * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in -- cgit From 15c7c972cd26d89a26788e609c53b5a465324a6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Oct 2019 18:53:18 -0700 Subject: rcu: Use *_ONCE() to protect lockless ->expmask accesses The rcu_node structure's ->expmask field is accessed locklessly when starting a new expedited grace period and when reporting an expedited RCU CPU stall warning. This commit therefore handles the former by taking a snapshot of ->expmask while the lock is held and the latter by applying READ_ONCE() to lockless reads and WRITE_ONCE() to the corresponding updates. Link: https://lore.kernel.org/lkml/CANpmjNNmSOagbTpffHr4=Yedckx9Rm2NuGqC9UqE+AOz5f1-ZQ@mail.gmail.com Reported-by: syzbot+134336b86f728d6e55a0@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Acked-by: Marco Elver --- kernel/rcu/tree_exp.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..69c5aa64fcfd 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void) rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; + WRITE_ONCE(rnp->expmask, rnp->expmaskinit); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); } } @@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -372,12 +372,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (!(mask_ofl_ipi & mask)) - continue; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; @@ -491,7 +489,7 @@ static void synchronize_sched_expedited_wait(void) struct rcu_data *rdp; mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); @@ -503,7 +501,8 @@ static void synchronize_sched_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + READ_ONCE(rnp_root->expmask), + ".T"[!!rnp_root->exp_tasks]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -513,7 +512,7 @@ static void synchronize_sched_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, + READ_ONCE(rnp->expmask), ".T"[!!rnp->exp_tasks]); } pr_cont("\n"); @@ -521,7 +520,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_leaf_node(rnp) { for_each_leaf_node_possible_cpu(rnp, cpu) { mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) + if (!(READ_ONCE(rnp->expmask) & mask)) continue; dump_cpu_task(cpu); } -- cgit From 9f08cf088676c12a5b53bd5a29cf04f00c787b5d Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 8 Oct 2019 13:01:40 +0800 Subject: rcu: Avoid modifying mask_ofl_ipi in sync_rcu_exp_select_node_cpus() The "mask_ofl_ipi" is used to track which CPUs get IPIed, however in the IPI sending loop, "mask_ofl_ipi" along with another variable "mask_ofl_test" might also get modified to record which CPUs' quiesent states must be reported by the sync_rcu_exp_select_node_cpus() at the end of sync_rcu_exp_select_node_cpus(). This overlap of roles can be confusing, so this patch cleans things a little by using "mask_ofl_ipi" solely for determining which CPUs must be IPIed and "mask_ofl_test" for solely determining on behalf of which CPUs sync_rcu_exp_select_node_cpus() must report a quiscent state. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Acked-by: Marco Elver --- kernel/rcu/tree_exp.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 69c5aa64fcfd..6a6f328a5f52 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -387,10 +387,10 @@ retry_ipi: } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); put_cpu(); - if (!ret) { - mask_ofl_ipi &= ~mask; + /* The CPU will report the QS in response to the IPI. */ + if (!ret) continue; - } + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rnp->qsmaskinitnext & mask) && @@ -401,13 +401,12 @@ retry_ipi: schedule_timeout_uninterruptible(1); goto retry_ipi; } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + /* CPU really is offline, so we must report its QS. */ + if (rnp->expmask & mask) + mask_ofl_test |= mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; if (mask_ofl_test) rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } -- cgit From 6cf539a87a61a4fbc43f625267dbcbcf283872ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 9 Oct 2019 17:57:43 +0200 Subject: rcu: Fix data-race due to atomic_t copy-by-value This fixes a data-race where `atomic_t dynticks` is copied by value. The copy is performed non-atomically, resulting in a data-race if `dynticks` is updated concurrently. This data-race was found with KCSAN: ================================================================== BUG: KCSAN: data-race in dyntick_save_progress_counter / rcu_irq_enter write to 0xffff989dbdbe98e0 of 4 bytes by task 10 on cpu 3: atomic_add_return include/asm-generic/atomic-instrumented.h:78 [inline] rcu_dynticks_snap kernel/rcu/tree.c:310 [inline] dyntick_save_progress_counter+0x43/0x1b0 kernel/rcu/tree.c:984 force_qs_rnp+0x183/0x200 kernel/rcu/tree.c:2286 rcu_gp_fqs kernel/rcu/tree.c:1601 [inline] rcu_gp_fqs_loop+0x71/0x880 kernel/rcu/tree.c:1653 rcu_gp_kthread+0x22c/0x3b0 kernel/rcu/tree.c:1799 kthread+0x1b5/0x200 kernel/kthread.c:255 read to 0xffff989dbdbe98e0 of 4 bytes by task 154 on cpu 7: rcu_nmi_enter_common kernel/rcu/tree.c:828 [inline] rcu_irq_enter+0xda/0x240 kernel/rcu/tree.c:870 irq_enter+0x5/0x50 kernel/softirq.c:347 Reported by Kernel Concurrency Sanitizer on: CPU: 7 PID: 154 Comm: kworker/7:1H Not tainted 5.3.0+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Workqueue: kblockd blk_mq_run_work_fn ================================================================== Signed-off-by: Marco Elver Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Ingo Molnar Cc: Dmitry Vyukov Cc: rcu@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..6145e08a1407 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -577,7 +577,7 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); @@ -650,14 +650,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) @@ -744,7 +745,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -833,7 +834,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); -- cgit From aca2991a25da03ca96127b1d21e1f4aba41f81a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 06:51:57 -0700 Subject: rcu: Substitute lookup for bit-twiddling in sync_rcu_exp_select_node_cpus() The code in sync_rcu_exp_select_node_cpus() calculates the current CPU's mask within its rcu_node structure's bitmasks, but this has already been computed in the ->grpmask field of that CPU's rcu_data structure. This commit therefore just uses this ->grpmask field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6a6f328a5f52..3b59c3ee42e5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -345,8 +345,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; int snap; if (raw_smp_processor_id() == cpu || @@ -373,8 +373,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; retry_ipi: if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { -- cgit From fd6bc19d7676a060a171d1cf3dcbf6fd797eb05f Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 19 Nov 2019 03:17:07 +0000 Subject: rcu: Fix missed wakeup of exp_wq waiters Tasks waiting within exp_funnel_lock() for an expedited grace period to elapse can be starved due to the following sequence of events: 1. Tasks A and B both attempt to start an expedited grace period at about the same time. This grace period will have completed when the lower four bits of the rcu_state structure's ->expedited_sequence field are 0b'0100', for example, when the initial value of this counter is zero. Task A wins, and thus does the actual work of starting the grace period, including acquiring the rcu_state structure's .exp_mutex and sets the counter to 0b'0001'. 2. Because task B lost the race to start the grace period, it waits on ->expedited_sequence to reach 0b'0100' inside of exp_funnel_lock(). This task therefore blocks on the rcu_node structure's ->exp_wq[1] field, keeping in mind that the end-of-grace-period value of ->expedited_sequence (0b'0100') is shifted down two bits before indexing the ->exp_wq[] field. 3. Task C attempts to start another expedited grace period, but blocks on ->exp_mutex, which is still held by Task A. 4. The aforementioned expedited grace period completes, so that ->expedited_sequence now has the value 0b'0100'. A kworker task therefore acquires the rcu_state structure's ->exp_wake_mutex and starts awakening any tasks waiting for this grace period. 5. One of the first tasks awakened happens to be Task A. Task A therefore releases the rcu_state structure's ->exp_mutex, which allows Task C to start the next expedited grace period, which causes the lower four bits of the rcu_state structure's ->expedited_sequence field to become 0b'0101'. 6. Task C's expedited grace period completes, so that the lower four bits of the rcu_state structure's ->expedited_sequence field now become 0b'1000'. 7. The kworker task from step 4 above continues its wakeups. Unfortunately, the wake_up_all() refetches the rcu_state structure's .expedited_sequence field: wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); This results in the wakeup being applied to the rcu_node structure's ->exp_wq[2] field, which is unfortunate given that Task B is instead waiting on ->exp_wq[1]. On a busy system, no harm is done (or at least no permanent harm is done). Some later expedited grace period will redo the wakeup. But on a quiet system, such as many embedded systems, it might be a good long time before there was another expedited grace period. On such embedded systems, this situation could therefore result in a system hang. This issue manifested as DPM device timeout during suspend (which usually qualifies as a quiet time) due to a SCSI device being stuck in _synchronize_rcu_expedited(), with the following stack trace: schedule() synchronize_rcu_expedited() synchronize_rcu() scsi_device_quiesce() scsi_bus_suspend() dpm_run_callback() __device_suspend() This commit therefore prevents such delays, timeouts, and hangs by making rcu_exp_wait_wake() use its "s" argument consistently instead of refetching from rcu_state.expedited_sequence. Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3b59c3ee42e5..fa143e40cd93 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -557,7 +557,7 @@ static void rcu_exp_wait_wake(unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]); } trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); mutex_unlock(&rcu_state.exp_wake_mutex); -- cgit From 4bc6b745e5cbefed92c48071e28a5f41246d0470 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 19 Nov 2019 11:50:52 -0800 Subject: rcu: Allow only one expedited GP to run concurrently with wakeups The current expedited RCU grace-period code expects that a task requesting an expedited grace period cannot awaken until that grace period has reached the wakeup phase. However, it is possible for a long preemption to result in the waiting task never sleeping. For example, consider the following sequence of events: 1. Task A starts an expedited grace period by invoking synchronize_rcu_expedited(). It proceeds normally up to the wait_event() near the end of that function, and is then preempted (or interrupted or whatever). 2. The expedited grace period completes, and a kworker task starts the awaken phase, having incremented the counter and acquired the rcu_state structure's .exp_wake_mutex. This kworker task is then preempted or interrupted or whatever. 3. Task A resumes and enters wait_event(), which notes that the expedited grace period has completed, and thus doesn't sleep. 4. Task B starts an expedited grace period exactly as did Task A, complete with the preemption (or whatever delay) just before the call to wait_event(). 5. The expedited grace period completes, and another kworker task starts the awaken phase, having incremented the counter. However, it blocks when attempting to acquire the rcu_state structure's .exp_wake_mutex because step 2's kworker task has not yet released it. 6. Steps 4 and 5 repeat, resulting in overflow of the rcu_node structure's ->exp_wq[] array. In theory, this is harmless. Tasks waiting on the various ->exp_wq[] array will just be spuriously awakened, but they will just sleep again on noting that the rcu_state structure's ->expedited_sequence value has not advanced far enough. In practice, this wastes CPU time and is an accident waiting to happen. This commit therefore moves the rcu_exp_gp_seq_end() call that officially ends the expedited grace period (along with associate tracing) until after the ->exp_wake_mutex has been acquired. This prevents Task A from awakening prematurely, thus preventing more than one expedited grace period from being in flight during a previous expedited grace period's wakeup phase. Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period") Signed-off-by: Neeraj Upadhyay [ paulmck: Added updated comment. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fa143e40cd93..7a1f09376e62 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -539,14 +539,13 @@ static void rcu_exp_wait_wake(unsigned long s) struct rcu_node *rnp; synchronize_sched_expedited_wait(); - rcu_exp_gp_seq_end(); - trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ + // Switch over to wakeup mode, allowing the next GP to proceed. + // End the previous grace period only after acquiring the mutex + // to ensure that only one GP runs concurrently with wakeups. mutex_lock(&rcu_state.exp_wake_mutex); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { -- cgit From 6c7d7dbf5b7f965eda0d39fbbb8fee005b08f340 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 13:59:37 -0800 Subject: rcu: Rename sync_rcu_preempt_exp_done() to sync_rcu_exp_done() Now that the RCU flavors have been consolidated, there is one common function for checking to see if an expedited RCU grace period has completed, namely sync_rcu_preempt_exp_done(). Because this function is no longer specific to RCU-preempt, this commit removes the "_preempt" from its name. This commit also changes sync_rcu_preempt_exp_done_unlocked() to sync_rcu_exp_done_unlocked() for the same reason. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 19 +++++++++---------- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 7a1f09376e62..3923c0743c3e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -148,7 +148,7 @@ static void __maybe_unused sync_exp_reset_tree(void) * * Caller must hold the specificed rcu_node structure's ->lock */ -static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); @@ -157,17 +157,16 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) } /* - * Like sync_rcu_preempt_exp_done(), but this function assumes the caller - * doesn't hold the rcu_node's ->lock, and will acquire and release the lock - * itself + * Like sync_rcu_exp_done(), but this function assumes the caller doesn't + * hold the rcu_node's ->lock, and will acquire and release the lock itself */ -static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { unsigned long flags; bool ret; raw_spin_lock_irqsave_rcu_node(rnp, flags); - ret = sync_rcu_preempt_exp_done(rnp); + ret = sync_rcu_exp_done(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ret; @@ -191,7 +190,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, unsigned long mask; for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { + if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else @@ -471,9 +470,9 @@ static void synchronize_sched_expedited_wait(void) for (;;) { ret = swait_event_timeout_exclusive( rcu_state.expedited_wq, - sync_rcu_preempt_exp_done_unlocked(rnp_root), + sync_rcu_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) + if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -507,7 +506,7 @@ static void synchronize_sched_expedited_wait(void) rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done_unlocked(rnp)) + if (sync_rcu_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..6dbea4bcf065 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -485,7 +485,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); - empty_exp = sync_rcu_preempt_exp_done(rnp); + empty_exp = sync_rcu_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -509,7 +509,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = sync_rcu_preempt_exp_done(rnp); + empty_exp_now = sync_rcu_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gp_seq, -- cgit From de8cd0a533bfb57ff4ec6c85e3bdca013a5adcb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 14:20:41 -0800 Subject: rcu: Update tree_exp.h function-header comments The function-header comments in kernel/rcu/tree_exp.h have gotten a bit out of date, so this commit updates a number of them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3923c0743c3e..1eafbcd56679 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void) } /* - * Return then value that expedited-grace-period counter will have + * Return the value that the expedited-grace-period counter will have * at the end of the current grace period. */ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) @@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void) } /* - * Take a snapshot of the expedited-grace-period counter. + * Take a snapshot of the expedited-grace-period counter, which is the + * earliest value that will indicate that a full grace period has + * elapsed since the current time. */ static unsigned long rcu_exp_gp_seq_snap(void) { @@ -143,22 +145,18 @@ static void __maybe_unused sync_exp_reset_tree(void) * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the specificed rcu_node structure's ->lock + * for the current expedited grace period. */ static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* - * Like sync_rcu_exp_done(), but this function assumes the caller doesn't - * hold the rcu_node's ->lock, and will acquire and release the lock itself + * Like sync_rcu_exp_done(), but where the caller does not hold the + * rcu_node's ->lock. */ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { @@ -180,8 +178,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) * which the task was queued or to one of that rcu_node structure's ancestors, * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) - * - * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) @@ -189,6 +185,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, { unsigned long mask; + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) @@ -452,6 +449,10 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } +/* + * Wait for the expedited grace period to elapse, issuing any needed + * RCU CPU stall warnings along the way. + */ static void synchronize_sched_expedited_wait(void) { int cpu; @@ -781,7 +782,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() + * your code to batch your updates, and then use a single synchronize_rcu() * instead. * * This has the same semantics as (but is more brutal than) synchronize_rcu(). -- cgit From 28f0361fdfab267a392cd6a6401446c9ea64de95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 14:24:58 -0800 Subject: rcu: Replace synchronize_sched_expedited_wait() "_sched" with "_rcu" After RCU flavor consolidation, synchronize_sched_expedited_wait() does both RCU-preempt and RCU-sched, whichever happens to have been built into the running kernel. This commit therefore changes this function's name to synchronize_rcu_expedited_wait() to reflect its new generic nature. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1eafbcd56679..081a17942e57 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -453,7 +453,7 @@ static void sync_rcu_exp_select_cpus(void) * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. */ -static void synchronize_sched_expedited_wait(void) +static void synchronize_rcu_expedited_wait(void) { int cpu; unsigned long jiffies_stall; @@ -538,7 +538,7 @@ static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(); + synchronize_rcu_expedited_wait(); // Switch over to wakeup mode, allowing the next GP to proceed. // End the previous grace period only after acquiring the mutex -- cgit From df1e849ae4559544ff00ff5052eefe2479750539 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 16:36:45 -0800 Subject: rcu: Enable tick for nohz_full CPUs slow to provide expedited QS An expedited grace period can be stalled by a nohz_full CPU looping in kernel context. This possibility is currently handled by some carefully crafted checks in rcu_read_unlock_special() that enlist help from ksoftirqd when permitted by the scheduler. However, it is exactly these checks that require the scheduler avoid holding any of its rq or pi locks across rcu_read_unlock() without also having held them across the entire RCU read-side critical section. It would therefore be very nice if expedited grace periods could handle nohz_full CPUs looping in kernel context without such checks. This commit therefore adds code to the expedited grace period's wait and cleanup code that forces the scheduler-clock interrupt on for CPUs that fail to quickly supply a quiescent state. "Quickly" is currently a hard-coded single-jiffy delay. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 52 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..f9253ed406ba 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -182,6 +182,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ + bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 081a17942e57..30b2a02aef39 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long mask, bool wake) { + int cpu; unsigned long flags; + struct rcu_data *rdp; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { @@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, return; } WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = false; + tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void) flush_work(&rnp->rew.rew_work); } +/* + * Wait for the expedited grace period to elapse, within time limit. + * If the time limit is exceeded without the grace period elapsing, + * return false, otherwise return true. + */ +static bool synchronize_rcu_expedited_wait_once(long tlimit) +{ + int t; + struct rcu_node *rnp_root = rcu_get_root(); + + t = swait_event_timeout_exclusive(rcu_state.expedited_wq, + sync_rcu_exp_done_unlocked(rnp_root), + tlimit); + // Workqueues should not be signaled. + if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + return true; + WARN_ON(t < 0); /* workqueues should not be signaled. */ + return false; +} + /* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. @@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_start; unsigned long mask; int ndetected; + struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); - int ret; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (synchronize_rcu_expedited_wait_once(1)) + return; + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = true; + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } + } + WARN_ON_ONCE(1); + } for (;;) { - ret = swait_event_timeout_exclusive( - rcu_state.expedited_wq, - sync_rcu_exp_done_unlocked(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; - WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) continue; panic_on_rcu_stall(); -- cgit From 610dea36d3083a977e4f156206cbe1eaa2a532f0 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Fri, 4 Oct 2019 19:49:10 +0000 Subject: rcu/nocb: Fix dump_tree hierarchy print always active Commit 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree") added print statements to rcu_organize_nocb_kthreads for debugging, but incorrectly guarded them, causing the function to always spew out its message. This patch fixes it by guarding both pr_alert statements with dump_tree, while also changing the second pr_alert to a pr_cont, to print the hierarchy in a single line (assuming that's how it was supposed to work). Fixes: 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree") Signed-off-by: Stefan Reiter [ paulmck: Make single-nocbs-CPU GP kthreads look less erroneous. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..758bfe1de536 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2321,6 +2321,8 @@ static void __init rcu_organize_nocb_kthreads(void) { int cpu; bool firsttime = true; + bool gotnocbs = false; + bool gotnocbscbs = true; int ls = rcu_nocb_gp_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; @@ -2343,21 +2345,31 @@ static void __init rcu_organize_nocb_kthreads(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; - if (!firsttime && dump_tree) - pr_cont("\n"); - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } } else { /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; rdp->nocb_gp_rdp = rdp_gp; rdp_prev->nocb_next_cb_rdp = rdp; - pr_alert(" %d", cpu); + if (dump_tree) + pr_cont(" %d", cpu); } rdp_prev = rdp; } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); } /* -- cgit From 6935c3983b246d5fbfebd3b891c825e65c118f2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:21:54 -0700 Subject: rcu: Avoid data-race in rcu_gp_fqs_check_wake() The rcu_gp_fqs_check_wake() function uses rcu_preempt_blocked_readers_cgp() to read ->gp_tasks while other cpus might overwrite this field. We need READ_ONCE()/WRITE_ONCE() pairs to avoid compiler tricks and KCSAN splats like the following : BUG: KCSAN: data-race in rcu_gp_fqs_check_wake / rcu_preempt_deferred_qs_irqrestore write to 0xffffffff85a7f190 of 8 bytes by task 7317 on cpu 0: rcu_preempt_deferred_qs_irqrestore+0x43d/0x580 kernel/rcu/tree_plugin.h:507 rcu_read_unlock_special+0xec/0x370 kernel/rcu/tree_plugin.h:659 __rcu_read_unlock+0xcf/0xe0 kernel/rcu/tree_plugin.h:394 rcu_read_unlock include/linux/rcupdate.h:645 [inline] __ip_queue_xmit+0x3b0/0xa40 net/ipv4/ip_output.c:533 ip_queue_xmit+0x45/0x60 include/net/ip.h:236 __tcp_transmit_skb+0xdeb/0x1cd0 net/ipv4/tcp_output.c:1158 __tcp_send_ack+0x246/0x300 net/ipv4/tcp_output.c:3685 tcp_send_ack+0x34/0x40 net/ipv4/tcp_output.c:3691 tcp_cleanup_rbuf+0x130/0x360 net/ipv4/tcp.c:1575 tcp_recvmsg+0x633/0x1a30 net/ipv4/tcp.c:2179 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 read to 0xffffffff85a7f190 of 8 bytes by task 10 on cpu 1: rcu_gp_fqs_check_wake kernel/rcu/tree.c:1556 [inline] rcu_gp_fqs_check_wake+0x93/0xd0 kernel/rcu/tree.c:1546 rcu_gp_fqs_loop+0x36c/0x580 kernel/rcu/tree.c:1611 rcu_gp_kthread+0x143/0x220 kernel/rcu/tree.c:1768 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 10 Comm: rcu_preempt Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot [ paulmck: Added another READ_ONCE() for RCU CPU stall warnings. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 758bfe1de536..fe5f44811761 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * blocked tasks. */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { - rnp->gp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) @@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - return rnp->gp_tasks != NULL; + return READ_ONCE(rnp->gp_tasks) != NULL; } /* Bias and limit values for ->rcu_read_lock_nesting. */ @@ -493,7 +493,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) - rnp->gp_tasks = np; + WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { @@ -663,7 +663,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { - rnp->gp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next); t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), @@ -757,7 +757,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, + rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { -- cgit From 03bd2983d7a9f898fd89f8f7215c3e56732d8ecd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Oct 2019 09:05:27 -0700 Subject: rcu: Use lockdep rather than comment to enforce lock held The rcu_preempt_check_blocked_tasks() function has a comment that states that the rcu_node structure's ->lock must be held, which might be informative, but which carries little weight if not read. This commit therefore removes this comment in favor of raw_lockdep_assert_held_rcu_node(), which will complain quite visibly if the required lock is not held. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fe5f44811761..ed54d36465e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -648,8 +648,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock - * must be held by the caller. + * invoked -before- updating this rnp's ->gp_seq. * * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. @@ -659,6 +658,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); + raw_lockdep_assert_held_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && -- cgit From b3e627d3d5092a87fc9b9e37e341610cfecfbfdc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 02:55:57 +0000 Subject: rcu: Make PREEMPT_RCU be a modifier to TREE_RCU Currently PREEMPT_RCU and TREE_RCU are mutually exclusive Kconfig options. But PREEMPT_RCU actually specifies a kind of TREE_RCU, namely a preemptible TREE_RCU. This commit therefore makes PREEMPT_RCU be a modifer to the TREE_RCU Kconfig option. This has the benefit of simplifying several of the #if expressions that formerly needed to check both, but now need only check one or the other. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 13 +++++++------ kernel/rcu/Makefile | 1 - kernel/rcu/rcu.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sysctl.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 7644eda17d62..0303934e6ef0 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPTION && SMP + default y if SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -17,6 +17,7 @@ config TREE_RCU config PREEMPT_RCU bool default y if PREEMPTION + select TREE_RCU help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -78,7 +79,7 @@ config TASKS_RCU user-mode execution as quiescent states. config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU ) + def_bool TREE_RCU help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow @@ -86,13 +87,13 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 64 if 64BIT default 32 if !64BIT help @@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 16 help This option controls the leaf-level fanout of hierarchical @@ -187,7 +188,7 @@ config RCU_BOOST_DELAY config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU depends on RCU_EXPERT || NO_HZ_FULL default n help diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 020e8b6a644b..82d5fba48b2f 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..eabafde2349e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -454,7 +454,7 @@ enum rcutorture_type { INVALID_RCU_FLAVOR }; -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..34a7452b25fd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 70665934d53e..d396aaaf19a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_do_static_key, }, #endif -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +#if defined(CONFIG_TREE_RCU) { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, -- cgit From 90326f0521a88004194f88f1b597b54347482b5c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:14 +0200 Subject: rcu: Use CONFIG_PREEMPTION where appropriate The config option `CONFIG_PREEMPT' is used for the preemption model "Low-Latency Desktop". The config option `CONFIG_PREEMPTION' is enabled when kernel preemption is enabled which is true for the preemption model `CONFIG_PREEMPT' and `CONFIG_PREEMPT_RT'. Use `CONFIG_PREEMPTION' if it applies to both preemption models and not just to `CONFIG_PREEMPT'. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Davidlohr Bueso Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 4 ++-- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/srcutiny.c | 2 +- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0303934e6ef0..1cc940fef17c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -201,8 +201,8 @@ config RCU_NOCB_CPU specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched - (!PREEMPT kernels). Nothing prevents this kthread from running + the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched + (!PREEMPTION kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..121a0507a7ce 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1730,7 +1730,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) // Give the scheduler a chance, even on nohz_full CPUs. static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { - if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 44d6606b8325..6208c1dae5c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPT operation + * that become ready as a result. Single-CPU and !PREEMPTION operation * means that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..c9dbb05e4c13 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2698,9 +2698,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPT. + * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPT, any + * Howevr, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..98d078cafa5a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -670,7 +670,7 @@ static void rcu_exp_handler(void *unused) } } -/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ static void sync_sched_exp_online_cleanup(int cpu) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed54d36465e2..8cdce111ea73 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -789,7 +789,7 @@ static void __init rcu_bootup_announce(void) } /* - * Note a quiescent state for PREEMPT=n. Because we do not need to know + * Note a quiescent state for PREEMPTION=n. Because we do not need to know * how many quiescent states passed, just if there was at least one since * the start of the grace period, this just sets a flag. The caller must * have disabled preemption. @@ -839,7 +839,7 @@ void rcu_all_qs(void) EXPORT_SYMBOL_GPL(rcu_all_qs); /* - * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. */ void rcu_note_context_switch(bool preempt) { -- cgit From a289e608b3e740c15f623148c26cdec2d6698ce0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Nov 2019 08:31:56 -0800 Subject: rcutorture: Pull callback forward-progress data into rcu_fwd struct Now that RCU behaves reasonably well with the current single-kthread call_rcu() forward-progress testing, it is time to add more kthreads. This commit takes a first step towards that goal by wrapping what will be the per-kthread data into a new rcu_fwd structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 103 +++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..22a75a4b6b40 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1663,23 +1663,34 @@ struct rcu_fwd_cb { struct rcu_fwd_cb *rfc_next; int rfc_gps; }; -static DEFINE_SPINLOCK(rcu_fwd_lock); -static struct rcu_fwd_cb *rcu_fwd_cb_head; -static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; -static long n_launders_cb; -static unsigned long rcu_fwd_startat; -static bool rcu_fwd_emergency_stop; + #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) + struct rcu_launder_hist { long n_launders; unsigned long launder_gp_seq; }; -#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) -static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; -static unsigned long rcu_launder_gp_seq_start; + +struct rcu_fwd { + spinlock_t rcu_fwd_lock; + struct rcu_fwd_cb *rcu_fwd_cb_head; + struct rcu_fwd_cb **rcu_fwd_cb_tail; + long n_launders_cb; + unsigned long rcu_fwd_startat; + struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; + unsigned long rcu_launder_gp_seq_start; +}; + +struct rcu_fwd rcu_fwds = { + .rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock), + .rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head, +}; + +bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(void) { @@ -1688,16 +1699,17 @@ static void rcu_torture_fwd_cb_hist(void) int i; int j; - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i].n_launders > 0) + for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--) + if (rcu_fwds.n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwd_startat); - gps_old = rcu_launder_gp_seq_start; + __func__, jiffies - rcu_fwds.rcu_fwd_startat); + gps_old = rcu_fwds.rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { - gps = n_launders_hist[j].launder_gp_seq; + gps = rcu_fwds.n_launders_hist[j].launder_gp_seq; pr_cont(" %ds/%d: %ld:%ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + j + 1, FWD_CBS_HIST_DIV, + rcu_fwds.n_launders_hist[j].n_launders, rcutorture_seq_diff(gps, gps_old)); gps_old = gps; } @@ -1714,17 +1726,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcpp = rcu_fwd_cb_tail; - rcu_fwd_cb_tail = &rfcp->rfc_next; + spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); + rfcpp = rcu_fwds.rcu_fwd_cb_tail; + rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(n_launders_cb, n_launders_cb + 1); - i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(n_launders_hist)) - i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i].n_launders++; - n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1); + i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist)) + i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; + rcu_fwds.n_launders_hist[i].n_launders++; + rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); } // Give the scheduler a chance, even on nohz_full CPUs. @@ -1751,16 +1763,16 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcp = rcu_fwd_cb_head; + spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); + rfcp = rcu_fwds.rcu_fwd_cb_head; if (!rfcp) { - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); break; } - rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwd_cb_head) - rcu_fwd_cb_tail = &rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next; + if (!rcu_fwds.rcu_fwd_cb_head) + rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; + spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); @@ -1804,8 +1816,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + dur; + WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); + stopat = rcu_fwds.rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1864,23 +1876,23 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); + stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - n_launders_cb = 0; + rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i].n_launders = 0; + for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++) + rcu_fwds.n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - rcu_launder_gp_seq_start = gps; + rcu_fwds.rcu_launder_gp_seq_start = gps; tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1888,7 +1900,7 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwd_cb_head = rfcpn; + rcu_fwds.rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; } else { @@ -1910,7 +1922,7 @@ static void rcu_torture_fwd_prog_cr(void) } } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(n_launders_cb); + n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ @@ -1921,7 +1933,8 @@ static void rcu_torture_fwd_prog_cr(void) WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, - stoppedat - rcu_fwd_startat, jiffies - stoppedat, + stoppedat - rcu_fwds.rcu_fwd_startat, + jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); @@ -1943,7 +1956,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", -- cgit From 6b1b832546067caac8c5833abf88fa082d253b2f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Nov 2019 09:08:58 -0800 Subject: rcutorture: Thread rcu_fwd pointer through forward-progress functions In order to add multiple kthreads, it will be necessary to allow the various functions to operate on a pointer to their kthread's rcu_fwd structure. This commit therefore starts the process of adding the needed "struct rcu_fwd" parameters and arguments to the various callback forward-progress functions. Note that rcutorture_oom_notify() and rcu_torture_fwd_cb_hist() will eventually need to iterate over all kthreads' rcu_fwd structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 78 ++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22a75a4b6b40..cc88ce910a6d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1661,6 +1661,7 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) struct rcu_fwd_cb { struct rcu_head rh; struct rcu_fwd_cb *rfc_next; + struct rcu_fwd *rfc_rfp; int rfc_gps; }; @@ -1692,24 +1693,24 @@ struct rcu_fwd rcu_fwds = { bool rcu_fwd_emergency_stop; -static void rcu_torture_fwd_cb_hist(void) +static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { unsigned long gps; unsigned long gps_old; int i; int j; - for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--) - if (rcu_fwds.n_launders_hist[i].n_launders > 0) + for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) + if (rfp->n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwds.rcu_fwd_startat); - gps_old = rcu_fwds.rcu_launder_gp_seq_start; + __func__, jiffies - rfp->rcu_fwd_startat); + gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { - gps = rcu_fwds.n_launders_hist[j].launder_gp_seq; + gps = rfp->n_launders_hist[j].launder_gp_seq; pr_cont(" %ds/%d: %ld:%ld", j + 1, FWD_CBS_HIST_DIV, - rcu_fwds.n_launders_hist[j].n_launders, + rfp->n_launders_hist[j].n_launders, rcutorture_seq_diff(gps, gps_old)); gps_old = gps; } @@ -1723,20 +1724,21 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) int i; struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); struct rcu_fwd_cb **rfcpp; + struct rcu_fwd *rfp = rfcp->rfc_rfp; rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); - rfcpp = rcu_fwds.rcu_fwd_cb_tail; - rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next; + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcpp = rfp->rcu_fwd_cb_tail; + rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1); - i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist)) - i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; - rcu_fwds.n_launders_hist[i].n_launders++; - rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); + i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rfp->n_launders_hist)) + i = ARRAY_SIZE(rfp->n_launders_hist) - 1; + rfp->n_launders_hist[i].n_launders++; + rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); } // Give the scheduler a chance, even on nohz_full CPUs. @@ -1786,7 +1788,8 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) } /* Carry out need_resched()/cond_resched() forward-progress testing. */ -static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) +static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, + int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; @@ -1816,8 +1819,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); - stopat = rcu_fwds.rcu_fwd_startat + dur; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1852,7 +1855,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Carry out call_rcu() forward-progress testing. */ -static void rcu_torture_fwd_prog_cr(void) +static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) { unsigned long cver; unsigned long flags; @@ -1876,23 +1879,23 @@ static void rcu_torture_fwd_prog_cr(void) /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); cur_ops->sync(); /* Later readers see above write. */ - WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies); - stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread + rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++) - rcu_fwds.n_launders_hist[i].n_launders = 0; + for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++) + rfp->n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); - rcu_fwds.rcu_launder_gp_seq_start = gps; + rfp->rcu_launder_gp_seq_start = gps; tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head); + rfcp = READ_ONCE(rfp->rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1900,7 +1903,7 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwds.rcu_fwd_cb_head = rfcpn; + rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; } else { @@ -1912,6 +1915,7 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs++; n_launders_sa = 0; rfcp->rfc_gps = 0; + rfcp->rfc_rfp = rfp; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); @@ -1922,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(void) } } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb); + n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ @@ -1933,12 +1937,11 @@ static void rcu_torture_fwd_prog_cr(void) WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, - stoppedat - rcu_fwds.rcu_fwd_startat, - jiffies - stoppedat, + stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); - rcu_torture_fwd_cb_hist(); + rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); @@ -1955,7 +1958,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, { WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(); + rcu_torture_fwd_cb_hist(&rcu_fwds); rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ @@ -1980,6 +1983,7 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -1991,8 +1995,8 @@ static int rcu_torture_fwd_prog(void *args) schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(&tested, &tested_tries); - rcu_torture_fwd_prog_cr(); + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); + rcu_torture_fwd_prog_cr(rfp); unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ @@ -2027,7 +2031,7 @@ static int __init rcu_torture_fwd_prog_init(void) if (fwd_progress_div <= 0) fwd_progress_div = 4; return torture_create_kthread(rcu_torture_fwd_prog, - NULL, fwd_prog_task); + &rcu_fwds, fwd_prog_task); } /* Callback function for RCU barrier testing. */ -- cgit From 7beba0c06b588c725962bcc0273a489d46e81ccf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 07:49:31 -0800 Subject: rcutorture: Move to dynamic initialization of rcu_fwds In order to add multiple call_rcu() forward-progress kthreads, it will be necessary to dynamically allocate and initialize. This commit therefore moves the initialization from compile time to instead immediately precede thread-creation time. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cc88ce910a6d..6f540fed942c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1686,11 +1686,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd rcu_fwds = { - .rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock), - .rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head, -}; - +struct rcu_fwd rcu_fwds; bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2026,6 +2022,8 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } + spin_lock_init(&rcu_fwds.rcu_fwd_lock); + rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) -- cgit From 6764100bd2927060aae91b40fce015f39fc4fd87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 08:20:20 -0800 Subject: rcutorture: Complete threading rcu_fwd pointers through functions This commit threads pointers to rcu_fwd structures through the remaining functions using rcu_fwds directly, namely rcu_torture_fwd_prog_cbfree(), rcutorture_oom_notify() and rcu_torture_fwd_prog_init(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6f540fed942c..394baac98ae0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1754,23 +1754,23 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. */ -static unsigned long rcu_torture_fwd_prog_cbfree(void) +static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) { unsigned long flags; unsigned long freed = 0; struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags); - rfcp = rcu_fwds.rcu_fwd_cb_head; + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcp = rfp->rcu_fwd_cb_head; if (!rfcp) { - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); break; } - rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwds.rcu_fwd_cb_head) - rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags); + rfp->rcu_fwd_cb_head = rfcp->rfc_next; + if (!rfp->rcu_fwd_cb_head) + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); @@ -1926,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ - (void)rcu_torture_fwd_prog_cbfree(); + (void)rcu_torture_fwd_prog_cbfree(rfp); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { @@ -1952,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + struct rcu_fwd *rfp = &rcu_fwds; + WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(&rcu_fwds); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2); + rcu_torture_fwd_cb_hist(rfp); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); rcu_barrier(); pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); rcu_barrier(); pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + __func__, rcu_torture_fwd_prog_cbfree(rfp)); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@ -2008,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + struct rcu_fwd *rfp = &rcu_fwds; + if (!fwd_progress) return 0; /* Not requested, so don't do it. */ if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || @@ -2022,14 +2026,13 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } - spin_lock_init(&rcu_fwds.rcu_fwd_lock); - rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head; + spin_lock_init(&rfp->rcu_fwd_lock); + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - return torture_create_kthread(rcu_torture_fwd_prog, - &rcu_fwds, fwd_prog_task); + return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } /* Callback function for RCU barrier testing. */ -- cgit From 5155be9994e557618a8312389fb4e52dfbf28a3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Nov 2019 08:35:08 -0800 Subject: rcutorture: Dynamically allocate rcu_fwds structure This commit switches from static structure to dynamic allocation for rcu_fwds as another step towards providing multiple call_rcu() forward-progress kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 394baac98ae0..f77f4d886cc1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1686,7 +1686,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd rcu_fwds; +struct rcu_fwd *rcu_fwds; bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -1952,7 +1952,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = &rcu_fwds; + struct rcu_fwd *rfp = rcu_fwds; WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); @@ -2010,7 +2010,7 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { - struct rcu_fwd *rfp = &rcu_fwds; + struct rcu_fwd *rfp; if (!fwd_progress) return 0; /* Not requested, so don't do it. */ @@ -2026,12 +2026,15 @@ static int __init rcu_torture_fwd_prog_init(void) WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; } - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; + rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); + if (!rfp) + return -ENOMEM; + spin_lock_init(&rfp->rcu_fwd_lock); + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } -- cgit From 07928d9bfc81640bab36f5190e8725894d93b659 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 19 Nov 2019 13:17:31 +0800 Subject: padata: Remove broken queue flushing The function padata_flush_queues is fundamentally broken because it cannot force padata users to complete the request that is underway. IOW padata has to passively wait for the completion of any outstanding work. As it stands flushing is used in two places. Its use in padata_stop is simply unnecessary because nothing depends on the queues to be flushed afterwards. The other use in padata_replace is more substantial as we depend on it to free the old pd structure. This patch instead uses the pd->refcnt to dynamically free the pd structure once all requests are complete. Fixes: 2b73b07ab8a4 ("padata: Flush the padata queues actively") Cc: Signed-off-by: Herbert Xu Reviewed-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index c3fec1413295..da56a235a255 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -35,6 +35,8 @@ #define MAX_OBJ_NUM 1000 +static void padata_free_pd(struct parallel_data *pd); + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; @@ -283,6 +285,7 @@ static void padata_serial_worker(struct work_struct *serial_work) struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); + int cnt; local_bh_disable(); squeue = container_of(serial_work, struct padata_serial_queue, work); @@ -292,6 +295,8 @@ static void padata_serial_worker(struct work_struct *serial_work) list_replace_init(&squeue->serial.list, &local_list); spin_unlock(&squeue->serial.lock); + cnt = 0; + while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -301,9 +306,12 @@ static void padata_serial_worker(struct work_struct *serial_work) list_del_init(&padata->list); padata->serial(padata); - atomic_dec(&pd->refcnt); + cnt++; } local_bh_enable(); + + if (atomic_sub_and_test(cnt, &pd->refcnt)) + padata_free_pd(pd); } /** @@ -440,7 +448,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_squeues(pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); - atomic_set(&pd->refcnt, 0); + atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); @@ -466,29 +474,6 @@ static void padata_free_pd(struct parallel_data *pd) kfree(pd); } -/* Flush all objects out of the padata queues. */ -static void padata_flush_queues(struct parallel_data *pd) -{ - int cpu; - struct padata_parallel_queue *pqueue; - struct padata_serial_queue *squeue; - - for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - flush_work(&pqueue->work); - } - - if (atomic_read(&pd->reorder_objects)) - padata_reorder(pd); - - for_each_cpu(cpu, pd->cpumask.cbcpu) { - squeue = per_cpu_ptr(pd->squeue, cpu); - flush_work(&squeue->work); - } - - BUG_ON(atomic_read(&pd->refcnt) != 0); -} - static void __padata_start(struct padata_instance *pinst) { pinst->flags |= PADATA_INIT; @@ -502,10 +487,6 @@ static void __padata_stop(struct padata_instance *pinst) pinst->flags &= ~PADATA_INIT; synchronize_rcu(); - - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); } /* Replace the internal control structure with a new one. */ @@ -526,8 +507,8 @@ static void padata_replace(struct padata_instance *pinst, if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; - padata_flush_queues(pd_old); - padata_free_pd(pd_old); + if (atomic_dec_and_test(&pd_old->refcnt)) + padata_free_pd(pd_old); if (notification_mask) blocking_notifier_call_chain(&pinst->cpumask_change_notifier, -- cgit From 13380a1471aadc517994b7230371a227d1f9f152 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 20 Nov 2019 06:32:50 +0800 Subject: padata: Remove unused padata_remove_cpu The function padata_remove_cpu was supposed to have been removed along with padata_add_cpu but somehow it remained behind. Let's kill it now as it doesn't even have a prototype anymore. Fixes: 815613da6a67 ("kernel/padata.c: removed unused code") Signed-off-by: Herbert Xu Reviewed-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index da56a235a255..fc00f7e64133 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -718,41 +718,6 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to remove - * @mask: bitmask specifying from which cpumask @cpu should be removed - * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_remove_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_remove_cpu); - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || -- cgit From bbefa1dd6a6d53537c11624752219e39959d04fb Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Nov 2019 15:58:45 +0800 Subject: crypto: pcrypt - Avoid deadlock by using per-instance padata queues If the pcrypt template is used multiple times in an algorithm, then a deadlock occurs because all pcrypt instances share the same padata_instance, which completes requests in the order submitted. That is, the inner pcrypt request waits for the outer pcrypt request while the outer request is already waiting for the inner. This patch fixes this by allocating a set of queues for each pcrypt instance instead of using two global queues. In order to maintain the existing user-space interface, the pinst structure remains global so any sysfs modifications will apply to every pcrypt instance. Note that when an update occurs we have to allocate memory for every pcrypt instance. Should one of the allocations fail we will abort the update without rolling back changes already made. The new per-instance data structure is called padata_shell and is essentially a wrapper around parallel_data. Reproducer: #include #include #include int main() { struct sockaddr_alg addr = { .salg_type = "aead", .salg_name = "pcrypt(pcrypt(rfc4106-gcm-aesni))" }; int algfd, reqfd; char buf[32] = { 0 }; algfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(algfd, (void *)&addr, sizeof(addr)); setsockopt(algfd, SOL_ALG, ALG_SET_KEY, buf, 20); reqfd = accept(algfd, 0, 0); write(reqfd, buf, 32); read(reqfd, buf, 16); } Reported-by: syzbot+56c7151cad94eec37c521f0e47d2eee53f9361c4@syzkaller.appspotmail.com Fixes: 5068c7a883d1 ("crypto: pcrypt - Add pcrypt crypto parallelization wrapper") Signed-off-by: Herbert Xu Tested-by: Eric Biggers Signed-off-by: Herbert Xu --- kernel/padata.c | 236 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 165 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index fc00f7e64133..8c8755f170ca 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -89,7 +89,7 @@ static void padata_parallel_worker(struct work_struct *parallel_work) /** * padata_do_parallel - padata parallelization function * - * @pinst: padata instance + * @ps: padatashell * @padata: object to be parallelized * @cb_cpu: pointer to the CPU that the serialization callback function should * run on. If it's not in the serial cpumask of @pinst @@ -100,16 +100,17 @@ static void padata_parallel_worker(struct work_struct *parallel_work) * Note: Every object which is parallelized by padata_do_parallel * must be seen by padata_do_serial. */ -int padata_do_parallel(struct padata_instance *pinst, +int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { + struct padata_instance *pinst = ps->pinst; int i, cpu, cpu_index, target_cpu, err; struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); - pd = rcu_dereference_bh(pinst->pd); + pd = rcu_dereference_bh(ps->pd); err = -EINVAL; if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) @@ -212,10 +213,10 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, static void padata_reorder(struct parallel_data *pd) { + struct padata_instance *pinst = pd->ps->pinst; int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; - struct padata_instance *pinst = pd->pinst; struct padata_parallel_queue *next_queue; /* @@ -349,36 +350,39 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -static int padata_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static int padata_setup_cpumasks(struct padata_instance *pinst) { struct workqueue_attrs *attrs; + int err; + + attrs = alloc_workqueue_attrs(); + if (!attrs) + return -ENOMEM; + + /* Restrict parallel_wq workers to pd->cpumask.pcpu. */ + cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu); + err = apply_workqueue_attrs(pinst->parallel_wq, attrs); + free_workqueue_attrs(attrs); + + return err; +} + +static int pd_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ int err = -ENOMEM; if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) goto out; - cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) goto free_pcpu_mask; - cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); - attrs = alloc_workqueue_attrs(); - if (!attrs) - goto free_cbcpu_mask; - - /* Restrict parallel_wq workers to pd->cpumask.pcpu. */ - cpumask_copy(attrs->cpumask, pd->cpumask.pcpu); - err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs); - free_workqueue_attrs(attrs); - if (err < 0) - goto free_cbcpu_mask; + cpumask_copy(pd->cpumask.pcpu, pcpumask); + cpumask_copy(pd->cpumask.cbcpu, cbcpumask); return 0; -free_cbcpu_mask: - free_cpumask_var(pd->cpumask.cbcpu); free_pcpu_mask: free_cpumask_var(pd->cpumask.pcpu); out: @@ -422,12 +426,16 @@ static void padata_init_pqueues(struct parallel_data *pd) } /* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) { + struct padata_instance *pinst = ps->pinst; + const struct cpumask *cbcpumask; + const struct cpumask *pcpumask; struct parallel_data *pd; + cbcpumask = pinst->rcpumask.cbcpu; + pcpumask = pinst->rcpumask.pcpu; + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); if (!pd) goto err; @@ -440,8 +448,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, if (!pd->squeue) goto err_free_pqueue; - pd->pinst = pinst; - if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + pd->ps = ps; + if (pd_setup_cpumasks(pd, pcpumask, cbcpumask)) goto err_free_squeue; padata_init_pqueues(pd); @@ -490,32 +498,64 @@ static void __padata_stop(struct padata_instance *pinst) } /* Replace the internal control structure with a new one. */ -static void padata_replace(struct padata_instance *pinst, - struct parallel_data *pd_new) +static int padata_replace_one(struct padata_shell *ps) { - struct parallel_data *pd_old = pinst->pd; - int notification_mask = 0; + struct parallel_data *pd_new; - pinst->flags |= PADATA_RESET; + pd_new = padata_alloc_pd(ps); + if (!pd_new) + return -ENOMEM; - rcu_assign_pointer(pinst->pd, pd_new); + ps->opd = rcu_dereference_protected(ps->pd, 1); + rcu_assign_pointer(ps->pd, pd_new); - synchronize_rcu(); + return 0; +} + +static int padata_replace(struct padata_instance *pinst, int cpu) +{ + int notification_mask = 0; + struct padata_shell *ps; + int err; + + pinst->flags |= PADATA_RESET; - if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + cpumask_copy(pinst->omask, pinst->rcpumask.pcpu); + cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, + cpu_online_mask); + if (cpu >= 0) + cpumask_clear_cpu(cpu, pinst->rcpumask.pcpu); + if (!cpumask_equal(pinst->omask, pinst->rcpumask.pcpu)) notification_mask |= PADATA_CPU_PARALLEL; - if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + + cpumask_copy(pinst->omask, pinst->rcpumask.cbcpu); + cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, + cpu_online_mask); + if (cpu >= 0) + cpumask_clear_cpu(cpu, pinst->rcpumask.cbcpu); + if (!cpumask_equal(pinst->omask, pinst->rcpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; - if (atomic_dec_and_test(&pd_old->refcnt)) - padata_free_pd(pd_old); + list_for_each_entry(ps, &pinst->pslist, list) { + err = padata_replace_one(ps); + if (err) + break; + } + + synchronize_rcu(); + + list_for_each_entry_continue_reverse(ps, &pinst->pslist, list) + if (atomic_dec_and_test(&ps->opd->refcnt)) + padata_free_pd(ps->opd); if (notification_mask) blocking_notifier_call_chain(&pinst->cpumask_change_notifier, notification_mask, - &pd_new->cpumask); + &pinst->cpumask); pinst->flags &= ~PADATA_RESET; + + return err; } /** @@ -568,7 +608,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t cbcpumask) { int valid; - struct parallel_data *pd; + int err; valid = padata_validate_cpumask(pinst, pcpumask); if (!valid) { @@ -581,19 +621,15 @@ static int __padata_set_cpumasks(struct padata_instance *pinst, __padata_stop(pinst); out_replace: - pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); - if (!pd) - return -ENOMEM; - cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - padata_replace(pinst, pd); + err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst, -1); if (valid) __padata_start(pinst); - return 0; + return err; } /** @@ -676,46 +712,32 @@ EXPORT_SYMBOL(padata_stop); static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + int err = 0; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, - pinst->cpumask.cbcpu); - if (!pd) - return -ENOMEM; - - padata_replace(pinst, pd); + err = padata_replace(pinst, -1); if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) __padata_start(pinst); } - return 0; + return err; } static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd = NULL; + int err = 0; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) __padata_stop(pinst); - pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, - pinst->cpumask.cbcpu); - if (!pd) - return -ENOMEM; - - padata_replace(pinst, pd); - - cpumask_clear_cpu(cpu, pd->cpumask.cbcpu); - cpumask_clear_cpu(cpu, pd->cpumask.pcpu); + err = padata_replace(pinst, cpu); } - return 0; + return err; } static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) @@ -763,8 +785,12 @@ static void __padata_free(struct padata_instance *pinst) cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif + WARN_ON(!list_empty(&pinst->pslist)); + padata_stop(pinst); - padata_free_pd(pinst->pd); + free_cpumask_var(pinst->omask); + free_cpumask_var(pinst->rcpumask.cbcpu); + free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); destroy_workqueue(pinst->serial_wq); @@ -911,7 +937,6 @@ static struct padata_instance *padata_alloc(const char *name, const struct cpumask *cbcpumask) { struct padata_instance *pinst; - struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) @@ -939,14 +964,22 @@ static struct padata_instance *padata_alloc(const char *name, !padata_validate_cpumask(pinst, cbcpumask)) goto err_free_masks; - pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); - if (!pd) + if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL)) goto err_free_masks; + if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) + goto err_free_rcpumask_pcpu; + if (!alloc_cpumask_var(&pinst->omask, GFP_KERNEL)) + goto err_free_rcpumask_cbcpu; - rcu_assign_pointer(pinst->pd, pd); + INIT_LIST_HEAD(&pinst->pslist); cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask); + cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); + + if (padata_setup_cpumasks(pinst)) + goto err_free_omask; pinst->flags = 0; @@ -962,6 +995,12 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; +err_free_omask: + free_cpumask_var(pinst->omask); +err_free_rcpumask_cbcpu: + free_cpumask_var(pinst->rcpumask.cbcpu); +err_free_rcpumask_pcpu: + free_cpumask_var(pinst->rcpumask.pcpu); err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); @@ -1000,6 +1039,61 @@ void padata_free(struct padata_instance *pinst) } EXPORT_SYMBOL(padata_free); +/** + * padata_alloc_shell - Allocate and initialize padata shell. + * + * @pinst: Parent padata_instance object. + */ +struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) +{ + struct parallel_data *pd; + struct padata_shell *ps; + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto out; + + ps->pinst = pinst; + + get_online_cpus(); + pd = padata_alloc_pd(ps); + put_online_cpus(); + + if (!pd) + goto out_free_ps; + + mutex_lock(&pinst->lock); + RCU_INIT_POINTER(ps->pd, pd); + list_add(&ps->list, &pinst->pslist); + mutex_unlock(&pinst->lock); + + return ps; + +out_free_ps: + kfree(ps); +out: + return NULL; +} +EXPORT_SYMBOL(padata_alloc_shell); + +/** + * padata_free_shell - free a padata shell + * + * @ps: padata shell to free + */ +void padata_free_shell(struct padata_shell *ps) +{ + struct padata_instance *pinst = ps->pinst; + + mutex_lock(&pinst->lock); + list_del(&ps->list); + padata_free_pd(rcu_dereference_protected(ps->pd, 1)); + mutex_unlock(&pinst->lock); + + kfree(ps); +} +EXPORT_SYMBOL(padata_free_shell); + #ifdef CONFIG_HOTPLUG_CPU static __init int padata_driver_init(void) -- cgit From 894c9ef9780c5cf2f143415e867ee39a33ecb75d Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:10 -0500 Subject: padata: validate cpumask without removed CPU during offline Configuring an instance's parallel mask without any online CPUs... echo 2 > /sys/kernel/pcrypt/pencrypt/parallel_cpumask echo 0 > /sys/devices/system/cpu/cpu1/online ...makes tcrypt mode=215 crash like this: divide error: 0000 [#1] SMP PTI CPU: 4 PID: 283 Comm: modprobe Not tainted 5.4.0-rc8-padata-doc-v2+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20191013_105130-anatol 04/01/2014 RIP: 0010:padata_do_parallel+0x114/0x300 Call Trace: pcrypt_aead_encrypt+0xc0/0xd0 [pcrypt] crypto_aead_encrypt+0x1f/0x30 do_mult_aead_op+0x4e/0xdf [tcrypt] test_mb_aead_speed.constprop.0.cold+0x226/0x564 [tcrypt] do_test+0x28c2/0x4d49 [tcrypt] tcrypt_mod_init+0x55/0x1000 [tcrypt] ... cpumask_weight() in padata_cpu_hash() returns 0 because the mask has no CPUs. The problem is __padata_remove_cpu() checks for valid masks too early and so doesn't mark the instance PADATA_INVALID as expected, which would have made padata_do_parallel() return error before doing the division. Fix by introducing a second padata CPU hotplug state before CPUHP_BRINGUP_CPU so that __padata_remove_cpu() sees the online mask without @cpu. No need for the second argument to padata_replace() since @cpu is now already missing from the online mask. Fixes: 33e54450683c ("padata: Handle empty padata cpumasks") Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Sebastian Andrzej Siewior Cc: Steffen Klassert Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 8c8755f170ca..1e6500d64846 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -512,7 +512,7 @@ static int padata_replace_one(struct padata_shell *ps) return 0; } -static int padata_replace(struct padata_instance *pinst, int cpu) +static int padata_replace(struct padata_instance *pinst) { int notification_mask = 0; struct padata_shell *ps; @@ -523,16 +523,12 @@ static int padata_replace(struct padata_instance *pinst, int cpu) cpumask_copy(pinst->omask, pinst->rcpumask.pcpu); cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); - if (cpu >= 0) - cpumask_clear_cpu(cpu, pinst->rcpumask.pcpu); if (!cpumask_equal(pinst->omask, pinst->rcpumask.pcpu)) notification_mask |= PADATA_CPU_PARALLEL; cpumask_copy(pinst->omask, pinst->rcpumask.cbcpu); cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - if (cpu >= 0) - cpumask_clear_cpu(cpu, pinst->rcpumask.cbcpu); if (!cpumask_equal(pinst->omask, pinst->rcpumask.cbcpu)) notification_mask |= PADATA_CPU_SERIAL; @@ -624,7 +620,7 @@ out_replace: cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst, -1); + err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst); if (valid) __padata_start(pinst); @@ -715,7 +711,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) int err = 0; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - err = padata_replace(pinst, -1); + err = padata_replace(pinst); if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) @@ -729,12 +725,12 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { int err = 0; - if (cpumask_test_cpu(cpu, cpu_online_mask)) { + if (!cpumask_test_cpu(cpu, cpu_online_mask)) { if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) __padata_stop(pinst); - err = padata_replace(pinst, cpu); + err = padata_replace(pinst); } return err; @@ -761,7 +757,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) return ret; } -static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) +static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct padata_instance *pinst; int ret; @@ -782,6 +778,7 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif @@ -989,6 +986,8 @@ static struct padata_instance *padata_alloc(const char *name, #ifdef CONFIG_HOTPLUG_CPU cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, + &pinst->node); #endif put_online_cpus(); @@ -1101,17 +1100,24 @@ static __init int padata_driver_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", - padata_cpu_online, - padata_cpu_prep_down); + padata_cpu_online, NULL); if (ret < 0) return ret; hp_online = ret; + + ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", + NULL, padata_cpu_dead); + if (ret < 0) { + cpuhp_remove_multi_state(hp_online); + return ret; + } return 0; } module_init(padata_driver_init); static __exit void padata_driver_exit(void) { + cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); cpuhp_remove_multi_state(hp_online); } module_exit(padata_driver_exit); -- cgit From 38228e8848cd7dd86ccb90406af32de0cad24be3 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:11 -0500 Subject: padata: always acquire cpu_hotplug_lock before pinst->lock lockdep complains when padata's paths to update cpumasks via CPU hotplug and sysfs are both taken: # echo 0 > /sys/devices/system/cpu/cpu1/online # echo ff > /sys/kernel/pcrypt/pencrypt/parallel_cpumask ====================================================== WARNING: possible circular locking dependency detected 5.4.0-rc8-padata-cpuhp-v3+ #1 Not tainted ------------------------------------------------------ bash/205 is trying to acquire lock: ffffffff8286bcd0 (cpu_hotplug_lock.rw_sem){++++}, at: padata_set_cpumask+0x2b/0x120 but task is already holding lock: ffff8880001abfa0 (&pinst->lock){+.+.}, at: padata_set_cpumask+0x26/0x120 which lock already depends on the new lock. padata doesn't take cpu_hotplug_lock and pinst->lock in a consistent order. Which should be first? CPU hotplug calls into padata with cpu_hotplug_lock already held, so it should have priority. Fixes: 6751fb3c0e0c ("padata: Use get_online_cpus/put_online_cpus") Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 1e6500d64846..f5964f015139 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -643,8 +643,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, struct cpumask *serial_mask, *parallel_mask; int err = -EINVAL; - mutex_lock(&pinst->lock); get_online_cpus(); + mutex_lock(&pinst->lock); switch (cpumask_type) { case PADATA_CPU_PARALLEL: @@ -662,8 +662,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: - put_online_cpus(); mutex_unlock(&pinst->lock); + put_online_cpus(); return err; } -- cgit From 91a71d612128f84f725022d7b7c5d5a741f6fdc7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:12 -0500 Subject: padata: remove cpumask change notifier Since commit 63d3578892dc ("crypto: pcrypt - remove padata cpumask notifier") this feature is unused, so get rid of it. Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Jonathan Corbet Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 52 +--------------------------------------------------- 1 file changed, 1 insertion(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index f5964f015139..bc594c00b26e 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -514,23 +514,16 @@ static int padata_replace_one(struct padata_shell *ps) static int padata_replace(struct padata_instance *pinst) { - int notification_mask = 0; struct padata_shell *ps; int err; pinst->flags |= PADATA_RESET; - cpumask_copy(pinst->omask, pinst->rcpumask.pcpu); cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); - if (!cpumask_equal(pinst->omask, pinst->rcpumask.pcpu)) - notification_mask |= PADATA_CPU_PARALLEL; - cpumask_copy(pinst->omask, pinst->rcpumask.cbcpu); cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - if (!cpumask_equal(pinst->omask, pinst->rcpumask.cbcpu)) - notification_mask |= PADATA_CPU_SERIAL; list_for_each_entry(ps, &pinst->pslist, list) { err = padata_replace_one(ps); @@ -544,48 +537,11 @@ static int padata_replace(struct padata_instance *pinst) if (atomic_dec_and_test(&ps->opd->refcnt)) padata_free_pd(ps->opd); - if (notification_mask) - blocking_notifier_call_chain(&pinst->cpumask_change_notifier, - notification_mask, - &pinst->cpumask); - pinst->flags &= ~PADATA_RESET; return err; } -/** - * padata_register_cpumask_notifier - Registers a notifier that will be called - * if either pcpu or cbcpu or both cpumasks change. - * - * @pinst: A poineter to padata instance - * @nblock: A pointer to notifier block. - */ -int padata_register_cpumask_notifier(struct padata_instance *pinst, - struct notifier_block *nblock) -{ - return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, - nblock); -} -EXPORT_SYMBOL(padata_register_cpumask_notifier); - -/** - * padata_unregister_cpumask_notifier - Unregisters cpumask notifier - * registered earlier using padata_register_cpumask_notifier - * - * @pinst: A pointer to data instance. - * @nlock: A pointer to notifier block. - */ -int padata_unregister_cpumask_notifier(struct padata_instance *pinst, - struct notifier_block *nblock) -{ - return blocking_notifier_chain_unregister( - &pinst->cpumask_change_notifier, - nblock); -} -EXPORT_SYMBOL(padata_unregister_cpumask_notifier); - - /* If cpumask contains no active cpu, we mark the instance as invalid. */ static bool padata_validate_cpumask(struct padata_instance *pinst, const struct cpumask *cpumask) @@ -785,7 +741,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); padata_stop(pinst); - free_cpumask_var(pinst->omask); free_cpumask_var(pinst->rcpumask.cbcpu); free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); @@ -965,8 +920,6 @@ static struct padata_instance *padata_alloc(const char *name, goto err_free_masks; if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) goto err_free_rcpumask_pcpu; - if (!alloc_cpumask_var(&pinst->omask, GFP_KERNEL)) - goto err_free_rcpumask_cbcpu; INIT_LIST_HEAD(&pinst->pslist); @@ -976,11 +929,10 @@ static struct padata_instance *padata_alloc(const char *name, cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_omask; + goto err_free_rcpumask_cbcpu; pinst->flags = 0; - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); @@ -994,8 +946,6 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; -err_free_omask: - free_cpumask_var(pinst->omask); err_free_rcpumask_cbcpu: free_cpumask_var(pinst->rcpumask.cbcpu); err_free_rcpumask_pcpu: -- cgit From 3facced7aeed131c1002b724e488d68ebe59c56f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:13 -0500 Subject: padata: remove reorder_objects reorder_objects is unused since the rework of padata's flushing, so remove it. Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index bc594c00b26e..db950d287b3d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -202,7 +202,6 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, if (remove_object) { list_del_init(&padata->list); - atomic_dec(&pd->reorder_objects); ++pd->processed; pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); } @@ -336,7 +335,6 @@ void padata_do_serial(struct padata_priv *padata) if (cur->seq_nr < padata->seq_nr) break; list_add(&padata->list, &cur->list); - atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); /* @@ -455,7 +453,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_pqueues(pd); padata_init_squeues(pd); atomic_set(&pd->seq_nr, -1); - atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 1); spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); -- cgit From bfcdcef8c8e3469f4d6c082a1da27a6ef77e5715 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 3 Dec 2019 14:31:14 -0500 Subject: padata: update documentation Remove references to unused functions, standardize language, update to reflect new functionality, migrate to rst format, and fix all kernel-doc warnings. Fixes: 815613da6a67 ("kernel/padata.c: removed unused code") Signed-off-by: Daniel Jordan Cc: Eric Biggers Cc: Herbert Xu Cc: Jonathan Corbet Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Jordan Signed-off-by: Herbert Xu --- kernel/padata.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index db950d287b3d..72777c10bb9c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -2,7 +2,7 @@ /* * padata.c - generic interface to process data streams in parallel * - * See Documentation/padata.txt for an api documentation. + * See Documentation/core-api/padata.rst for more information. * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert @@ -99,6 +99,8 @@ static void padata_parallel_worker(struct work_struct *parallel_work) * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel * must be seen by padata_do_serial. + * + * Return: 0 on success or else negative error code. */ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) @@ -163,14 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel); /* * padata_find_next - Find the next object that needs serialization. * - * Return values are: - * - * A pointer to the control struct of the next object that needs - * serialization, if present in one of the percpu reorder queues. - * - * NULL, if the next object that needs serialization will - * be parallel processed by another cpu and is not yet present in - * the cpu's reorder queue. + * Return: + * * A pointer to the control struct of the next object that needs + * serialization, if present in one of the percpu reorder queues. + * * NULL, if the next object that needs serialization will + * be parallel processed by another cpu and is not yet present in + * the cpu's reorder queue. */ static struct padata_priv *padata_find_next(struct parallel_data *pd, bool remove_object) @@ -582,13 +582,14 @@ out_replace: } /** - * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value - * equivalent to @cpumask. - * + * padata_set_cpumask - Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. * @pinst: padata instance * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding * to parallel and serial cpumasks respectively. * @cpumask: the cpumask to use + * + * Return: 0 on success or negative error code */ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask) @@ -626,6 +627,8 @@ EXPORT_SYMBOL(padata_set_cpumask); * padata_start - start the parallel processing * * @pinst: padata instance to start + * + * Return: 0 on success or negative error code */ int padata_start(struct padata_instance *pinst) { @@ -880,6 +883,8 @@ static struct kobj_type padata_attr_type = { * @name: used to identify the instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Return: new instance on success, NULL on error */ static struct padata_instance *padata_alloc(const char *name, const struct cpumask *pcpumask, @@ -967,6 +972,8 @@ err: * parallel workers. * * @name: used to identify the instance + * + * Return: new instance on success, NULL on error */ struct padata_instance *padata_alloc_possible(const char *name) { @@ -977,7 +984,7 @@ EXPORT_SYMBOL(padata_alloc_possible); /** * padata_free - free a padata instance * - * @padata_inst: padata instance to free + * @pinst: padata instance to free */ void padata_free(struct padata_instance *pinst) { @@ -989,6 +996,8 @@ EXPORT_SYMBOL(padata_free); * padata_alloc_shell - Allocate and initialize padata shell. * * @pinst: Parent padata_instance object. + * + * Return: new shell on success, NULL on error */ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) { -- cgit From bae141f54be83b06652c1d47e50e4e75ed4e9c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 6 Dec 2019 22:49:34 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 ... ---- time->Wed Nov 27 16:04:13 2019 type=PROCTITLE msg=audit(1574867053.120:84664): proctitle="./bpf" type=SYSCALL msg=audit(1574867053.120:84664): arch=c000003e syscall=321 \ success=yes exit=3 a0=5 a1=7ffea484fbe0 a2=70 a3=0 items=0 ppid=7477 \ pid=12698 auid=1001 uid=1001 gid=1001 euid=1001 suid=1001 fsuid=1001 \ egid=1001 sgid=1001 fsgid=1001 tty=pts2 ses=4 comm="bpf" \ exe="/home/jolsa/auditd/audit-testsuite/tests/bpf/bpf" \ subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574867053.120:84664): prog-id=76 op=LOAD ---- time->Wed Nov 27 16:04:13 2019 type=UNKNOWN[1334] msg=audit(1574867053.120:84665): prog-id=76 op=UNLOAD ... Signed-off-by: Daniel Borkmann Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Acked-by: Paul Moore Link: https://lore.kernel.org/bpf/20191206214934.11319-1-jolsa@kernel.org --- kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3461ec59570..66b90eaf99fe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1306,6 +1307,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } +enum bpf_audit { + BPF_AUDIT_LOAD, + BPF_AUDIT_UNLOAD, + BPF_AUDIT_MAX, +}; + +static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { + [BPF_AUDIT_LOAD] = "LOAD", + [BPF_AUDIT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) +{ + struct audit_context *ctx = NULL; + struct audit_buffer *ab; + + if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) + return; + if (audit_enabled == AUDIT_OFF) + return; + if (op == BPF_AUDIT_LOAD) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + audit_log_format(ab, "prog-id=%u op=%s", + prog->aux->id, bpf_audit_str[op]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1421,6 +1452,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1830,6 +1862,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) -- cgit From 81c22041d9f19df07b9cba95e3cd02e0f41bc1e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Dec 2019 16:08:03 +0100 Subject: bpf, x86, arm64: Enable jit by default when not built as always-on After Spectre 2 fix via 290af86629b2 ("bpf: introduce BPF_JIT_ALWAYS_ON config") most major distros use BPF_JIT_ALWAYS_ON configuration these days which compiles out the BPF interpreter entirely and always enables the JIT. Also given recent fix in e1608f3fa857 ("bpf: Avoid setting bpf insns pages read-only when prog is jited"), we additionally avoid fragmenting the direct map for the BPF insns pages sitting in the general data heap since they are not used during execution. Latter is only needed when run through the interpreter. Since both x86 and arm64 JITs have seen a lot of exposure over the years, are generally most up to date and maintained, there is more downside in !BPF_JIT_ALWAYS_ON configurations to have the interpreter enabled by default rather than the JIT. Add a ARCH_WANT_DEFAULT_BPF_JIT config which archs can use to set the bpf_jit_{enable,kallsyms} to 1. Back in the days the bpf_jit_kallsyms knob was set to 0 by default since major distros still had /proc/kallsyms addresses exposed to unprivileged user space which is not the case anymore. Hence both knobs are set via BPF_JIT_DEFAULT_ON which is set to 'y' in case of BPF_JIT_ALWAYS_ON or ARCH_WANT_DEFAULT_BPF_JIT. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Will Deacon Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/f78ad24795c2966efcc2ee19025fa3459f622185.1575903816.git.daniel@iogearbox.net --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..2ff01a716128 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -520,9 +520,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ -int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; -int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; static __always_inline void -- cgit From c30fe541896440667bd9e9068aedd1d440fbbcd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Oct 2019 21:40:09 -0700 Subject: rcu: Mark non-global functions and variables as static Each of rcu_state, rcu_rnp_online_cpus(), rcu_dynticks_curr_cpu_in_eqs(), and rcu_dynticks_snap() are used only in the kernel/rcu/tree.o translation unit, and may thus be marked static. This commit therefore makes this change. Reported-by: Ben Dooks Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..dd8cfc34f4da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -struct rcu_state rcu_state = { +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { return READ_ONCE(rnp->qsmaskinitnext); } @@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(struct rcu_data *rdp) { int snap = atomic_add_return(0, &rdp->dynticks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..e4dc5debfc84 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -403,8 +403,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -int rcu_dynticks_snap(struct rcu_data *rdp); - /* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); -- cgit From 98e8627efcada18ac043a77b9101b4b4c768090b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 13 Dec 2019 18:51:07 +0100 Subject: bpf: Move trampoline JIT image allocation to a function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the image allocation in the BPF trampoline code into a separate function, so it can be shared with the BPF dispatcher in upcoming commits. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-2-bjorn.topel@gmail.com --- kernel/bpf/trampoline.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7e89f1f49d77..5ee301ddbd00 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -13,6 +13,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +void *bpf_jit_alloc_exec_page(void) +{ + void *image; + + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return NULL; + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + return image; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -33,7 +49,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -47,12 +63,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. - */ - set_memory_x((long)image, 1); tr->image = image; out: mutex_unlock(&trampoline_mutex); -- cgit From 75ccbef6369e94ecac696a152a998a978d41376b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 13 Dec 2019 18:51:08 +0100 Subject: bpf: Introduce BPF dispatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF dispatcher is a multi-way branch code generator, mainly targeted for XDP programs. When an XDP program is executed via the bpf_prog_run_xdp(), it is invoked via an indirect call. The indirect call has a substantial performance impact, when retpolines are enabled. The dispatcher transform indirect calls to direct calls, and therefore avoids the retpoline. The dispatcher is generated using the BPF JIT, and relies on text poking provided by bpf_arch_text_poke(). The dispatcher hijacks a trampoline function it via the __fentry__ nop of the trampoline. One dispatcher instance currently supports up to 64 dispatch points. A user creates a dispatcher with its corresponding trampoline with the DEFINE_BPF_DISPATCHER macro. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-3-bjorn.topel@gmail.com --- kernel/bpf/Makefile | 1 + kernel/bpf/dispatcher.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 kernel/bpf/dispatcher.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3f671bf617e8..d4f330351f87 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c new file mode 100644 index 000000000000..204ee61a3904 --- /dev/null +++ b/kernel/bpf/dispatcher.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2019 Intel Corporation. */ + +#include +#include +#include + +/* The BPF dispatcher is a multiway branch code generator. The + * dispatcher is a mechanism to avoid the performance penalty of an + * indirect call, which is expensive when retpolines are enabled. A + * dispatch client registers a BPF program into the dispatcher, and if + * there is available room in the dispatcher a direct call to the BPF + * program will be generated. All calls to the BPF programs called via + * the dispatcher will then be a direct call, instead of an + * indirect. The dispatcher hijacks a trampoline function it via the + * __fentry__ of the trampoline. The trampoline function has the + * following signature: + * + * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, + * unsigned int (*bpf_func)(const void *, + * const struct bpf_insn *)); + */ + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( + struct bpf_dispatcher *d, struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (prog == d->progs[i].prog) + return &d->progs[i]; + } + return NULL; +} + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( + struct bpf_dispatcher *d) +{ + return bpf_dispatcher_find_prog(d, NULL); +} + +static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (entry) { + refcount_inc(&entry->users); + return false; + } + + entry = bpf_dispatcher_find_free(d); + if (!entry) + return false; + + bpf_prog_inc(prog); + entry->prog = prog; + refcount_set(&entry->users, 1); + d->num_progs++; + return true; +} + +static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (!entry) + return false; + + if (refcount_dec_and_test(&entry->users)) { + entry->prog = NULL; + bpf_prog_put(prog); + d->num_progs--; + return true; + } + return false; +} + +int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + return -ENOTSUPP; +} + +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +{ + s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (d->progs[i].prog) + *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; + } + return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); +} + +static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) +{ + void *old, *new; + u32 noff; + int err; + + if (!prev_num_progs) { + old = NULL; + noff = 0; + } else { + old = d->image + d->image_off; + noff = d->image_off ^ (PAGE_SIZE / 2); + } + + new = d->num_progs ? d->image + noff : NULL; + if (new) { + if (bpf_dispatcher_prepare(d, new)) + return; + } + + err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); + if (err || !new) + return; + + d->image_off = noff; +} + +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to) +{ + bool changed = false; + int prev_num_progs; + + if (from == to) + return; + + mutex_lock(&d->mutex); + if (!d->image) { + d->image = bpf_jit_alloc_exec_page(); + if (!d->image) + goto out; + } + + prev_num_progs = d->num_progs; + changed |= bpf_dispatcher_remove_prog(d, from); + changed |= bpf_dispatcher_add_prog(d, to); + + if (!changed) + goto out; + + bpf_dispatcher_update(d, prev_num_progs); +out: + mutex_unlock(&d->mutex); +} -- cgit From 7e6897f95935973c3253fd756135b5ea58043dc8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 13 Dec 2019 18:51:09 +0100 Subject: bpf, xdp: Start using the BPF dispatcher for XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a BPF dispatcher for XDP. The dispatcher is updated from the XDP control-path, dev_xdp_install(), and used when an XDP program is run via bpf_prog_run_xdp(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-4-bjorn.topel@gmail.com --- kernel/bpf/syscall.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 66b90eaf99fe..b08c362f4e02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2338,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id -static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; - u32 id = attr->prog_id; - int fd; - - if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); @@ -2357,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); + return prog; +} + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit From 7c2e8bbd87db661122e92d71a394dd7bb3ada4d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 3 Dec 2019 17:01:05 +0100 Subject: sched: Spare resched IPI when prio changes on a single fair task The runqueue of a fair task being remotely reniced is going to get a resched IPI in order to reassess which task should be the current running on the CPU. However that evaluation is useless if the fair task is running alone, in which case we can spare that IPI, preventing nohz_full CPUs from being disturbed. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-2-frederic@kernel.org --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08a233e97a01..846f50bd0c0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10322,6 +10322,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on -- cgit From 5443a0be6121d557e12951537e10159e4c61035d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 3 Dec 2019 17:01:06 +0100 Subject: sched: Use fair:prio_changed() instead of ad-hoc implementation set_user_nice() implements its own version of fair::prio_changed() and therefore misses a specific optimization towards nohz_full CPUs that avoid sending an resched IPI to a reniced task running alone. Use the proper callback instead. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-3-frederic@kernel.org --- kernel/sched/core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 90e4b00ace89..15508c202bf5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4540,17 +4540,17 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (queued) { + if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); - } if (running) set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); + out_unlock: task_rq_unlock(rq, p, &rf); } -- cgit From cde65194502778665c1b52afc5722cf7dbfaa399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Dec 2019 20:19:03 +0100 Subject: sched/wait: fix ___wait_var_event(exclusive) init_wait_var_entry() forgets to initialize wq_entry->flags. Currently not a problem, we don't have wait_var_event_exclusive(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Vincent Guittot Cc: Ingo Molnar Cc: Felipe Balbi Cc: Linus Torvalds Cc: Miklos Szeredi Cc: Juri Lelli Link: https://lkml.kernel.org/r/20191210191902.GB14449@redhat.com --- kernel/sched/wait_bit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 45eba18a2898..02ce292b9bc0 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -179,6 +179,7 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int .bit_nr = -1, }, .wq_entry = { + .flags = flags, .private = current, .func = var_wake_function, .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), -- cgit From 45178ac0cea853fe0e405bf11e101bdebea57b15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Dec 2019 09:34:54 +0100 Subject: cpu/hotplug, stop_machine: Fix stop_machine vs hotplug order Paul reported a very sporadic, rcutorture induced, workqueue failure. When the planets align, the workqueue rescuer's self-migrate fails and then triggers a WARN for running a work on the wrong CPU. Tejun then figured that set_cpus_allowed_ptr()'s stop_one_cpu() call could be ignored! When stopper->enabled is false, stop_machine will insta complete the work, without actually doing the work. Worse, it will not WARN about this (we really should fix this). It turns out there is a small window where a freshly online'ed CPU is marked 'online' but doesn't yet have the stopper task running: BP AP bringup_cpu() __cpu_up(cpu, idle) --> start_secondary() ... cpu_startup_entry() bringup_wait_for_ap() wait_for_ap_thread() <-- cpuhp_online_idle() while (1) do_idle() ... available to run kthreads ... stop_machine_unpark() stopper->enable = true; Close this by moving the stop_machine_unpark() into cpuhp_online_idle(), such that the stopper thread is ready before we start the idle loop and schedule. Reported-by: "Paul E. McKenney" Debugged-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Tested-by: "Paul E. McKenney" --- kernel/cpu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..e7f79674824d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -525,8 +525,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; - /* Unpark the stopper thread and the hotplug thread of the target cpu */ - stop_machine_unpark(cpu); + /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* @@ -1089,8 +1088,8 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. Wake up the controlling task which brings the - * stopper and the hotplug thread of the upcoming CPU up and then delegates - * the rest of the online bringup to the hotplug thread. + * hotplug thread of the upcoming CPU up and then delegates the rest of the + * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { @@ -1100,6 +1099,12 @@ void cpuhp_online_idle(enum cpuhp_state state) if (state != CPUHP_AP_ONLINE_IDLE) return; + /* + * Unpart the stopper thread before we start the idle loop (and start + * scheduling); this ensures the stopper task is always available. + */ + stop_machine_unpark(smp_processor_id()); + st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } -- cgit From 60588bfa223ff675b95f866249f90616613fbe31 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 13 Dec 2019 10:45:30 +0800 Subject: sched/fair: Optimize select_idle_cpu select_idle_cpu() will scan the LLC domain for idle CPUs, it's always expensive. so the next commit : 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()") introduces a way to limit how many CPUs we scan. But it consume some CPUs out of 'nr' that are not allowed for the task and thus waste our attempts. The function always return nr_cpumask_bits, and we can't find a CPU which our task is allowed to run. Cpumask may be too big, similar to select_idle_core(), use per_cpu_ptr 'select_idle_mask' to prevent stack overflow. Fixes: 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()") Signed-off-by: Cheng Jian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20191213024530.28052-1-cj.chengjian@huawei.com --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 846f50bd0c0b..280d54ccb4be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5828,6 +5828,7 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; @@ -5859,11 +5860,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = cpu_clock(this); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return si_cpu; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; if (available_idle_cpu(cpu)) break; if (si_cpu == -1 && sched_idle_cpu(cpu)) -- cgit From d040e0734fb3dedfe24c3d94f5a32b4812eca610 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Fri, 13 Dec 2019 11:45:40 +0800 Subject: schied/fair: Skip calculating @contrib without load Because of the: if (!load) runnable = running = 0; clause in ___update_load_sum(), all the actual users of @contrib in accumulate_sum(): if (load) sa->load_sum += load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; don't happen, and therefore we don't care what @contrib actually is and calculating it is pointless. If we count the times when @load equals zero and not as below: if (load) { load_is_not_zero_count++; contrib = __accumulate_pelt_segments(periods, 1024 - sa->period_contrib,delta); } else load_is_zero_count++; As we can see, load_is_zero_count is much bigger than load_is_zero_count, and the gap is gradually widening: load_is_zero_count: 6016044 times load_is_not_zero_count: 244316 times 19:50:43 up 1 min, 1 user, load average: 0.09, 0.06, 0.02 load_is_zero_count: 7956168 times load_is_not_zero_count: 261472 times 19:51:42 up 2 min, 1 user, load average: 0.03, 0.05, 0.01 load_is_zero_count: 10199896 times load_is_not_zero_count: 278364 times 19:52:51 up 3 min, 1 user, load average: 0.06, 0.05, 0.01 load_is_zero_count: 14333700 times load_is_not_zero_count: 318424 times 19:54:53 up 5 min, 1 user, load average: 0.01, 0.03, 0.00 Perhaps we can gain some performance advantage by saving these unnecessary calculation. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot < vincent.guittot@linaro.org> Link: https://lkml.kernel.org/r/1576208740-35609-1-git-send-email-rocking@linux.alibaba.com --- kernel/sched/pelt.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a96db50d40e0..bd006b79b360 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -129,8 +129,20 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * Step 2 */ delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); + if (load) { + /* + * This relies on the: + * + * if (!load) + * runnable = running = 0; + * + * clause from ___update_load_sum(); this results in + * the below usage of @contrib to dissapear entirely, + * so no point in calculating it. + */ + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } } sa->period_contrib = delta; @@ -205,7 +217,9 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, * this happens during idle_balance() which calls - * update_blocked_averages() + * update_blocked_averages(). + * + * Also see the comment in accumulate_sum(). */ if (!load) runnable = running = 0; -- cgit From a5e37de90e67ac1072a9a44bd0cec9f5e98ded08 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 14 Dec 2019 19:51:07 +0000 Subject: stop_machine: remove try_stop_cpus helper try_stop_cpus is not used after this: commit c190c3b16c0f ("rcu: Switch synchronize_sched_expedited() to stop_one_cpu()") So remove it. Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191214195107.26480-1-tiny.windzz@gmail.com --- kernel/stop_machine.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1fe34a9fabc2..5d68ec4c4015 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -453,36 +453,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) return ret; } -/** - * try_stop_cpus - try to stop multiple cpus - * @cpumask: cpus to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Identical to stop_cpus() except that it fails with -EAGAIN if - * someone else is already using the facility. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -EAGAIN if someone else is already stopping cpus, -ENOENT if - * @fn(@arg) was not executed at all because all cpus in @cpumask were - * offline; otherwise, 0 if all executions of @fn returned 0, any non - * zero return value if any returned non zero. - */ -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -{ - int ret; - - /* static works are used, process one request at a time */ - if (!mutex_trylock(&stop_cpus_mutex)) - return -EAGAIN; - ret = __stop_cpus(cpumask, fn, arg); - mutex_unlock(&stop_cpus_mutex); - return ret; -} - static int cpu_stop_should_run(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); -- cgit From 5bf2fc1f9c88397b125d5ec5f65b1ed9300ba59d Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Thu, 19 Dec 2019 11:57:35 -0600 Subject: bpf: Remove unnecessary assertion on fp_old The two callers of bpf_prog_realloc - bpf_patch_insn_single and bpf_migrate_filter dereference the struct fp_old, before passing it to the function. Thus assertion to check fp_old is unnecessary and can be removed. Signed-off-by: Aditya Pakki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191219175735.19231-1-pakki001@umn.edu --- kernel/bpf/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2ff01a716128..7622dfc36705 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, u32 pages, delta; int ret; - BUG_ON(fp_old == NULL); - size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) -- cgit From 0536b85239b8440735cdd910aae0eb076ebbb439 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:09:59 +0100 Subject: xdp: Simplify devmap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() waits for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in devmap can be simplified * There is no longer a need to flush in __dev_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in dev_map_free(). The rcu_barrier() is still needed, so that the map is not freed prior the elements. [1] https://lwn.net/Articles/777036/ Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-2-bjorn.topel@gmail.com --- kernel/bpf/devmap.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..b7595de6a91a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -201,7 +201,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -221,18 +221,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -345,8 +333,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -384,11 +371,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -409,7 +392,7 @@ void __dev_map_flush(struct bpf_map *map) rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -440,7 +423,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -509,27 +492,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) -{ - if (dev->dev->netdev_ops->ndo_xdp_xmit) { - struct xdp_bulk_queue *bq; - int cpu; - - rcu_read_lock(); - for_each_online_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(bq, XDP_XMIT_FLUSH, false); - } - rcu_read_unlock(); - } -} - static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(dev); free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); -- cgit From 4bc188c7f23a5a308d7f15dda1b6a286d74e8954 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:00 +0100 Subject: xdp: Simplify cpumap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() waits for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in cpumap can be simplified * There is no longer a need to flush in __cpu_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in cpu_map_free(). [1] https://lwn.net/Articles/777036/ Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-3-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef49e17ae47c..04c8eb11cd90 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -75,7 +75,7 @@ struct bpf_cpu_map { struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); +static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { @@ -399,7 +399,6 @@ free_rcu: static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; - int cpu; /* This cpu_map_entry have been disconnected from map and one * RCU graze-period have elapsed. Thus, XDP cannot queue any @@ -408,13 +407,6 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) */ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(bq, false); - } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ put_cpu_map_entry(rcpu); @@ -507,7 +499,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, @@ -522,18 +513,6 @@ static void cpu_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); - /* To ensure all pending flush operations have completed wait for flush - * list be empty on _all_ cpus. Because the above synchronize_rcu() - * ensures the map is disconnected from the program we can assume no new - * items will be added to the list. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - /* For cpu_map the remote CPUs can still be using the entries * (struct bpf_cpu_map_entry). */ @@ -599,7 +578,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -620,10 +599,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -646,7 +622,7 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -688,7 +664,7 @@ void __cpu_map_flush(struct bpf_map *map) struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); -- cgit From fb5aacdf3603ccbafe1da74eecd132eb4a31e53f Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:01 +0100 Subject: xdp: Fix graze->grace type-o in cpumap comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple spelling fix. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-4-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 04c8eb11cd90..f9deed659798 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -401,7 +401,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ @@ -428,7 +428,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) * percpu bulkq to queue. Due to caller map_delete_elem() disable * preemption, cannot call kthread_stop() to make sure queue is empty. * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before + * cpu_map_kthread_stop, which waits for an RCU grace period before * stopping kthread, emptying the queue. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, @@ -523,7 +523,7 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU graze-period */ + /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } free_percpu(cmap->flush_list); -- cgit From e312b9e706ed6d94f6cc9088fcd9fbd81de4525c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:02 +0100 Subject: xsk: Make xskmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all xskmaps, which simplifies __xsk_map_flush() and xsk_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-5-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } -- cgit From 96360004b8628541f5d05a845ea213267db0b1a2 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:03 +0100 Subject: xdp: Make devmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __dev_map_flush() and dev_map_init_map(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-6-bjorn.topel@gmail.com --- kernel/bpf/devmap.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b7595de6a91a..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -254,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -384,10 +371,9 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -419,7 +405,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -777,10 +763,15 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + int cpu; + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); return 0; } -- cgit From cdfafe98cabefeedbbc65af5c191c59745c03298 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:04 +0100 Subject: xdp: Make cpumap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpumap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __cpu_map_flush() and cpu_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-7-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f9deed659798..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -526,7 +517,6 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -618,7 +608,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -657,10 +647,9 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -670,3 +659,14 @@ void __cpu_map_flush(struct bpf_map *map) wake_up_process(bq->obj->kthread); } } + +static int __init cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); -- cgit From 1020c1f24a946e7d5d8a67db741b20efcd2cefc5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:33 -0800 Subject: bpf: Simplify __cgroup_bpf_attach __cgroup_bpf_attach has a lot of identical code to handle two scenarios: BPF_F_ALLOW_MULTI is set and unset. Simplify it by splitting the two main steps: * First, the decision is made whether a new bpf_prog_list entry should be allocated or existing entry should be reused for the new program. This decision is saved in replace_pl pointer; * Next, replace_pl pointer is used to handle both possible states of BPF_F_ALLOW_MULTI flag (set / unset) instead of doing similar work for them separately. This splitting, in turn, allows to make further simplifications: * The check for attaching same program twice in BPF_F_ALLOW_MULTI mode can be done before allocating cgroup storage, so that if user tries to attach same program twice no alloc/free happens as it was before; * pl_was_allocated becomes redundant so it's removed. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/c6193db6fe630797110b0d3ff06c125d093b834c.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 62 ++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f90d3c92bda..e8cbdd1be687 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -295,9 +295,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; - struct bpf_prog_list *pl; - bool pl_was_allocated; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -317,6 +316,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + } + } else if (!list_empty(progs)) { + replace_pl = list_first_entry(progs, typeof(*pl), node); + } + for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storage[stype])) { @@ -327,52 +336,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } } - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) { - /* disallow attaching the same prog twice */ - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -EINVAL; - } + if (replace_pl) { + pl = replace_pl; + old_prog = pl->prog; + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); } - + } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } - - pl_was_allocated = true; - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); - } else { - if (list_empty(progs)) { - pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - pl_was_allocated = true; - list_add_tail(&pl->node, progs); - } else { - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } - pl_was_allocated = false; - } - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; } + pl->prog = prog; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; + cgrp->bpf.flags[type] = flags; err = update_effective_progs(cgrp, type); @@ -401,7 +385,7 @@ cleanup: pl->storage[stype] = old_storage[stype]; bpf_cgroup_storage_link(old_storage[stype], cgrp, type); } - if (pl_was_allocated) { + if (!replace_pl) { list_del(&pl->node); kfree(pl); } -- cgit From 9fab329d6a04c0a52a84d207b5e0d83aeb660aa0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:34 -0800 Subject: bpf: Remove unused new_flags in hierarchy_allows_attach() new_flags is unused, remove it. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/2c49b30ab750f93cfef04a1e40b097d70c3a39a1.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e8cbdd1be687..283efe3ce052 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -103,8 +103,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type, - u32 new_flags) + enum bpf_attach_type type) { struct cgroup *p; @@ -303,7 +302,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, /* invalid combination */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type, flags)) + if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) -- cgit From 7dd68b3279f1792103d12e69933db3128c6d416e Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 18 Dec 2019 23:44:35 -0800 Subject: bpf: Support replacing cgroup-bpf program in MULTI mode The common use-case in production is to have multiple cgroup-bpf programs per attach type that cover multiple use-cases. Such programs are attached with BPF_F_ALLOW_MULTI and can be maintained by different people. Order of programs usually matters, for example imagine two egress programs: the first one drops packets and the second one counts packets. If they're swapped the result of counting program will be different. It brings operational challenges with updating cgroup-bpf program(s) attached with BPF_F_ALLOW_MULTI since there is no way to replace a program: * One way to update is to detach all programs first and then attach the new version(s) again in the right order. This introduces an interruption in the work a program is doing and may not be acceptable (e.g. if it's egress firewall); * Another way is attach the new version of a program first and only then detach the old version. This introduces the time interval when two versions of same program are working, what may not be acceptable if a program is not idempotent. It also imposes additional burden on program developers to make sure that two versions of their program can co-exist. Solve the problem by introducing a "replace" mode in BPF_PROG_ATTACH command for cgroup-bpf programs being attached with BPF_F_ALLOW_MULTI flag. This mode is enabled by newly introduced BPF_F_REPLACE attach flag and bpf_attr.replace_bpf_fd attribute to pass fd of the old program to replace That way user can replace any program among those attached with BPF_F_ALLOW_MULTI flag without the problems described above. Details of the new API: * If BPF_F_REPLACE is set but replace_bpf_fd doesn't have valid descriptor of BPF program, BPF_PROG_ATTACH will return corresponding error (EINVAL or EBADF). * If replace_bpf_fd has valid descriptor of BPF program but such a program is not attached to specified cgroup, BPF_PROG_ATTACH will return ENOENT. BPF_F_REPLACE is introduced to make the user intent clear, since replace_bpf_fd alone can't be used for this (its default value, 0, is a valid fd). BPF_F_REPLACE also makes it possible to extend the API in the future (e.g. add BPF_F_BEFORE and BPF_F_AFTER if needed). Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Narkyiko Link: https://lore.kernel.org/bpf/30cd850044a0057bdfcaaf154b7d2f39850ba813.1576741281.git.rdna@fb.com --- kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 4 ++-- kernel/cgroup/cgroup.c | 5 +++-- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 283efe3ce052..45346c79613a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -282,14 +282,17 @@ cleanup: * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * * Must be called with cgroup_mutex held. */ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_prog *replace_prog, enum bpf_attach_type type, u32 flags) { + u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], @@ -298,14 +301,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_cgroup_storage_type stype; int err; - if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || + ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -320,7 +324,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (pl->prog == prog) /* disallow attaching the same prog twice */ return -EINVAL; + if (pl->prog == replace_prog) + replace_pl = pl; } + if ((flags & BPF_F_REPLACE) && !replace_pl) + /* prog to replace not found for cgroup */ + return -ENOENT; } else if (!list_empty(progs)) { replace_pl = list_first_entry(progs, typeof(*pl), node); } @@ -356,7 +365,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, for_each_cgroup_storage_type(stype) pl->storage[stype] = storage[stype]; - cgrp->bpf.flags[type] = flags; + cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) @@ -522,6 +531,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { + struct bpf_prog *replace_prog = NULL; struct cgroup *cgrp; int ret; @@ -529,8 +539,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && + (attr->attach_flags & BPF_F_REPLACE)) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); + if (IS_ERR(replace_prog)) { + cgroup_put(cgrp); + return PTR_ERR(replace_prog); + } + } + + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, attr->attach_flags); + + if (replace_prog) + bpf_prog_put(replace_prog); cgroup_put(cgrp); return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b08c362f4e02..81ee8595dfee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2073,10 +2073,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd #define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) static int bpf_prog_attach(const union bpf_attr *attr) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..725365df066d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6288,12 +6288,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + struct bpf_prog *replace_prog, enum bpf_attach_type type, + u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } -- cgit From 53a23364b6b0c679a8ecfc48e74d652f18e3631f Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 19 Dec 2019 09:03:14 -0500 Subject: sched/core: Remove unused variable from set_user_nice() This commit left behind an unused variable: 5443a0be6121 ("sched: Use fair:prio_changed() instead of ad-hoc implementation") left behind an unused variable. kernel/sched/core.c: In function 'set_user_nice': kernel/sched/core.c:4507:16: warning: variable 'delta' set but not used int old_prio, delta; ^~~~~ Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5443a0be6121 ("sched: Use fair:prio_changed() instead of ad-hoc implementation") Link: https://lkml.kernel.org/r/20191219140314.1252-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15508c202bf5..1f6c094520e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4504,7 +4504,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio, delta; + int old_prio; struct rq_flags rf; struct rq *rq; @@ -4538,7 +4538,6 @@ void set_user_nice(struct task_struct *p, long nice) set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); - delta = p->prio - old_prio; if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); -- cgit From 17346452b25b98acfb395d2a82ec2e4ad0cb7a01 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Nov 2019 16:19:27 +0530 Subject: sched/fair: Make sched-idle CPU selection consistent throughout There are instances where we keep searching for an idle CPU despite already having a sched-idle CPU (in find_idlest_group_cpu(), select_idle_smt() and select_idle_cpu() and then there are places where we don't necessarily do that and return a sched-idle CPU as soon as we find one (in select_idle_sibling()). This looks a bit inconsistent and it may be worth having the same policy everywhere. On the other hand, choosing a sched-idle CPU over a idle one shall be beneficial from performance and power point of view as well, as we don't need to get the CPU online from a deep idle state which wastes quite a lot of time and energy and delays the scheduling of the newly woken up task. This patch tries to simplify code around sched-idle CPU selection and make it consistent throughout. Testing is done with the help of rt-app on hikey board (ARM64 octa-core, 2 clusters, 0-3 and 4-7). The cpufreq governor was set to performance to avoid any side affects from CPU frequency. Following are the tests performed: Test 1: 1-cfs-task: A single SCHED_NORMAL task is pinned to CPU5 which runs for 2333 us out of 7777 us (so gives time for the cluster to go in deep idle state). Test 2: 1-cfs-1-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and single SCHED_IDLE task is pinned on CPU6 (to make sure cluster 1 doesn't go in deep idle state). Test 3: 1-cfs-8-idle-task: A single SCHED_NORMAL task is pinned on CPU5 and eight SCHED_IDLE tasks are created which run forever (not pinned anywhere, so they run on all CPUs). Checked with kernelshark that as soon as NORMAL task sleeps, the SCHED_IDLE task starts running on CPU5. And here are the results on mean latency (in us), using the "st" tool. $ st 1-cfs-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 90 592 197180 307.134 109.906 $ st 1-cfs-1-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 642 67 311 113850 177.336 41.4251 $ st 1-cfs-8-idle-task/rt-app-cfs_thread-0.log N min max sum mean stddev 643 29 173 41364 64.3297 13.2344 The mean latency when we need to: - wakeup from deep idle state is 307 us. - wakeup from shallow idle state is 177 us. - preempt a SCHED_IDLE task is 64 us. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/b90cbcce608cef4e02a7bbfe178335f76d201bab.1573728344.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8da0222924cf..1f34fa9732d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5588,7 +5588,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1, si_cpu = -1; + int shallowest_idle_cpu = -1; int i; /* Check if we have any choice: */ @@ -5597,6 +5597,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5619,12 +5622,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { - if (sched_idle_cpu(i)) { - si_cpu = i; - continue; - } - + } else if (shallowest_idle_cpu == -1) { load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; @@ -5633,11 +5631,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - if (shallowest_idle_cpu != -1) - return shallowest_idle_cpu; - if (si_cpu != -1) - return si_cpu; - return least_loaded_cpu; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5790,7 +5784,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, int target) { - int cpu, si_cpu = -1; + int cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5798,13 +5792,11 @@ static int select_idle_smt(struct task_struct *p, int target) for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } - return si_cpu; + return -1; } #else /* CONFIG_SCHED_SMT */ @@ -5834,7 +5826,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int this = smp_processor_id(); - int cpu, nr = INT_MAX, si_cpu = -1; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5864,11 +5856,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) - return si_cpu; - if (available_idle_cpu(cpu)) + return -1; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; - if (si_cpu == -1 && sched_idle_cpu(cpu)) - si_cpu = cpu; } time = cpu_clock(this) - time; -- cgit From 59fe675248ffc37d4167e9ec6920a2f3d5ec67bb Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:47 +0000 Subject: sched/uclamp: Remove uclamp_util() The sole user of uclamp_util(), schedutil_cpu_util(), was made to use uclamp_util_with() instead in commit: af24bde8df20 ("sched/uclamp: Add uclamp support to energy_compute()") From then on, uclamp_util() has remained unused. Being a simple wrapper around uclamp_util_with(), we can get rid of it and win back a few lines. Tested-By: Dietmar Eggemann Suggested-by: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 280a3c735935..d9b24513d71d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2324,21 +2324,12 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } - -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return uclamp_util_with(rq, util, NULL); -} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, struct task_struct *p) { return util; } -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -{ - return util; -} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity -- cgit From 686516b55e98edf18c2a02d36aaaa6f4c0f6c39c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:48 +0000 Subject: sched/uclamp: Make uclamp util helpers use and return UL values Vincent pointed out recently that the canonical type for utilization values is 'unsigned long'. Internally uclamp uses 'unsigned int' values for cache optimization, but this doesn't have to be exported to its users. Make the uclamp helpers that deal with utilization use and return unsigned long values. Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/sched.h | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f6c094520e0..e7b08d52db93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -919,17 +919,17 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; /* Task currently refcounted: use back-annotated (effective) value */ if (p->uclamp[clamp_id].active) - return p->uclamp[clamp_id].value; + return (unsigned long)p->uclamp[clamp_id].value; uc_eff = uclamp_eff_get(p, clamp_id); - return uc_eff.value; + return (unsigned long)uc_eff.value; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d9b24513d71d..b478474ea847 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2300,14 +2300,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +unsigned long uclamp_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { - unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2325,8 +2325,8 @@ unsigned int uclamp_util_with(struct rq *rq, unsigned int util, return clamp(util, min_util, max_util); } #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, - struct task_struct *p) +static inline unsigned long uclamp_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } -- cgit From d2b58a286e89824900d501db0be1d4f6aed474fc Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:49 +0000 Subject: sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with() The current helper returns (CPU) rq utilization with uclamp restrictions taken into account. A uclamp task utilization helper would be quite helpful, but this requires some renaming. Prepare the code for the introduction of a uclamp_task_util() by renaming the existing uclamp_util_with() to uclamp_rq_util_with(). Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/sched.h | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9b8916fd00a2..7fbaee24c824 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -238,7 +238,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util_with(rq, util, p); + util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b478474ea847..1a88dc8ad11b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2303,8 +2303,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline -unsigned long uclamp_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); @@ -2325,8 +2325,9 @@ unsigned long uclamp_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } #else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) { return util; } -- cgit From a7008c07a568278ed2763436404752a98004c7ff Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:50 +0000 Subject: sched/fair: Make task_fits_capacity() consider uclamp restrictions task_fits_capacity() drives CPU selection at wakeup time, and is also used to detect misfit tasks. Right now it does so by comparing task_util_est() with a CPU's capacity, but doesn't take into account uclamp restrictions. There's a few interesting uses that can come out of doing this. For instance, a low uclamp.max value could prevent certain tasks from being flagged as misfit tasks, so they could merrily remain on low-capacity CPUs. Similarly, a high uclamp.min value would steer tasks towards high capacity CPUs at wakeup (and, should that fail, later steered via misfit balancing), so such "boosted" tasks would favor CPUs of higher capacity. Introduce uclamp_task_util() and make task_fits_capacity() use it. Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-5-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f34fa9732d8..26c59bc5b2ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3711,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -3822,7 +3836,7 @@ done: static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return fits_capacity(task_util_est(p), capacity); + return fits_capacity(uclamp_task_util(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) -- cgit From 1d42509e475cdc8542aa5b3e03a7e845244f4f57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Dec 2019 11:38:51 +0000 Subject: sched/fair: Make EAS wakeup placement consider uclamp restrictions task_fits_capacity() has just been made uclamp-aware, and find_energy_efficient_cpu() needs to go through the same treatment. Things are somewhat different here however - using the task max clamp isn't sufficient. Consider the following setup: The target runqueue, rq: rq.cpu_capacity_orig = 512 rq.cfs.avg.util_avg = 200 rq.uclamp.max = 768 // the max p.uclamp.max of all enqueued p's is 768 The waking task, p (not yet enqueued on rq): p.util_est = 600 p.uclamp.max = 100 Now, consider the following code which doesn't use the rq clamps: util = uclamp_task_util(p); // Does the task fit in the spare CPU capacity? cpu = cpu_of(rq); fits_capacity(util, cpu_capacity(cpu) - cpu_util(cpu)) This would lead to: util = 100; fits_capacity(100, 512 - 200) fits_capacity() would return true. However, enqueuing p on that CPU *will* cause it to become overutilized since rq clamp values are max-aggregated, so we'd remain with rq.uclamp.max = 768 which comes from the other tasks already enqueued on rq. Thus, we could select a high enough frequency to reach beyond 0.8 * 512 utilization (== overutilized) after enqueuing p on rq. What find_energy_efficient_cpu() needs here is uclamp_rq_util_with() which lets us peek at the future utilization landscape, including rq-wide uclamp values. Make find_energy_efficient_cpu() use uclamp_rq_util_with() for its fits_capacity() check. This is in line with what compute_energy() ends up using for estimating utilization. Tested-By: Dietmar Eggemann Suggested-by: Quentin Perret Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-6-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26c59bc5b2ed..2d170b5da0e3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6273,9 +6273,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); + spare_cap = cpu_cap - util; + + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). + */ + util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) continue; @@ -6290,7 +6299,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Find the CPU with the maximum spare capacity in * the performance domain */ - spare_cap = cpu_cap - util; if (spare_cap > max_spare_cap) { max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; -- cgit From 804d402fb6f6487b825aae8cf42fda6426c62867 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Wed, 9 Oct 2019 11:46:11 +0100 Subject: sched/rt: Make RT capacity-aware Capacity Awareness refers to the fact that on heterogeneous systems (like Arm big.LITTLE), the capacity of the CPUs is not uniform, hence when placing tasks we need to be aware of this difference of CPU capacities. In such scenarios we want to ensure that the selected CPU has enough capacity to meet the requirement of the running task. Enough capacity means here that capacity_orig_of(cpu) >= task.requirement. The definition of task.requirement is dependent on the scheduling class. For CFS, utilization is used to select a CPU that has >= capacity value than the cfs_task.util. capacity_orig_of(cpu) >= cfs_task.util DL isn't capacity aware at the moment but can make use of the bandwidth reservation to implement that in a similar manner CFS uses utilization. The following patchset implements that: https://lore.kernel.org/lkml/20190506044836.2914-1-luca.abeni@santannapisa.it/ capacity_orig_of(cpu)/SCHED_CAPACITY >= dl_deadline/dl_runtime For RT we don't have a per task utilization signal and we lack any information in general about what performance requirement the RT task needs. But with the introduction of uclamp, RT tasks can now control that by setting uclamp_min to guarantee a minimum performance point. ATM the uclamp value are only used for frequency selection; but on heterogeneous systems this is not enough and we need to ensure that the capacity of the CPU is >= uclamp_min. Which is what implemented here. capacity_orig_of(cpu) >= rt_task.uclamp_min Note that by default uclamp.min is 1024, which means that RT tasks will always be biased towards the big CPUs, which make for a better more predictable behavior for the default case. Must stress that the bias acts as a hint rather than a definite placement strategy. For example, if all big cores are busy executing other RT tasks we can't guarantee that a new RT task will be placed there. On non-heterogeneous systems the original behavior of RT should be retained. Similarly if uclamp is not selected in the config. [ mingo: Minor edits to comments. ] Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191009104611.15363-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 25 ++++++++++++++-- kernel/sched/cpupri.h | 4 ++- kernel/sched/rt.c | 83 +++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 94 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b7abca987d94..1a2719e1350a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -46,6 +46,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. By the time the call returns, the CPUs may have in @@ -57,7 +59,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -98,6 +101,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* @@ -108,7 +113,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 7dc20a3232e7..32dd520db11f 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -18,7 +18,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e591d40fd645..4043abe45459 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -437,6 +437,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1391,6 +1430,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1422,10 +1462,16 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + test = curr && + unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1449,15 +1495,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1601,7 +1647,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; @@ -1643,7 +1690,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ /* @@ -2147,12 +2195,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = !task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2224,7 +2274,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) -- cgit From e4add247789e4ba5e08ad8256183ce2e211877d4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 Jan 2020 23:42:24 +0900 Subject: kprobes: Fix optimize_kprobe()/unoptimize_kprobe() cancellation logic optimize_kprobe() and unoptimize_kprobe() cancels if a given kprobe is on the optimizing_list or unoptimizing_list already. However, since the following commit: f66c0447cca1 ("kprobes: Set unoptimized flag after unoptimizing code") modified the update timing of the KPROBE_FLAG_OPTIMIZED, it doesn't work as expected anymore. The optimized_kprobe could be in the following states: - [optimizing]: Before inserting jump instruction op.kp->flags has KPROBE_FLAG_OPTIMIZED and op->list is not empty. - [optimized]: jump inserted op.kp->flags has KPROBE_FLAG_OPTIMIZED and op->list is empty. - [unoptimizing]: Before removing jump instruction (including unused optprobe) op.kp->flags has KPROBE_FLAG_OPTIMIZED and op->list is not empty. - [unoptimized]: jump removed op.kp->flags doesn't have KPROBE_FLAG_OPTIMIZED and op->list is empty. Current code mis-expects [unoptimizing] state doesn't have KPROBE_FLAG_OPTIMIZED, and that can cause incorrect results. To fix this, introduce optprobe_queued_unopt() to distinguish [optimizing] and [unoptimizing] states and fixes the logic in optimize_kprobe() and unoptimize_kprobe(). [ mingo: Cleaned up the changelog and the code a bit. ] Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt (VMware) Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Fixes: f66c0447cca1 ("kprobes: Set unoptimized flag after unoptimizing code") Link: https://lkml.kernel.org/r/157840814418.7181.13478003006386303481.stgit@devnote2 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 67 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 34e28b236d68..2625c241ac00 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -612,6 +612,18 @@ void wait_for_kprobe_optimizer(void) mutex_unlock(&kprobe_mutex); } +static bool optprobe_queued_unopt(struct optimized_kprobe *op) +{ + struct optimized_kprobe *_op; + + list_for_each_entry(_op, &unoptimizing_list, list) { + if (op == _op) + return true; + } + + return false; +} + /* Optimize kprobe if p is ready to be optimized */ static void optimize_kprobe(struct kprobe *p) { @@ -633,17 +645,21 @@ static void optimize_kprobe(struct kprobe *p) return; /* Check if it is already optimized. */ - if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) { + if (optprobe_queued_unopt(op)) { + /* This is under unoptimizing. Just dequeue the probe */ + list_del_init(&op->list); + } return; + } op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - if (!list_empty(&op->list)) - /* This is under unoptimizing. Just dequeue the probe */ - list_del_init(&op->list); - else { - list_add(&op->list, &optimizing_list); - kick_kprobe_optimizer(); - } + /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */ + if (WARN_ON_ONCE(!list_empty(&op->list))) + return; + + list_add(&op->list, &optimizing_list); + kick_kprobe_optimizer(); } /* Short cut to direct unoptimizing */ @@ -665,30 +681,33 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) return; /* This is not an optprobe nor optimized */ op = container_of(p, struct optimized_kprobe, kp); - if (!kprobe_optimized(p)) { - /* Unoptimized or unoptimizing case */ - if (force && !list_empty(&op->list)) { - /* - * Only if this is unoptimizing kprobe and forced, - * forcibly unoptimize it. (No need to unoptimize - * unoptimized kprobe again :) - */ - list_del_init(&op->list); - force_unoptimize_kprobe(op); - } + if (!kprobe_optimized(p)) return; - } if (!list_empty(&op->list)) { - /* Dequeue from the optimization queue */ - list_del_init(&op->list); + if (optprobe_queued_unopt(op)) { + /* Queued in unoptimizing queue */ + if (force) { + /* + * Forcibly unoptimize the kprobe here, and queue it + * in the freeing list for release afterwards. + */ + force_unoptimize_kprobe(op); + list_move(&op->list, &freeing_list); + } + } else { + /* Dequeue from the optimizing queue */ + list_del_init(&op->list); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } return; } + /* Optimized kprobe case */ - if (force) + if (force) { /* Forcibly update the code: this is a special case */ force_unoptimize_kprobe(op); - else { + } else { list_add(&op->list, &unoptimizing_list); kick_kprobe_optimizer(); } -- cgit From 65726b5b7efae718391752ae8bb85b9843fd83f4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:34:54 -0800 Subject: bpf: Save PTR_TO_BTF_ID register state when spilling to stack This patch makes the verifier save the PTR_TO_BTF_ID register state when spilling to the stack. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003454.3854870-1-kafai@fb.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f63ae7a370c..d433d70022fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1916,6 +1916,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: return true; default: return false; -- cgit From 275517ff452a535da5eef25b1c22e53fc50b0a12 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:34:56 -0800 Subject: bpf: Avoid storing modifier to info->btf_id info->btf_id expects the btf_id of a struct, so it should store the final result after skipping modifiers (if any). It also takes this chanace to add a missing newline in one of the bpf_log() messages. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003456.3855176-1-kafai@fb.com --- kernel/bpf/btf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed2075884724..497ecf62d79d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3697,7 +3697,6 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; - info->btf_id = t->type; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); @@ -3708,10 +3707,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } } + + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ - while (btf_type_is_modifier(t)) + while (btf_type_is_modifier(t)) { + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3737,7 +3740,7 @@ int btf_struct_access(struct bpf_verifier_log *log, again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); if (!btf_type_is_struct(t)) { - bpf_log(log, "Type '%s' is not a struct", tname); + bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } -- cgit From 218b3f65f9081f5e1bffe6de5f0f4b22c935410b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:34:59 -0800 Subject: bpf: Add enum support to btf_ctx_access() It allows bpf prog (e.g. tracing) to attach to a kernel function that takes enum argument. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003459.3855366-1-kafai@fb.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 497ecf62d79d..6a5ccb748a72 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3677,7 +3677,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t)) + if (btf_type_is_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { -- cgit From 976aba002fcb4b1baa71e4b0854f3d4ae48c1d4d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:01 -0800 Subject: bpf: Support bitfield read access in btf_struct_access This patch allows bitfield access as a scalar. It checks "off + size > t->size" to avoid accessing bitfield end up accessing beyond the struct. This check is done outside of the loop since it is applicable to all access. It also takes this chance to break early on the "off < moff" case. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003501.3855427-1-kafai@fb.com --- kernel/bpf/btf.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6a5ccb748a72..48bbde2e1c1e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3744,19 +3744,53 @@ again: return -EINVAL; } - for_each_member(i, t, member) { - if (btf_member_bitfield_size(t, member)) - /* bitfields are not supported yet */ - continue; + if (off + size > t->size) { + bpf_log(log, "access beyond struct %s at off %u size %u\n", + tname, off, size); + return -EACCES; + } + for_each_member(i, t, member) { /* offset of the field in bytes */ moff = btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; + + if (btf_member_bitfield_size(t, member)) { + u32 end_bit = btf_member_bit_offset(t, member) + + btf_member_bitfield_size(t, member); + + /* off <= moff instead of off == moff because clang + * does not generate a BTF member for anonymous + * bitfield like the ":16" here: + * struct { + * int :16; + * int x:8; + * }; + */ + if (off <= moff && + BITS_ROUNDUP_BYTES(end_bit) <= off + size) + return SCALAR_VALUE; + + /* off may be accessing a following member + * + * or + * + * Doing partial access at either end of this + * bitfield. Continue on this case also to + * treat it as not accessing this bitfield + * and eventually error out as field not + * found to keep it simple. + * It could be relaxed if there was a legit + * partial access case later. + */ + continue; + } + /* In case of "off" is pointing to holes of a struct */ if (off < moff) - continue; + break; /* type of the field */ mtype = btf_type_by_id(btf_vmlinux, member->type); -- cgit From 27ae7997a66174cb8afd6a75b3989f5e0c1b9e5a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:03 -0800 Subject: bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS This patch allows the kernel's struct ops (i.e. func ptr) to be implemented in BPF. The first use case in this series is the "struct tcp_congestion_ops" which will be introduced in a latter patch. This patch introduces a new prog type BPF_PROG_TYPE_STRUCT_OPS. The BPF_PROG_TYPE_STRUCT_OPS prog is verified against a particular func ptr of a kernel struct. The attr->attach_btf_id is the btf id of a kernel struct. The attr->expected_attach_type is the member "index" of that kernel struct. The first member of a struct starts with member index 0. That will avoid ambiguity when a kernel struct has multiple func ptrs with the same func signature. For example, a BPF_PROG_TYPE_STRUCT_OPS prog is written to implement the "init" func ptr of the "struct tcp_congestion_ops". The attr->attach_btf_id is the btf id of the "struct tcp_congestion_ops" of the _running_ kernel. The attr->expected_attach_type is 3. The ctx of BPF_PROG_TYPE_STRUCT_OPS is an array of u64 args saved by arch_prepare_bpf_trampoline that will be done in the next patch when introducing BPF_MAP_TYPE_STRUCT_OPS. "struct bpf_struct_ops" is introduced as a common interface for the kernel struct that supports BPF_PROG_TYPE_STRUCT_OPS prog. The supporting kernel struct will need to implement an instance of the "struct bpf_struct_ops". The supporting kernel struct also needs to implement a bpf_verifier_ops. During BPF_PROG_LOAD, bpf_struct_ops_find() will find the right bpf_verifier_ops by searching the attr->attach_btf_id. A new "btf_struct_access" is also added to the bpf_verifier_ops such that the supporting kernel struct can optionally provide its own specific check on accessing the func arg (e.g. provide limited write access). After btf_vmlinux is parsed, the new bpf_struct_ops_init() is called to initialize some values (e.g. the btf id of the supporting kernel struct) and it can only be done once the btf_vmlinux is available. The R0 checks at BPF_EXIT is excluded for the BPF_PROG_TYPE_STRUCT_OPS prog if the return type of the prog->aux->attach_func_proto is "void". Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003503.3855825-1-kafai@fb.com --- kernel/bpf/Makefile | 3 + kernel/bpf/bpf_struct_ops.c | 121 ++++++++++++++++++++++++++++++++++ kernel/bpf/bpf_struct_ops_types.h | 4 ++ kernel/bpf/btf.c | 88 +++++++++++++++++-------- kernel/bpf/syscall.c | 17 +++-- kernel/bpf/verifier.c | 134 +++++++++++++++++++++++++++++--------- 6 files changed, 304 insertions(+), 63 deletions(-) create mode 100644 kernel/bpf/bpf_struct_ops.c create mode 100644 kernel/bpf/bpf_struct_ops_types.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d4f330351f87..046ce5d98033 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -27,3 +27,6 @@ endif ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +endif diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c new file mode 100644 index 000000000000..2ea68fe34c33 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define BPF_STRUCT_OPS_TYPE(_name) \ +extern struct bpf_struct_ops bpf_##_name; +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + +enum { +#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + __NR_BPF_STRUCT_OPS_TYPE, +}; + +static struct bpf_struct_ops * const bpf_struct_ops[] = { +#define BPF_STRUCT_OPS_TYPE(_name) \ + [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE +}; + +const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { +}; + +const struct bpf_prog_ops bpf_struct_ops_prog_ops = { +}; + +void bpf_struct_ops_init(struct btf *btf) +{ + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_verifier_log log = {}; + const struct btf_type *t; + const char *mname; + s32 type_id; + u32 i, j; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + st_ops = bpf_struct_ops[i]; + + type_id = btf_find_by_name_kind(btf, st_ops->name, + BTF_KIND_STRUCT); + if (type_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + st_ops->name); + continue; + } + t = btf_type_by_id(btf, type_id); + if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { + pr_warn("Cannot support #%u members in struct %s\n", + btf_type_vlen(t), st_ops->name); + continue; + } + + for_each_member(j, t, member) { + const struct btf_type *func_proto; + + mname = btf_name_by_offset(btf, member->name_off); + if (!*mname) { + pr_warn("anon member in struct %s is not supported\n", + st_ops->name); + break; + } + + if (btf_member_bitfield_size(t, member)) { + pr_warn("bit field member %s in struct %s is not supported\n", + mname, st_ops->name); + break; + } + + func_proto = btf_type_resolve_func_ptr(btf, + member->type, + NULL); + if (func_proto && + btf_distill_func_proto(&log, btf, + func_proto, mname, + &st_ops->func_models[j])) { + pr_warn("Error in parsing func ptr %s in struct %s\n", + mname, st_ops->name); + break; + } + } + + if (j == btf_type_vlen(t)) { + if (st_ops->init(btf)) { + pr_warn("Error in init bpf_struct_ops %s\n", + st_ops->name); + } else { + st_ops->type_id = type_id; + st_ops->type = t; + } + } + } +} + +extern struct btf *btf_vmlinux; + +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +{ + unsigned int i; + + if (!type_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->type_id == type_id) + return bpf_struct_ops[i]; + } + + return NULL; +} diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h new file mode 100644 index 000000000000..7bb13ff49ec2 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* internal file - do not include directly */ + +/* To be filled in a later patch */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 48bbde2e1c1e..12af4a1bb1a4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -180,11 +180,6 @@ */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -#define for_each_member(i, struct_type, member) \ - for (i = 0, member = btf_type_member(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -382,6 +377,65 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) +{ + const struct btf_type *t; + const char *tname; + u32 i; + + for (i = 1; i <= btf->nr_types; i++) { + t = btf->types[i]; + if (BTF_INFO_KIND(t->info) != kind) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, name)) + return i; + } + + return -ENOENT; +} + +const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t)) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + if (res_id) + *res_id = id; + + return t; +} + +const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, id, NULL); + if (!btf_type_is_ptr(t)) + return NULL; + + return btf_type_skip_modifiers(btf, t->type, res_id); +} + +const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *ptype; + + ptype = btf_type_resolve_ptr(btf, id, res_id); + if (ptype && btf_type_is_func_proto(ptype)) + return ptype; + + return NULL; +} + /* Types that act only as a source, not sink or intermediate * type when resolving. */ @@ -446,16 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u16 btf_type_vlen(const struct btf_type *t) -{ - return BTF_INFO_VLEN(t->info); -} - -static bool btf_type_kflag(const struct btf_type *t) -{ - return BTF_INFO_KFLAG(t->info); -} - static u32 btf_member_bit_offset(const struct btf_type *struct_type, const struct btf_member *member) { @@ -463,13 +507,6 @@ static u32 btf_member_bit_offset(const struct btf_type *struct_type, : member->offset; } -static u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) - : 0; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -480,11 +517,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t) return (const struct btf_array *)(t + 1); } -static const struct btf_member *btf_type_member(const struct btf_type *t) -{ - return (const struct btf_member *)(t + 1); -} - static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); @@ -3605,6 +3637,8 @@ struct btf *btf_parse_vmlinux(void) goto errout; } + bpf_struct_ops_init(btf); + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 81ee8595dfee..03a02ef4c496 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1672,17 +1672,22 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id, u32 prog_fd) { - switch (prog_type) { - case BPF_PROG_TYPE_TRACING: + if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; - break; - default: - if (btf_id || prog_fd) + + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_STRUCT_OPS: + break; + default: return -EINVAL; - break; + } } + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d433d70022fd..586ed3c94b80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2859,11 +2859,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; - if (atype != BPF_READ) { - verbose(env, "only read is supported\n"); - return -EACCES; - } - if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -2880,17 +2875,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (env->ops->btf_struct_access) { + ret = env->ops->btf_struct_access(&env->log, t, off, size, + atype, &btf_id); + } else { + if (atype != BPF_READ) { + verbose(env, "only read is supported\n"); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, + &btf_id); + } + if (ret < 0) return ret; - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; + if (atype == BPF_READ) { + if (ret == SCALAR_VALUE) { + mark_reg_unknown(env, regs, value_regno); + return 0; + } + mark_reg_known_zero(env, regs, value_regno); + regs[value_regno].type = PTR_TO_BTF_ID; + regs[value_regno].btf_id = btf_id; } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + return 0; } @@ -6349,8 +6359,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { struct tnum enforce_attach_type_range = tnum_unknown; + const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + int err; + + /* The struct_ops func-ptr's return type could be "void" */ + if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + !prog->aux->attach_func_proto->type) + return 0; + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + + if (is_pointer_value(env, BPF_REG_0)) { + verbose(env, "R0 leaks addr as return value\n"); + return -EACCES; + } switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -8016,21 +8048,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - /* eBPF calling convetion is such that R0 is used - * to return the value from eBPF program. - * Make sure that it's readable at this time - * of bpf_exit, which means that program wrote - * something into it earlier - */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); - if (err) - return err; - - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); - return -EACCES; - } - err = check_return_code(env); if (err) return err; @@ -8829,12 +8846,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: - if (type == BPF_WRITE) { + if (type == BPF_READ) { + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; + } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } - insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); - env->prog->aux->num_exentries++; continue; default: continue; @@ -9502,6 +9521,58 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_struct_ops_btf_id(struct bpf_verifier_env *env) +{ + const struct btf_type *t, *func_proto; + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + struct bpf_prog *prog = env->prog; + u32 btf_id, member_idx; + const char *mname; + + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) { + verbose(env, "attach_btf_id %u is not a supported struct\n", + btf_id); + return -ENOTSUPP; + } + + t = st_ops->type; + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) { + verbose(env, "attach to invalid member idx %u of struct %s\n", + member_idx, st_ops->name); + return -EINVAL; + } + + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type, + NULL); + if (!func_proto) { + verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n", + mname, member_idx, st_ops->name); + return -EINVAL; + } + + if (st_ops->check_member) { + int err = st_ops->check_member(t, member); + + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + } + + prog->aux->attach_func_proto = func_proto; + prog->aux->attach_func_name = mname; + env->ops = st_ops->verifier_ops; + + return 0; +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -9517,6 +9588,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + if (prog->type != BPF_PROG_TYPE_TRACING) return 0; -- cgit From 85d33df357b634649ddbe0a20fd2d0fc5732c3cb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:05 -0800 Subject: bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value is a kernel struct with its func ptr implemented in bpf prog. This new map is the interface to register/unregister/introspect a bpf implemented kernel struct. The kernel struct is actually embedded inside another new struct (or called the "value" struct in the code). For example, "struct tcp_congestion_ops" is embbeded in: struct bpf_struct_ops_tcp_congestion_ops { refcount_t refcnt; enum bpf_struct_ops_state state; struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */ } The map value is "struct bpf_struct_ops_tcp_congestion_ops". The "bpftool map dump" will then be able to show the state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g. number of tcp_sock in the tcp_congestion_ops case). This "value" struct is created automatically by a macro. Having a separate "value" struct will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding "void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some initialization works before registering the struct_ops to the kernel subsystem). The libbpf will take care of finding and populating the "struct bpf_struct_ops_XYZ" from "struct XYZ". Register a struct_ops to a kernel subsystem: 1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s) 2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the running kernel. Instead of reusing the attr->btf_value_type_id, btf_vmlinux_value_type_id s added such that attr->btf_fd can still be used as the "user" btf which could store other useful sysadmin/debug info that may be introduced in the furture, e.g. creation-date/compiler-details/map-creator...etc. 3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described in the running kernel btf. Populate the value of this object. The function ptr should be populated with the prog fds. 4. Call BPF_MAP_UPDATE with the object created in (3) as the map value. The key is always "0". During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's args as an array of u64 is generated. BPF_MAP_UPDATE also allows the specific struct_ops to do some final checks in "st_ops->init_member()" (e.g. ensure all mandatory func ptrs are implemented). If everything looks good, it will register this kernel struct to the kernel subsystem. The map will not allow further update from this point. Unregister a struct_ops from the kernel subsystem: BPF_MAP_DELETE with key "0". Introspect a struct_ops: BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will have the prog _id_ populated as the func ptr. The map value state (enum bpf_struct_ops_state) will transit from: INIT (map created) => INUSE (map updated, i.e. reg) => TOBEFREE (map value deleted, i.e. unreg) The kernel subsystem needs to call bpf_struct_ops_get() and bpf_struct_ops_put() to manage the "refcnt" in the "struct bpf_struct_ops_XYZ". This patch uses a separate refcnt for the purose of tracking the subsystem usage. Another approach is to reuse the map->refcnt and then "show" (i.e. during map_lookup) the subsystem's usage by doing map->refcnt - map->usercnt to filter out the map-fd/pinned-map usage. However, that will also tie down the future semantics of map->refcnt and map->usercnt. The very first subsystem's refcnt (during reg()) holds one count to map->refcnt. When the very last subsystem's refcnt is gone, it will also release the map->refcnt. All bpf_prog will be freed when the map->refcnt reaches 0 (i.e. during map_free()). Here is how the bpftool map command will look like: [root@arch-fb-vm1 bpf]# bpftool map show 6: struct_ops name dctcp flags 0x0 key 4B value 256B max_entries 1 memlock 4096B btf_id 6 [root@arch-fb-vm1 bpf]# bpftool map dump id 6 [{ "value": { "refcnt": { "refs": { "counter": 1 } }, "state": 1, "data": { "list": { "next": 0, "prev": 0 }, "key": 0, "flags": 2, "init": 24, "release": 0, "ssthresh": 25, "cong_avoid": 30, "set_state": 27, "cwnd_event": 28, "in_ack_event": 26, "undo_cwnd": 29, "pkts_acked": 0, "min_tso_segs": 0, "sndbuf_expand": 0, "cong_control": 0, "get_info": 0, "name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0 ], "owner": 0 } } } ] Misc Notes: * bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup. It does an inplace update on "*value" instead returning a pointer to syscall.c. Otherwise, it needs a separate copy of "zero" value for the BPF_STRUCT_OPS_STATE_INIT to avoid races. * The bpf_struct_ops_map_delete_elem() is also called without preempt_disable() from map_delete_elem(). It is because the "->unreg()" may requires sleepable context, e.g. the "tcp_unregister_congestion_control()". * "const" is added to some of the existing "struct btf_func_model *" function arg to avoid a compiler warning caused by this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 511 +++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/btf.c | 20 +- kernel/bpf/map_in_map.c | 3 +- kernel/bpf/syscall.c | 52 +++-- kernel/bpf/trampoline.c | 8 +- kernel/bpf/verifier.c | 5 + 6 files changed, 565 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 2ea68fe34c33..ddf48f49914b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -9,9 +9,68 @@ #include #include #include +#include +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE, +}; + +#define BPF_STRUCT_OPS_COMMON_VALUE \ + refcount_t refcnt; \ + enum bpf_struct_ops_state state + +struct bpf_struct_ops_value { + BPF_STRUCT_OPS_COMMON_VALUE; + char data[0] ____cacheline_aligned_in_smp; +}; + +struct bpf_struct_ops_map { + struct bpf_map map; + const struct bpf_struct_ops *st_ops; + /* protect map_update */ + struct mutex lock; + /* progs has all the bpf_prog that is populated + * to the func ptr of the kernel's struct + * (in kvalue.data). + */ + struct bpf_prog **progs; + /* image is a page that has all the trampolines + * that stores the func args before calling the bpf_prog. + * A PAGE_SIZE "image" is enough to store all trampoline for + * "progs[]". + */ + void *image; + /* uvalue->data stores the kernel struct + * (e.g. tcp_congestion_ops) that is more useful + * to userspace than the kvalue. For example, + * the bpf_prog's id is stored instead of the kernel + * address of a func ptr. + */ + struct bpf_struct_ops_value *uvalue; + /* kvalue.data stores the actual kernel's struct + * (e.g. tcp_congestion_ops) that will be + * registered to the kernel subsystem. + */ + struct bpf_struct_ops_value kvalue; +}; + +#define VALUE_PREFIX "bpf_struct_ops_" +#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) + +/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is + * the map's value exposed to the userspace and its btf-type-id is + * stored at the map->btf_vmlinux_value_type_id. + * + */ #define BPF_STRUCT_OPS_TYPE(_name) \ -extern struct bpf_struct_ops bpf_##_name; +extern struct bpf_struct_ops bpf_##_name; \ + \ +struct bpf_struct_ops_##_name { \ + BPF_STRUCT_OPS_COMMON_VALUE; \ + struct _name data ____cacheline_aligned_in_smp; \ +}; #include "bpf_struct_ops_types.h" #undef BPF_STRUCT_OPS_TYPE @@ -35,19 +94,50 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { const struct bpf_prog_ops bpf_struct_ops_prog_ops = { }; +static const struct btf_type *module_type; + void bpf_struct_ops_init(struct btf *btf) { + s32 type_id, value_id, module_id; const struct btf_member *member; struct bpf_struct_ops *st_ops; struct bpf_verifier_log log = {}; const struct btf_type *t; + char value_name[128]; const char *mname; - s32 type_id; u32 i, j; + /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ +#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + + module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); + if (module_id < 0) { + pr_warn("Cannot find struct module in btf_vmlinux\n"); + return; + } + module_type = btf_type_by_id(btf, module_id); + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { st_ops = bpf_struct_ops[i]; + if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= + sizeof(value_name)) { + pr_warn("struct_ops name %s is too long\n", + st_ops->name); + continue; + } + sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); + + value_id = btf_find_by_name_kind(btf, value_name, + BTF_KIND_STRUCT); + if (value_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + value_name); + continue; + } + type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT); if (type_id < 0) { @@ -98,6 +188,9 @@ void bpf_struct_ops_init(struct btf *btf) } else { st_ops->type_id = type_id; st_ops->type = t; + st_ops->value_id = value_id; + st_ops->value_type = btf_type_by_id(btf, + value_id); } } } @@ -105,6 +198,22 @@ void bpf_struct_ops_init(struct btf *btf) extern struct btf *btf_vmlinux; +static const struct bpf_struct_ops * +bpf_struct_ops_find_value(u32 value_id) +{ + unsigned int i; + + if (!value_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->value_id == value_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { unsigned int i; @@ -119,3 +228,401 @@ const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) return NULL; } + +static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + if (key && *(u32 *)key == 0) + return -ENOENT; + + *(u32 *)next_key = 0; + return 0; +} + +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + struct bpf_struct_ops_value *uvalue, *kvalue; + enum bpf_struct_ops_state state; + + if (unlikely(*(u32 *)key != 0)) + return -ENOENT; + + kvalue = &st_map->kvalue; + /* Pair with smp_store_release() during map_update */ + state = smp_load_acquire(&kvalue->state); + if (state == BPF_STRUCT_OPS_STATE_INIT) { + memset(value, 0, map->value_size); + return 0; + } + + /* No lock is needed. state and refcnt do not need + * to be updated together under atomic context. + */ + uvalue = (struct bpf_struct_ops_value *)value; + memcpy(uvalue, st_map->uvalue, map->value_size); + uvalue->state = state; + refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + return 0; +} + +static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) +{ + const struct btf_type *t = st_map->st_ops->type; + u32 i; + + for (i = 0; i < btf_type_vlen(t); i++) { + if (st_map->progs[i]) { + bpf_prog_put(st_map->progs[i]); + st_map->progs[i] = NULL; + } + } +} + +static int check_zero_holes(const struct btf_type *t, void *data) +{ + const struct btf_member *member; + u32 i, moff, msize, prev_mend = 0; + const struct btf_type *mtype; + + for_each_member(i, t, member) { + moff = btf_member_bit_offset(t, member) / 8; + if (moff > prev_mend && + memchr_inv(data + prev_mend, 0, moff - prev_mend)) + return -EINVAL; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) + return PTR_ERR(mtype); + prev_mend = moff + msize; + } + + if (t->size > prev_mend && + memchr_inv(data + prev_mend, 0, t->size - prev_mend)) + return -EINVAL; + + return 0; +} + +static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + const struct bpf_struct_ops *st_ops = st_map->st_ops; + struct bpf_struct_ops_value *uvalue, *kvalue; + const struct btf_member *member; + const struct btf_type *t = st_ops->type; + void *udata, *kdata; + int prog_fd, err = 0; + void *image; + u32 i; + + if (flags) + return -EINVAL; + + if (*(u32 *)key != 0) + return -E2BIG; + + err = check_zero_holes(st_ops->value_type, value); + if (err) + return err; + + uvalue = (struct bpf_struct_ops_value *)value; + err = check_zero_holes(t, uvalue->data); + if (err) + return err; + + if (uvalue->state || refcount_read(&uvalue->refcnt)) + return -EINVAL; + + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; + + mutex_lock(&st_map->lock); + + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + err = -EBUSY; + goto unlock; + } + + memcpy(uvalue, value, map->value_size); + + udata = &uvalue->data; + kdata = &kvalue->data; + image = st_map->image; + + for_each_member(i, t, member) { + const struct btf_type *mtype, *ptype; + struct bpf_prog *prog; + u32 moff; + + moff = btf_member_bit_offset(t, member) / 8; + ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); + if (ptype == module_type) { + if (*(void **)(udata + moff)) + goto reset_unlock; + *(void **)(kdata + moff) = BPF_MODULE_OWNER; + continue; + } + + err = st_ops->init_member(t, member, kdata, udata); + if (err < 0) + goto reset_unlock; + + /* The ->init_member() has handled this member */ + if (err > 0) + continue; + + /* If st_ops->init_member does not handle it, + * we will only handle func ptrs and zero-ed members + * here. Reject everything else. + */ + + /* All non func ptr member must be 0 */ + if (!ptype || !btf_type_is_func_proto(ptype)) { + u32 msize; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) { + err = PTR_ERR(mtype); + goto reset_unlock; + } + + if (memchr_inv(udata + moff, 0, msize)) { + err = -EINVAL; + goto reset_unlock; + } + + continue; + } + + prog_fd = (int)(*(unsigned long *)(udata + moff)); + /* Similar check as the attr->attach_prog_fd */ + if (!prog_fd) + continue; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto reset_unlock; + } + st_map->progs[i] = prog; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || + prog->aux->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != i) { + err = -EINVAL; + goto reset_unlock; + } + + err = arch_prepare_bpf_trampoline(image, + st_map->image + PAGE_SIZE, + &st_ops->func_models[i], 0, + &prog, 1, NULL, 0, NULL); + if (err < 0) + goto reset_unlock; + + *(void **)(kdata + moff) = image; + image += err; + + /* put prog_id to udata */ + *(unsigned long *)(udata + moff) = prog->aux->id; + } + + refcount_set(&kvalue->refcnt, 1); + bpf_map_inc(map); + + set_memory_ro((long)st_map->image, 1); + set_memory_x((long)st_map->image, 1); + err = st_ops->reg(kdata); + if (likely(!err)) { + /* Pair with smp_load_acquire() during lookup_elem(). + * It ensures the above udata updates (e.g. prog->aux->id) + * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + goto unlock; + } + + /* Error during st_ops->reg(). It is very unlikely since + * the above init_member() should have caught it earlier + * before reg(). The only possibility is if there was a race + * in registering the struct_ops (under the same name) to + * a sub-system through different struct_ops's maps. + */ + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + bpf_map_put(map); + +reset_unlock: + bpf_struct_ops_map_put_progs(st_map); + memset(uvalue, 0, map->value_size); + memset(kvalue, 0, map->value_size); +unlock: + mutex_unlock(&st_map->lock); + return err; +} + +static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +{ + enum bpf_struct_ops_state prev_state; + struct bpf_struct_ops_map *st_map; + + st_map = (struct bpf_struct_ops_map *)map; + prev_state = cmpxchg(&st_map->kvalue.state, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE); + if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + st_map->st_ops->unreg(&st_map->kvalue.data); + if (refcount_dec_and_test(&st_map->kvalue.refcnt)) + bpf_map_put(map); + } + + return 0; +} + +static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + value = bpf_struct_ops_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); +} + +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + if (st_map->progs) + bpf_struct_ops_map_put_progs(st_map); + bpf_map_area_free(st_map->progs); + bpf_jit_free_exec(st_map->image); + bpf_map_area_free(st_map->uvalue); + bpf_map_area_free(st_map); +} + +static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) +{ + if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || + attr->map_flags || !attr->btf_vmlinux_value_type_id) + return -EINVAL; + return 0; +} + +static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) +{ + const struct bpf_struct_ops *st_ops; + size_t map_total_size, st_map_size; + struct bpf_struct_ops_map *st_map; + const struct btf_type *t, *vt; + struct bpf_map_memory mem; + struct bpf_map *map; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); + if (!st_ops) + return ERR_PTR(-ENOTSUPP); + + vt = st_ops->value_type; + if (attr->value_size != vt->size) + return ERR_PTR(-EINVAL); + + t = st_ops->type; + + st_map_size = sizeof(*st_map) + + /* kvalue stores the + * struct bpf_struct_ops_tcp_congestions_ops + */ + (vt->size - sizeof(struct bpf_struct_ops_value)); + map_total_size = st_map_size + + /* uvalue */ + sizeof(vt->size) + + /* struct bpf_progs **progs */ + btf_type_vlen(t) * sizeof(struct bpf_prog *); + err = bpf_map_charge_init(&mem, map_total_size); + if (err < 0) + return ERR_PTR(err); + + st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); + if (!st_map) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + st_map->st_ops = st_ops; + map = &st_map->map; + + st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); + st_map->progs = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + NUMA_NO_NODE); + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->uvalue || !st_map->progs || !st_map->image) { + bpf_struct_ops_map_free(map); + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&st_map->lock); + set_vm_flush_reset_perms(st_map->image); + bpf_map_init_from_attr(map, attr); + bpf_map_charge_move(&map->memory, &mem); + + return map; +} + +const struct bpf_map_ops bpf_struct_ops_map_ops = { + .map_alloc_check = bpf_struct_ops_map_alloc_check, + .map_alloc = bpf_struct_ops_map_alloc, + .map_free = bpf_struct_ops_map_free, + .map_get_next_key = bpf_struct_ops_map_get_next_key, + .map_lookup_elem = bpf_struct_ops_map_lookup_elem, + .map_delete_elem = bpf_struct_ops_map_delete_elem, + .map_update_elem = bpf_struct_ops_map_update_elem, + .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, +}; + +/* "const void *" because some subsystem is + * passing a const (e.g. const struct tcp_congestion_ops *) + */ +bool bpf_struct_ops_get(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + + return refcount_inc_not_zero(&kvalue->refcnt); +} + +void bpf_struct_ops_put(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + if (refcount_dec_and_test(&kvalue->refcnt)) { + struct bpf_struct_ops_map *st_map; + + st_map = container_of(kvalue, struct bpf_struct_ops_map, + kvalue); + bpf_map_put(&st_map->map); + } +} diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 12af4a1bb1a4..81d9cf75cacd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -500,13 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) - : member->offset; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -1089,7 +1082,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *total_nelems: 1 */ -static const struct btf_type * +const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *total_nelems) @@ -1143,8 +1136,10 @@ resolved: return ERR_PTR(-EINVAL); *type_size = nelems * size; - *total_nelems = nelems; - *elem_type = type; + if (total_nelems) + *total_nelems = nelems; + if (elem_type) + *elem_type = type; return array_type ? : type; } @@ -1858,7 +1853,10 @@ static void btf_modifier_seq_show(const struct btf *btf, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) { - t = btf_type_id_resolve(btf, &type_id); + if (btf->resolved_ids) + t = btf_type_id_resolve(btf, &type_id); + else + t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5e9366b33f0f..b3c48d1533cb 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 03a02ef4c496..f9db72a96ec0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -628,7 +628,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -642,6 +642,14 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->btf_vmlinux_value_type_id) { + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || + attr->btf_key_type_id || attr->btf_value_type_id) + return -EINVAL; + } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + return -EINVAL; + } + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -664,32 +672,35 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - if (attr->btf_key_type_id || attr->btf_value_type_id) { + map->spin_lock_off = -EINVAL; + if (attr->btf_key_type_id || attr->btf_value_type_id || + /* Even the map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. + */ + attr->btf_vmlinux_value_type_id) { struct btf *btf; - if (!attr->btf_value_type_id) { - err = -EINVAL; - goto free_map; - } - btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } + map->btf = btf; - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map; + if (attr->btf_value_type_id) { + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) + goto free_map; } - map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; + map->btf_vmlinux_value_type_id = + attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); @@ -888,6 +899,9 @@ static int map_lookup_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -1003,7 +1017,8 @@ static int map_update_elem(union bpf_attr *attr) goto out; } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP) { + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } else if (IS_FD_PROG_ARRAY(map)) { @@ -1092,7 +1107,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; - } else if (IS_FD_PROG_ARRAY(map)) { + } else if (IS_FD_PROG_ARRAY(map) || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } @@ -2822,6 +2839,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } + info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 505f4e4b31d2..79a04417050d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -160,11 +160,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, tr->func.addr); - if (err) + if (err < 0) goto out; if (tr->selector) @@ -296,7 +297,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) } int __weak -arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 586ed3c94b80..f5af759a8a5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8155,6 +8155,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + verbose(env, "bpf_struct_ops map cannot be used in prog\n"); + return -EINVAL; + } + return 0; } -- cgit From 0baf26b0fcd74bbfcef53c5d5e8bad2b99c8d0d2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:08 -0800 Subject: bpf: tcp: Support tcp_congestion_ops in bpf This patch makes "struct tcp_congestion_ops" to be the first user of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops in bpf. The BPF implemented tcp_congestion_ops can be used like regular kernel tcp-cc through sysctl and setsockopt. e.g. [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic net.ipv4.tcp_congestion_control = bpf_cubic There has been attempt to move the TCP CC to the user space (e.g. CCP in TCP). The common arguments are faster turn around, get away from long-tail kernel versions in production...etc, which are legit points. BPF has been the continuous effort to join both kernel and userspace upsides together (e.g. XDP to gain the performance advantage without bypassing the kernel). The recent BPF advancements (in particular BTF-aware verifier, BPF trampoline, BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc) possible in BPF. It allows a faster turnaround for testing algorithm in the production while leveraging the existing (and continue growing) BPF feature/framework instead of building one specifically for userspace TCP CC. This patch allows write access to a few fields in tcp-sock (in bpf_tcp_ca_btf_struct_access()). The optional "get_info" is unsupported now. It can be added later. One possible way is to output the info with a btf-id to describe the content. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003508.3856115-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 7bb13ff49ec2..066d83ea1c99 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* internal file - do not include directly */ -/* To be filled in a later patch */ +#ifdef CONFIG_BPF_JIT +#ifdef CONFIG_INET +#include +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) +#endif +#endif -- cgit From 51c39bb1d5d105a02e29aa7960f0a395086e6342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 9 Jan 2020 22:41:20 -0800 Subject: bpf: Introduce function-by-function verification New llvm and old llvm with libbpf help produce BTF that distinguish global and static functions. Unlike arguments of static function the arguments of global functions cannot be removed or optimized away by llvm. The compiler has to use exactly the arguments specified in a function prototype. The argument type information allows the verifier validate each global function independently. For now only supported argument types are pointer to context and scalars. In the future pointers to structures, sizes, pointer to packet data can be supported as well. Consider the following example: static int f1(int ...) { ... } int f3(int b); int f2(int a) { f1(a) + f3(a); } int f3(int b) { ... } int main(...) { f1(...) + f2(...) + f3(...); } The verifier will start its safety checks from the first global function f2(). It will recursively descend into f1() because it's static. Then it will check that arguments match for the f3() invocation inside f2(). It will not descend into f3(). It will finish f2() that has to be successfully verified for all possible values of 'a'. Then it will proceed with f3(). That function also has to be safe for all possible values of 'b'. Then it will start subprog 0 (which is main() function). It will recursively descend into f1() and will skip full check of f2() and f3(), since they are global. The order of processing global functions doesn't affect safety, since all global functions must be proven safe based on their arguments only. Such function by function verification can drastically improve speed of the verification and reduce complexity. Note that the stack limit of 512 still applies to the call chain regardless whether functions were static or global. The nested level of 8 also still applies. The same recursion prevention checks are in place as well. The type information and static/global kind is preserved after the verification hence in the above example global function f2() and f3() can be replaced later by equivalent functions with the same types that are loaded and verified later without affecting safety of this main() program. Such replacement (re-linking) of global functions is a subject of future patches. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200110064124.1760511-3-ast@kernel.org --- kernel/bpf/btf.c | 175 ++++++++++++++++++++++++++++------- kernel/bpf/verifier.c | 252 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 346 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 81d9cf75cacd..832b5d7fd892 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2651,8 +2651,8 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); + if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) { + btf_verifier_log_type(env, t, "Invalid func linkage"); return -EINVAL; } @@ -3506,7 +3506,8 @@ static u8 bpf_ctx_convert_map[] = { static const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type) + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { const struct btf_type *conv_struct; const struct btf_type *ctx_struct; @@ -3527,12 +3528,13 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, * is not supported yet. * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. */ - bpf_log(log, "BPF program ctx type is not a struct\n"); + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "arg#%d type is not a struct\n", arg); return NULL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } /* prog_type is valid bpf program type. No need for bounds check. */ @@ -3565,11 +3567,12 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, - enum bpf_prog_type prog_type) + enum bpf_prog_type prog_type, + int arg) { const struct btf_member *prog_ctx_type, *kern_ctx_type; - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); if (!prog_ctx_type) return -ENOENT; kern_ctx_type = prog_ctx_type + 1; @@ -3731,7 +3734,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { info->btf_id = ret; return true; @@ -4112,11 +4115,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) +/* Compare BTF of a function with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) { - struct bpf_verifier_state *st = env->cur_state; - struct bpf_func_state *func = st->frame[st->curframe]; - struct bpf_reg_state *reg = func->regs; struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; @@ -4126,27 +4134,30 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) const char *tname; if (!prog->aux->func_info) - return 0; + return -EINVAL; btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) - return 0; + return -EFAULT; if (prog->aux->func_info_aux[subprog].unreliable) - return 0; + return -EINVAL; t = btf_type_by_id(btf, btf_id); if (!t || !btf_type_is_func(t)) { - bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n", + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", subprog); - return -EINVAL; + return -EFAULT; } tname = btf_name_by_offset(btf, t->name_off); t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid type of func %s\n", tname); - return -EINVAL; + bpf_log(log, "Invalid BTF of func %s\n", tname); + return -EFAULT; } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); @@ -4172,25 +4183,127 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) bpf_log(log, "R%d is not a pointer\n", i + 1); goto out; } - /* If program is passing PTR_TO_CTX into subprogram - * check that BTF type matches. + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. */ - if (reg[i + 1].type == PTR_TO_CTX && - !btf_get_prog_ctx_type(log, btf, t, prog->type)) - goto out; - /* All other pointers are ok */ - continue; + if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + if (reg[i + 1].type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); + goto out; + } + if (check_ctx_reg(env, ®[i + 1], i + 1)) + goto out; + continue; + } } - bpf_log(log, "Unrecognized argument type %s\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } return 0; out: - /* LLVM optimizations can remove arguments from static functions. */ - bpf_log(log, - "Type info disagrees with actual arguments due to compiler optimizations\n"); + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ prog->aux->func_info_aux[subprog].unreliable = true; + return -EINVAL; +} + +/* Convert BTF of a function into bpf_reg_state if possible + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - cannot convert BTF. + * 0 - Successfully converted BTF into bpf_reg_state + * (either PTR_TO_CTX or SCALAR_VALUE). + */ +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *reg) +{ + struct bpf_verifier_log *log = &env->log; + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs, btf_id; + const char *tname; + + if (!prog->aux->func_info || + prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { + bpf_log(log, "Verifier bug\n"); + return -EFAULT; + } + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) { + bpf_log(log, "Global functions need valid BTF\n"); + return -EFAULT; + } + + t = btf_type_by_id(btf, btf_id); + if (!t || !btf_type_is_func(t)) { + /* These checks were already done by the verifier while loading + * struct bpf_func_info + */ + bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", + subprog); + return -EFAULT; + } + tname = btf_name_by_offset(btf, t->name_off); + + if (log->level & BPF_LOG_LEVEL) + bpf_log(log, "Validating %s() func#%d...\n", + tname, subprog); + + if (prog->aux->func_info_aux[subprog].unreliable) { + bpf_log(log, "Verifier bug in function %s()\n", tname); + return -EFAULT; + } + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid type of function %s()\n", tname); + return -EFAULT; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > 5) { + bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", + tname, nargs); + return -EINVAL; + } + /* check that function returns int */ + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + bpf_log(log, + "Global function %s() doesn't return scalar. Only those are supported.\n", + tname); + return -EINVAL; + } + /* Convert BTF function arguments into verifier types. + * Only PTR_TO_CTX and SCALAR are supported atm. + */ + for (i = 0; i < nargs; i++) { + t = btf_type_by_id(btf, args[i].type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (btf_type_is_int(t) || btf_type_is_enum(t)) { + reg[i + 1].type = SCALAR_VALUE; + continue; + } + if (btf_type_is_ptr(t) && + btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + reg[i + 1].type = PTR_TO_CTX; + continue; + } + bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", + i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + return -EINVAL; + } return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f5af759a8a5f..ca17dccc17ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1122,10 +1122,6 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); regs[BPF_REG_FP].frameno = state->frameno; - - /* 1st arg to a function */ - regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(env, regs, BPF_REG_1); } #define BPF_MAIN_FUNC (-1) @@ -2739,8 +2735,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { /* Access to ctx or passing it to a helper is only allowed in * its original, unmodified form. @@ -3956,12 +3952,26 @@ static int release_reference(struct bpf_verifier_env *env, return 0; } +static void clear_caller_saved_regs(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + int i; + + /* after the call registers r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; int i, err, subprog, target_insn; + bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -3984,6 +3994,32 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + func_info_aux = env->prog->aux->func_info_aux; + if (func_info_aux) + is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (is_global) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", + subprog); + return err; + } else { + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, + "Func#%d is global and valid. Skipping.\n", + subprog); + clear_caller_saved_regs(env, caller->regs); + + /* All global functions return SCALAR_VALUE */ + mark_reg_unknown(env, caller->regs, BPF_REG_0); + + /* continue with next insn after call */ + return 0; + } + } + callee = kzalloc(sizeof(*callee), GFP_KERNEL); if (!callee) return -ENOMEM; @@ -4010,18 +4046,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call registers r0 - r5 were scratched */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, caller->regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); - } + clear_caller_saved_regs(env, caller->regs); /* only increment it after check_reg_arg() finished */ state->curframe++; - if (btf_check_func_arg_match(env, subprog)) - return -EINVAL; - /* and go analyze first insn of the callee */ *insn_idx = target_insn; @@ -6771,12 +6800,13 @@ static int check_btf_func(struct bpf_verifier_env *env, /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); ret = -EINVAL; goto err_free; } + info_aux[i].linkage = BTF_INFO_VLEN(type->info); prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -7756,35 +7786,13 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; - env->prev_linfo = NULL; - - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); - if (!state) - return -ENOMEM; - state->curframe = 0; - state->speculative = false; - state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); - if (!state->frame[0]) { - kfree(state); - return -ENOMEM; - } - env->cur_state = state; - init_func_state(env, state->frame[0], - BPF_MAIN_FUNC /* callsite */, - 0 /* frameno */, - 0 /* subprogno, zero == main subprog */); - - if (btf_check_func_arg_match(env, 0)) - return -EINVAL; - for (;;) { struct bpf_insn *insn; u8 class; @@ -7862,7 +7870,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -8082,7 +8090,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -8095,7 +8103,6 @@ process_bpf_exit: env->insn_idx++; } - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -8372,7 +8379,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = true; + new_data[i].seen = env->pass_cnt; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -9484,6 +9491,7 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->free_list = NULL; if (!env->explored_states) return; @@ -9497,11 +9505,159 @@ static void free_states(struct bpf_verifier_env *env) kfree(sl); sl = sln; } + env->explored_states[i] = NULL; } +} - kvfree(env->explored_states); +/* The verifier is using insn_aux_data[] to store temporary data during + * verification and to store information for passes that run after the + * verification like dead code sanitization. do_check_common() for subprogram N + * may analyze many other subprograms. sanitize_insn_aux_data() clears all + * temporary data after do_check_common() finds that subprogram N cannot be + * verified independently. pass_cnt counts the number of times + * do_check_common() was run and insn->aux->seen tells the pass number + * insn_aux_data was touched. These variables are compared to clear temporary + * data from failed pass. For testing and experiments do_check_common() can be + * run multiple times even when prior attempt to verify is unsuccessful. + */ +static void sanitize_insn_aux_data(struct bpf_verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + struct bpf_insn_aux_data *aux; + int i, class; + + for (i = 0; i < env->prog->len; i++) { + class = BPF_CLASS(insn[i].code); + if (class != BPF_LDX && class != BPF_STX) + continue; + aux = &env->insn_aux_data[i]; + if (aux->seen != env->pass_cnt) + continue; + memset(aux, 0, offsetof(typeof(*aux), orig_idx)); + } } +static int do_check_common(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_verifier_state *state; + struct bpf_reg_state *regs; + int ret, i; + + env->prev_linfo = NULL; + env->pass_cnt++; + + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + state->curframe = 0; + state->speculative = false; + state->branches = 1; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + subprog); + + regs = state->frame[state->curframe]->regs; + if (subprog) { + ret = btf_prepare_func_args(env, subprog, regs); + if (ret) + goto out; + for (i = BPF_REG_1; i <= BPF_REG_5; i++) { + if (regs[i].type == PTR_TO_CTX) + mark_reg_known_zero(env, regs, i); + else if (regs[i].type == SCALAR_VALUE) + mark_reg_unknown(env, regs, i); + } + } else { + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; + mark_reg_known_zero(env, regs, BPF_REG_1); + ret = btf_check_func_arg_match(env, subprog, regs); + if (ret == -EFAULT) + /* unlikely verifier bug. abort. + * ret == 0 and ret < 0 are sadly acceptable for + * main() function due to backward compatibility. + * Like socket filter program may be written as: + * int bpf_prog(struct pt_regs *ctx) + * and never dereference that ctx in the program. + * 'struct pt_regs' is a type mismatch for socket + * filter that should be using 'struct __sk_buff'. + */ + goto out; + } + + ret = do_check(env); +out: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL)); + free_states(env); + if (ret) + /* clean aux data in case subprog was rejected */ + sanitize_insn_aux_data(env); + return ret; +} + +/* Verify all global functions in a BPF program one by one based on their BTF. + * All global functions must pass verification. Otherwise the whole program is rejected. + * Consider: + * int bar(int); + * int foo(int f) + * { + * return bar(f); + * } + * int bar(int b) + * { + * ... + * } + * foo() will be verified first for R1=any_scalar_value. During verification it + * will be assumed that bar() already verified successfully and call to bar() + * from foo() will be checked for type match only. Later bar() will be verified + * independently to check that it's safe for R1=any_scalar_value. + */ +static int do_check_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + int i, ret; + + if (!aux->func_info) + return 0; + + for (i = 1; i < env->subprog_cnt; i++) { + if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + continue; + env->insn_idx = env->subprog_info[i].start; + WARN_ON_ONCE(env->insn_idx == 0); + ret = do_check_common(env, i); + if (ret) { + return ret; + } else if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, + "Func#%d is safe for any args that match its prototype\n", + i); + } + } + return 0; +} + +static int do_check_main(struct bpf_verifier_env *env) +{ + int ret; + + env->insn_idx = 0; + ret = do_check_common(env, 0); + if (!ret) + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + return ret; +} + + static void print_verification_stats(struct bpf_verifier_env *env) { int i; @@ -9849,18 +10005,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } + ret = do_check_subprogs(env); + ret = ret ?: do_check_main(env); if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); + kvfree(env->explored_states); if (ret == 0) ret = check_max_stack_depth(env); -- cgit From 3b4130418f62b0e7a4685cc2c03bb41c6cb8922d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 13 Jan 2020 23:26:47 -0800 Subject: bpf: Fix seq_show for BPF_MAP_TYPE_STRUCT_OPS Instead of using bpf_struct_ops_map_lookup_elem() which is not implemented, bpf_struct_ops_map_seq_show_elem() should also use bpf_struct_ops_map_sys_lookup_elem() which does an inplace update to the value. The change allocates a value to pass to bpf_struct_ops_map_sys_lookup_elem(). [root@arch-fb-vm1 bpf]# cat /sys/fs/bpf/dctcp {{{1}},BPF_STRUCT_OPS_STATE_INUSE,{{00000000df93eebc,00000000df93eebc},0,2, ... Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Reported-by: Dan Carpenter Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200114072647.3188298-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ddf48f49914b..8ad1c9ea26b2 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -496,14 +496,20 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; + int err; - value = bpf_struct_ops_map_lookup_elem(map, key); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) return; - btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, - value, m); - seq_puts(m, "\n"); + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + if (!err) { + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); + } + + kfree(value); } static void bpf_struct_ops_map_free(struct bpf_map *map) -- cgit From 8482941f09067da42f9c3362e15bfb3f3c19d610 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Jan 2020 19:50:02 -0800 Subject: bpf: Add bpf_send_signal_thread() helper Commit 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") added helper bpf_send_signal() which permits bpf program to send a signal to the current process. The signal may be delivered to any threads in the process. We found a use case where sending the signal to the current thread is more preferable. - A bpf program will collect the stack trace and then send signal to the user application. - The user application will add some thread specific information to the just collected stack trace for later analysis. If bpf_send_signal() is used, user application will need to check whether the thread receiving the signal matches the thread collecting the stack by checking thread id. If not, it will need to send signal to another thread through pthread_kill(). This patch proposed a new helper bpf_send_signal_thread(), which sends the signal to the thread corresponding to the current kernel task. This way, user space is guaranteed that bpf_program execution context and user space signal handling context are the same thread. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115035002.602336-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e5ef4ae9edb5..19e793aa441a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -703,6 +703,7 @@ struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; + enum pid_type type; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -712,10 +713,10 @@ static void do_bpf_send_signal(struct irq_work *entry) struct send_signal_irq_work *work; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); } -BPF_CALL_1(bpf_send_signal, u32, sig) +static int bpf_send_signal_common(u32 sig, enum pid_type type) { struct send_signal_irq_work *work = NULL; @@ -748,11 +749,17 @@ BPF_CALL_1(bpf_send_signal, u32, sig) */ work->task = current; work->sig = sig; + work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); + return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_TGID); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -762,6 +769,18 @@ static const struct bpf_func_proto bpf_send_signal_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_send_signal_thread, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_PID); +} + +static const struct bpf_func_proto bpf_send_signal_thread_proto = { + .func = bpf_send_signal_thread, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -822,6 +841,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; default: return NULL; } -- cgit From 15c14a3dca421f086c187155afba3222b879472d Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:00 -0800 Subject: bpf: Add bpf_map_{value_size, update_value, map_copy_value} functions This commit moves reusable code from map_lookup_elem and map_update_elem to avoid code duplication in kernel/bpf/syscall.c. Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200115184308.162644-2-brianvv@google.com --- kernel/bpf/syscall.c | 280 ++++++++++++++++++++++++++++----------------------- 1 file changed, 152 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f9db72a96ec0..08b0b6e40454 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -129,6 +129,154 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static u32 bpf_map_value_size(struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + return round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + return sizeof(u32); + else + return map->value_size; +} + +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ + /* Wait for any running BPF programs to complete so that + * userspace, when we return to it, knows that all programs + * that could be running use the new map value. + */ + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) + synchronize_rcu(); +} + +static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, + void *value, __u64 flags) +{ + int err; + + /* Need to create a kthread, thus must support schedule */ + if (bpf_map_is_dev_bound(map)) { + return bpf_map_offload_update_elem(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + return map->ops->map_update_elem(map, key, value, flags); + } else if (IS_FD_PROG_ARRAY(map)) { + return bpf_fd_array_map_update_elem(map, f.file, key, value, + flags); + } + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + flags); + } else if (IS_FD_ARRAY(map)) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + flags); + rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + flags); + rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + + return err; +} + +static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, + __u64 flags) +{ + void *ptr; + int err; + + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + return err; + } + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); + } else { + rcu_read_lock(); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; + if (flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); + } + rcu_read_unlock(); + } + + this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + + return err; +} + static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer @@ -827,7 +975,7 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; - void *key, *value, *ptr; + void *key, *value; u32 value_size; struct fd f; int err; @@ -859,75 +1007,14 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - value_size = round_up(map->value_size, 8) * num_possible_cpus(); - else if (IS_FD_MAP(map)) - value_size = sizeof(u32); - else - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - goto done; - } - - preempt_disable(); - this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_lookup_elem(map, key, value); - } else if (IS_FD_HASH(map)) { - err = bpf_fd_htab_map_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - err = bpf_fd_reuseport_array_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_peek_elem(map, value); - } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - /* struct_ops map requires directly updating "value" */ - err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); - } else { - rcu_read_lock(); - if (map->ops->map_lookup_elem_sys_only) - ptr = map->ops->map_lookup_elem_sys_only(map, key); - else - ptr = map->ops->map_lookup_elem(map, key); - if (IS_ERR(ptr)) { - err = PTR_ERR(ptr); - } else if (!ptr) { - err = -ENOENT; - } else { - err = 0; - if (attr->flags & BPF_F_LOCK) - /* lock 'ptr' and copy everything but lock */ - copy_map_value_locked(map, value, ptr, true); - else - copy_map_value(map, value, ptr); - /* mask lock, since value wasn't zero inited */ - check_and_init_map_lock(map, value); - } - rcu_read_unlock(); - } - this_cpu_dec(bpf_prog_active); - preempt_enable(); - -done: + err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; @@ -946,16 +1033,6 @@ err_put: return err; } -static void maybe_wait_bpf_programs(struct bpf_map *map) -{ - /* Wait for any running BPF programs to complete so that - * userspace, when we return to it, knows that all programs - * that could be running use the new map value. - */ - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); -} #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags @@ -1011,61 +1088,8 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_update_elem(map, key, value, attr->flags); - goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - err = map->ops->map_update_elem(map, key, value, attr->flags); - goto out; - } else if (IS_FD_PROG_ARRAY(map)) { - err = bpf_fd_array_map_update_elem(map, f.file, key, value, - attr->flags); - goto out; - } + err = bpf_map_update_value(map, f, key, value, attr->flags); - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from - * inside bpf map update or delete otherwise deadlocks are possible - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_update(map, key, value, - attr->flags); - } else if (IS_FD_ARRAY(map)) { - rcu_read_lock(); - err = bpf_fd_array_map_update_elem(map, f.file, key, value, - attr->flags); - rcu_read_unlock(); - } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - rcu_read_lock(); - err = bpf_fd_htab_map_update_elem(map, f.file, key, value, - attr->flags); - rcu_read_unlock(); - } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { - /* rcu_read_lock() is not needed */ - err = bpf_fd_reuseport_array_update_elem(map, key, value, - attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_QUEUE || - map->map_type == BPF_MAP_TYPE_STACK) { - err = map->ops->map_push_elem(map, value, attr->flags); - } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); - } - __this_cpu_dec(bpf_prog_active); - preempt_enable(); - maybe_wait_bpf_programs(map); -out: free_value: kfree(value); free_key: -- cgit From cb4d03ab499d4c040f4ab6fd4389d2b49f42b5a5 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:01 -0800 Subject: bpf: Add generic support for lookup batch op This commit introduces generic support for the bpf_map_lookup_batch. This implementation can be used by almost all the bpf maps since its core implementation is relying on the existing map_get_next_key and map_lookup_elem. The bpf syscall subcommand introduced is: BPF_MAP_LOOKUP_BATCH The UAPI attribute is: struct { /* struct used by BPF_MAP_*_BATCH commands */ __aligned_u64 in_batch; /* start batch, * NULL to start from beginning */ __aligned_u64 out_batch; /* output: next start batch */ __aligned_u64 keys; __aligned_u64 values; __u32 count; /* input/output: * input: # of key/value * elements * output: # of filled elements */ __u32 map_fd; __u64 elem_flags; __u64 flags; } batch; in_batch/out_batch are opaque values use to communicate between user/kernel space, in_batch/out_batch must be of key_size length. To start iterating from the beginning in_batch must be null, count is the # of key/value elements to retrieve. Note that the 'keys' buffer must be a buffer of key_size * count size and the 'values' buffer must be value_size * count, where value_size must be aligned to 8 bytes by userspace if it's dealing with percpu maps. 'count' will contain the number of keys/values successfully retrieved. Note that 'count' is an input/output variable and it can contain a lower value after a call. If there's no more entries to retrieve, ENOENT will be returned. If error is ENOENT, count might be > 0 in case it copied some values but there were no more entries to retrieve. Note that if the return code is an error and not -EFAULT, count indicates the number of elements successfully processed. Suggested-by: Stanislav Fomichev Signed-off-by: Brian Vazquez Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-3-brianvv@google.com --- kernel/bpf/syscall.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 156 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 08b0b6e40454..d604ddbb1afb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -219,10 +219,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, void *ptr; int err; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - return err; - } + if (bpf_map_is_dev_bound(map)) + return bpf_map_offload_lookup_elem(map, key, value); preempt_disable(); this_cpu_inc(bpf_prog_active); @@ -1220,6 +1218,109 @@ err_put: return err; } +#define MAP_LOOKUP_RETRIES 3 + +int generic_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + void *buf, *buf_prevkey, *prev_key, *key, *value; + int err, retry = MAP_LOOKUP_RETRIES; + u32 value_size, cp, max_count; + bool first_key = false; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) + return -EINVAL; + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!buf_prevkey) + return -ENOMEM; + + buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + if (!buf) { + kvfree(buf_prevkey); + return -ENOMEM; + } + + err = -EFAULT; + first_key = false; + prev_key = NULL; + if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) + goto free_buf; + key = buf; + value = key + map->key_size; + if (ubatch) + prev_key = buf_prevkey; + + for (cp = 0; cp < max_count;) { + rcu_read_lock(); + err = map->ops->map_get_next_key(map, prev_key, key); + rcu_read_unlock(); + if (err) + break; + err = bpf_map_copy_value(map, key, value, + attr->batch.elem_flags); + + if (err == -ENOENT) { + if (retry) { + retry--; + continue; + } + err = -EINTR; + break; + } + + if (err) + goto free_buf; + + if (copy_to_user(keys + cp * map->key_size, key, + map->key_size)) { + err = -EFAULT; + goto free_buf; + } + if (copy_to_user(values + cp * value_size, value, value_size)) { + err = -EFAULT; + goto free_buf; + } + + if (!prev_key) + prev_key = buf_prevkey; + + swap(prev_key, key); + retry = MAP_LOOKUP_RETRIES; + cp++; + } + + if (err == -EFAULT) + goto free_buf; + + if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || + (cp && copy_to_user(uobatch, prev_key, map->key_size)))) + err = -EFAULT; + +free_buf: + kfree(buf_prevkey); + kfree(buf); + return err; +} + #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value static int map_lookup_and_delete_elem(union bpf_attr *attr) @@ -3076,6 +3177,54 @@ out: return err; } +#define BPF_MAP_BATCH_LAST_FIELD batch.flags + +#define BPF_DO_BATCH(fn) \ + do { \ + if (!fn) { \ + err = -ENOTSUPP; \ + goto err_put; \ + } \ + err = fn(map, attr, uattr); \ + } while (0) + +static int bpf_map_do_batch(const union bpf_attr *attr, + union bpf_attr __user *uattr, + int cmd) +{ + struct bpf_map *map; + int err, ufd; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_BATCH)) + return -EINVAL; + + ufd = attr->batch.map_fd; + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (cmd == BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + + if (cmd != BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + if (cmd == BPF_MAP_LOOKUP_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_batch); + +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3173,6 +3322,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; + case BPF_MAP_LOOKUP_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); + break; default: err = -EINVAL; break; -- cgit From aa2e93b8e58e18442edfb2427446732415bc215e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:02 -0800 Subject: bpf: Add generic support for update and delete batch ops This commit adds generic support for update and delete batch ops that can be used for almost all the bpf maps. These commands share the same UAPI attr that lookup and lookup_and_delete batch ops use and the syscall commands are: BPF_MAP_UPDATE_BATCH BPF_MAP_DELETE_BATCH The main difference between update/delete and lookup batch ops is that for update/delete keys/values must be specified for userspace and because of that, neither in_batch nor out_batch are used. Suggested-by: Stanislav Fomichev Signed-off-by: Brian Vazquez Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-4-brianvv@google.com --- kernel/bpf/syscall.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d604ddbb1afb..ce8244d1ba99 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1218,6 +1218,111 @@ err_put: return err; } +int generic_map_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 cp, max_count; + int err = 0; + void *key; + + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + max_count = attr->batch.count; + if (!max_count) + return 0; + + for (cp = 0; cp < max_count; cp++) { + key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + break; + } + + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + break; + } + + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + maybe_wait_bpf_programs(map); + if (err) + break; + } + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + return err; +} + +int generic_map_update_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + void __user *values = u64_to_user_ptr(attr->batch.values); + void __user *keys = u64_to_user_ptr(attr->batch.keys); + u32 value_size, cp, max_count; + int ufd = attr->map_fd; + void *key, *value; + struct fd f; + int err = 0; + + f = fdget(ufd); + if (attr->batch.elem_flags & ~BPF_F_LOCK) + return -EINVAL; + + if ((attr->batch.elem_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + return -EINVAL; + } + + value_size = bpf_map_value_size(map); + + max_count = attr->batch.count; + if (!max_count) + return 0; + + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + return -ENOMEM; + + for (cp = 0; cp < max_count; cp++) { + key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + break; + } + err = -EFAULT; + if (copy_from_user(value, values + cp * value_size, value_size)) + break; + + err = bpf_map_update_value(map, f, key, value, + attr->batch.elem_flags); + + if (err) + break; + } + + if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) + err = -EFAULT; + + kfree(value); + kfree(key); + return err; +} + #define MAP_LOOKUP_RETRIES 3 int generic_map_lookup_batch(struct bpf_map *map, @@ -3219,6 +3324,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_UPDATE_BATCH) + BPF_DO_BATCH(map->ops->map_update_batch); + else + BPF_DO_BATCH(map->ops->map_delete_batch); err_put: fdput(f); @@ -3325,6 +3434,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); break; + case BPF_MAP_UPDATE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); + break; + case BPF_MAP_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); + break; default: err = -EINVAL; break; -- cgit From c60f2d2861778de6370a4f4ca6ab1d7d4a32efae Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Wed, 15 Jan 2020 10:43:03 -0800 Subject: bpf: Add lookup and update batch ops to arraymap This adds the generic batch ops functionality to bpf arraymap, note that since deletion is not a valid operation for arraymap, only batch and lookup are added. Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200115184308.162644-5-brianvv@google.com --- kernel/bpf/arraymap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f0d19bbb9211..95d77770353c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -503,6 +503,8 @@ const struct bpf_map_ops array_map_ops = { .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, }; const struct bpf_map_ops percpu_array_map_ops = { -- cgit From 057996380a42bb64ccc04383cfa9c0ace4ea11f0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 15 Jan 2020 10:43:04 -0800 Subject: bpf: Add batch ops to all htab bpf map htab can't use generic batch support due some problematic behaviours inherent to the data structre, i.e. while iterating the bpf map a concurrent program might delete the next entry that batch was about to use, in that case there's no easy solution to retrieve the next entry, the issue has been discussed multiple times (see [1] and [2]). The only way hmap can be traversed without the problem previously exposed is by making sure that the map is traversing entire buckets. This commit implements those strict requirements for hmap, the implementation follows the same interaction that generic support with some exceptions: - If keys/values buffer are not big enough to traverse a bucket, ENOSPC will be returned. - out_batch contains the value of the next bucket in the iteration, not the next key, but this is transparent for the user since the user should never use out_batch for other than bpf batch syscalls. This commits implements BPF_MAP_LOOKUP_BATCH and adds support for new command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for update/delete batch ops it is possible to use the generic implementations. [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/ [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/ Signed-off-by: Yonghong Song Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115184308.162644-6-brianvv@google.com --- kernel/bpf/hashtab.c | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 9 +- 2 files changed, 272 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 22066a62c8c9..2d182c4ee9d9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,6 +17,16 @@ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) +#define BATCH_OPS(_name) \ + .map_lookup_batch = \ + _name##_map_lookup_batch, \ + .map_lookup_and_delete_batch = \ + _name##_map_lookup_and_delete_batch, \ + .map_update_batch = \ + generic_map_update_batch, \ + .map_delete_batch = \ + generic_map_delete_batch + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int +__htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete, bool is_lru_map, + bool is_percpu) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 bucket_cnt, total, key_size, value_size, roundup_key_size; + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + u32 batch, max_count, size, bucket_size; + u64 elem_map_flags, map_flags; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + unsigned long flags; + struct htab_elem *l; + struct bucket *b; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + if ((elem_map_flags & ~BPF_F_LOCK) || + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return -EINVAL; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + batch = 0; + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) + return -EFAULT; + + if (batch >= htab->n_buckets) + return -ENOENT; + + key_size = htab->map.key_size; + roundup_key_size = round_up(htab->map.key_size, 8); + value_size = htab->map.value_size; + size = round_up(value_size, 8); + if (is_percpu) + value_size = size * num_possible_cpus(); + total = 0; + /* while experimenting with hash tables with sizes ranging from 10 to + * 1000, it was observed that a bucket can have upto 5 entries. + */ + bucket_size = 5; + +alloc: + /* We cannot do copy_from_user or copy_to_user inside + * the rcu_read_lock. Allocate enough space here. + */ + keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); + if (!keys || !values) { + ret = -ENOMEM; + goto after_loop; + } + +again: + preempt_disable(); + this_cpu_inc(bpf_prog_active); + rcu_read_lock(); +again_nocopy: + dst_key = keys; + dst_val = values; + b = &htab->buckets[batch]; + head = &b->head; + raw_spin_lock_irqsave(&b->lock, flags); + + bucket_cnt = 0; + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + bucket_cnt++; + + if (bucket_cnt > (max_count - total)) { + if (total == 0) + ret = -ENOSPC; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + goto after_loop; + } + + if (bucket_cnt > bucket_size) { + bucket_size = bucket_cnt; + raw_spin_unlock_irqrestore(&b->lock, flags); + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + kvfree(keys); + kvfree(values); + goto alloc; + } + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + memcpy(dst_key, l->key, key_size); + + if (is_percpu) { + int off = 0, cpu; + void __percpu *pptr; + + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(dst_val + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + } else { + value = l->key + roundup_key_size; + if (elem_map_flags & BPF_F_LOCK) + copy_map_value_locked(map, dst_val, value, + true); + else + copy_map_value(map, dst_val, value); + check_and_init_map_lock(map, dst_val); + } + if (do_delete) { + hlist_nulls_del_rcu(&l->hash_node); + if (is_lru_map) + bpf_lru_push_free(&htab->lru, &l->lru_node); + else + free_htab_elem(htab, l); + } + dst_key += key_size; + dst_val += value_size; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + /* If we are not copying data, we can go to next bucket and avoid + * unlocking the rcu. + */ + if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { + batch++; + goto again_nocopy; + } + + rcu_read_unlock(); + this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, + key_size * bucket_cnt) || + copy_to_user(uvalues + total * value_size, values, + value_size * bucket_cnt))) { + ret = -EFAULT; + goto after_loop; + } + + total += bucket_cnt; + batch++; + if (batch >= htab->n_buckets) { + ret = -ENOENT; + goto after_loop; + } + goto again; + +after_loop: + if (ret == -EFAULT) + goto out; + + /* copy # of entries and next batch */ + ubatch = u64_to_user_ptr(attr->batch.out_batch); + if (copy_to_user(ubatch, &batch, sizeof(batch)) || + put_user(total, &uattr->batch.count)) + ret = -EFAULT; + +out: + kvfree(keys); + kvfree(values); + return ret; +} + +static int +htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, true); +} + +static int +htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, true); +} + +static int +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + false, false); +} + +static int +htab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + false, false); +} + +static int +htab_lru_percpu_map_lookup_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, true); +} + +static int +htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, true); +} + +static int +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, + true, false); +} + +static int +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, + true, false); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab), }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + BATCH_OPS(htab_lru), }; /* Called from eBPF program */ @@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_percpu), }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + BATCH_OPS(htab_lru_percpu), }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ce8244d1ba99..0d94d361379c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3310,7 +3310,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (IS_ERR(map)) return PTR_ERR(map); - if (cmd == BPF_MAP_LOOKUP_BATCH && + if ((cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; @@ -3324,6 +3325,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch); + else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); else if (cmd == BPF_MAP_UPDATE_BATCH) BPF_DO_BATCH(map->ops->map_update_batch); else @@ -3434,6 +3437,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); break; + case BPF_MAP_LOOKUP_AND_DELETE_BATCH: + err = bpf_map_do_batch(&attr, uattr, + BPF_MAP_LOOKUP_AND_DELETE_BATCH); + break; case BPF_MAP_UPDATE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); break; -- cgit From 75ccae62cb8d42a619323a85c577107b8b37d797 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 16 Jan 2020 16:14:44 +0100 Subject: xdp: Move devmap bulk queue into struct net_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 96360004b862 ("xdp: Make devmap flush_list common for all map instances"), changed devmap flushing to be a global operation instead of a per-map operation. However, the queue structure used for bulking was still allocated as part of the containing map. This patch moves the devmap bulk queue into struct net_device. The motivation for this is reusing it for the non-map variant of XDP_REDIRECT, which will be changed in a subsequent commit. To avoid other fields of struct net_device moving to different cache lines, we also move a couple of other members around. We defer the actual allocation of the bulk queue structure until the NETDEV_REGISTER notification devmap.c. This makes it possible to check for ndo_xdp_xmit support before allocating the structure, which is not possible at the time struct net_device is allocated. However, we keep the freeing in free_netdev() to avoid adding another RCU callback on NETDEV_UNREGISTER. Because of this change, we lose the reference back to the map that originated the redirect, so change the tracepoint to always return 0 as the map ID and index. Otherwise no functional change is intended with this patch. After this patch, the relevant part of struct net_device looks like this, according to pahole: /* --- cacheline 14 boundary (896 bytes) --- */ struct netdev_queue * _tx __attribute__((__aligned__(64))); /* 896 8 */ unsigned int num_tx_queues; /* 904 4 */ unsigned int real_num_tx_queues; /* 908 4 */ struct Qdisc * qdisc; /* 912 8 */ unsigned int tx_queue_len; /* 920 4 */ spinlock_t tx_global_lock; /* 924 4 */ struct xdp_dev_bulk_queue * xdp_bulkq; /* 928 8 */ struct xps_dev_maps * xps_cpus_map; /* 936 8 */ struct xps_dev_maps * xps_rxqs_map; /* 944 8 */ struct mini_Qdisc * miniq_egress; /* 952 8 */ /* --- cacheline 15 boundary (960 bytes) --- */ struct hlist_head qdisc_hash[16]; /* 960 128 */ /* --- cacheline 17 boundary (1088 bytes) --- */ struct timer_list watchdog_timer; /* 1088 40 */ /* XXX last struct has 4 bytes of padding */ int watchdog_timeo; /* 1128 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head todo_list; /* 1136 16 */ /* --- cacheline 18 boundary (1152 bytes) --- */ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768397.1458396.12673224324627072349.stgit@toke.dk --- kernel/bpf/devmap.c | 63 +++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index da9c832fc5c8..030d125c3839 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,13 +53,11 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 -struct bpf_dtab_netdev; - -struct xdp_bulk_queue { +struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; + struct net_device *dev; struct net_device *dev_rx; - struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -67,9 +65,8 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; - struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; - unsigned int idx; /* keep track of map index for tracepoint */ + unsigned int idx; }; struct bpf_dtab { @@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) +static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { - struct bpf_dtab_netdev *obj = bq->obj; - struct net_device *dev = obj->dev; + struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; @@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, - sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; @@ -374,7 +367,7 @@ error: void __dev_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq, *tmp; + struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) @@ -401,12 +394,12 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, +static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -444,7 +437,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf, dev_rx); + return bq_enqueue(dev, xdpf, dev_rx); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -483,7 +476,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -538,30 +530,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, u32 ifindex, unsigned int idx) { - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev; - struct xdp_bulk_queue *bq; - int cpu; - dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { - free_percpu(dev->bulkq); kfree(dev); return ERR_PTR(-EINVAL); } @@ -721,9 +698,23 @@ static int dev_map_notification(struct notifier_block *notifier, { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; - int i; + int i, cpu; switch (event) { + case NETDEV_REGISTER: + if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) + break; + + /* will be freed in free_netdev() */ + netdev->xdp_bulkq = + __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), + sizeof(void *), GFP_ATOMIC); + if (!netdev->xdp_bulkq) + return NOTIFY_BAD; + + for_each_possible_cpu(cpu) + per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete -- cgit From 1d233886dd904edbf239eeffe435c3308ae97625 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 16 Jan 2020 16:14:45 +0100 Subject: xdp: Use bulking for non-map XDP_REDIRECT and consolidate code paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the bulk queue used by XDP_REDIRECT now lives in struct net_device, we can re-use the bulking for the non-map version of the bpf_redirect() helper. This is a simple matter of having xdp_do_redirect_slow() queue the frame on the bulk queue instead of sending it out with __bpf_tx_xdp(). Unfortunately we can't make the bpf_redirect() helper return an error if the ifindex doesn't exit (as bpf_redirect_map() does), because we don't have a reference to the network namespace of the ingress device at the time the helper is called. So we have to leave it as-is and keep the device lookup in xdp_do_redirect_slow(). Since this leaves less reason to have the non-map redirect code in a separate function, so we get rid of the xdp_do_redirect_slow() function entirely. This does lose us the tracepoint disambiguation, but fortunately the xdp_redirect and xdp_redirect_map tracepoints use the same tracepoint entry structures. This means both can contain a map index, so we can just amend the tracepoint definitions so we always emit the xdp_redirect(_err) tracepoints, but with the map ID only populated if a map is present. This means we retire the xdp_redirect_map(_err) tracepoints entirely, but keep the definitions around in case someone is still listening for them. With this change, the performance of the xdp_redirect sample program goes from 5Mpps to 8.4Mpps (a 68% increase). Since the flush functions are no longer map-specific, rename the flush() functions to drop _map from their names. One of the renamed functions is the xdp_do_flush_map() callback used in all the xdp-enabled drivers. To keep from having to update all drivers, use a #define to keep the old name working, and only update the virtual drivers in this patch. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768505.1458396.17518057312953572912.stgit@toke.dk --- kernel/bpf/devmap.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 030d125c3839..d5311009953f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -81,7 +81,7 @@ struct bpf_dtab { u32 n_buckets; }; -static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); +static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -357,16 +357,16 @@ error: goto out; } -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled +/* __dev_flush is called from xdp_do_flush() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(void) +void __dev_flush(void) { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -396,9 +396,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) */ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) - { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -419,10 +418,9 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, - struct net_device *dev_rx) +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) { - struct net_device *dev = dst->dev; struct xdp_frame *xdpf; int err; @@ -440,6 +438,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return __xdp_enqueue(dev, xdp, dev_rx); +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + + return __xdp_enqueue(dev, xdp, dev_rx); +} + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -762,7 +774,7 @@ static int __init dev_map_init(void) register_netdevice_notifier(&dev_map_notifier); for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); + INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } -- cgit From 58aa94f922c1b44e0919d1814d2eab5b9e8bf945 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 16 Jan 2020 16:14:46 +0100 Subject: devmap: Adjust tracepoint for map-less queue flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we don't have a reference to a devmap when flushing the device bulk queue, let's change the the devmap_xmit tracepoint to remote the map_id and map_index fields entirely. Rearrange the fields so 'drops' and 'sent' stay in the same position in the tracepoint struct, to make it possible for the xdp_monitor utility to read both the old and the new format. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/157918768613.1458396.9165902403373826572.stgit@toke.dk --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d5311009953f..de630f980282 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -340,7 +340,7 @@ static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; -- cgit From 81f2b572cf4fd5c4178fe0e2b5bb1ab096385fd8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 16 Jan 2020 22:53:00 +0800 Subject: bpf: Remove set but not used variable 'first_key' kernel/bpf/syscall.c: In function generic_map_lookup_batch: kernel/bpf/syscall.c:1339:7: warning: variable first_key set but not used [-Wunused-but-set-variable] It is never used, so remove it. Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Alexei Starovoitov Acked-by: Brian Vazquez Link: https://lore.kernel.org/bpf/20200116145300.59056-1-yuehaibing@huawei.com --- kernel/bpf/syscall.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d94d361379c..c26a71460f02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1336,7 +1336,6 @@ int generic_map_lookup_batch(struct bpf_map *map, void *buf, *buf_prevkey, *prev_key, *key, *value; int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; - bool first_key = false; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -1365,7 +1364,6 @@ int generic_map_lookup_batch(struct bpf_map *map, } err = -EFAULT; - first_key = false; prev_key = NULL; if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) goto free_buf; -- cgit From 31537cf8f3f95d45360b995ad8be2c870edc5b02 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 8 Jan 2020 08:57:55 -0500 Subject: tracing: Initialize ret in syscall_enter_define_fields() If syscall_enter_define_fields() is called on a system call with no arguments, the return code variable "ret" will never get initialized. Initialize it to zero. Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/0FA8C6E3-D9F5-416D-A1B0-5E4CD583A101@lca.pw --- kernel/trace/trace_syscalls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 73140d80dd46..2978c29d87d4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -274,7 +274,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; int offset = offsetof(typeof(trace), args); - int ret, i; + int ret = 0; + int i; for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], -- cgit From db5793c5993d265fe6644b6638fcb0758f6b5347 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 18 Dec 2019 05:31:25 +0000 Subject: watchdog: Remove soft_lockup_hrtimer_cnt and related code After commit 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work"), the percpu soft_lockup_hrtimer_cnt is not used any more, so remove it and related code. Signed-off-by: Jisheng Zhang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191218131720.4146aea2@xhacker.debian --- kernel/watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f41334ef0971..0621301ae8cf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -173,7 +173,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -350,8 +349,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); complete(this_cpu_ptr(&softlockup_completion)); -- cgit From 5f68eb19b5716f8cf3ccfa833cffd1522813b0e8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 20 Dec 2019 12:04:53 +0100 Subject: sched/fair : Improve update_sd_pick_busiest for spare capacity case Similarly to calculate_imbalance() and find_busiest_group(), using the number of idle CPUs when there is only 1 CPU in the group is not efficient because we can't make a difference between a CPU running 1 task and a CPU running dozens of small tasks competing for the same CPU but not enough to overload it. More generally speaking, we should use the number of running tasks when there is the same number of idle CPUs in a group instead of blindly select the 1st one. When the groups have spare capacity and the same number of idle CPUs, we compare the number of running tasks to select the busiest group. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1576839893-26930-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2d170b5da0e3..35c105759dfa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8181,14 +8181,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_has_spare: /* - * Select not overloaded group with lowest number of - * idle cpus. We could also compare the spare capacity - * which is more stable but it can end up that the - * group has less spare capacity but finally more idle + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle * CPUs which means less opportunity to pull tasks. */ - if (sgs->idle_cpus >= busiest->idle_cpus) + if (sgs->idle_cpus > busiest->idle_cpus) return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) + return false; + break; } -- cgit From 323af6deaf70f204880caf94678350802682e0dc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 8 Jan 2020 13:57:04 +0530 Subject: sched/fair: Load balance aggressively for SCHED_IDLE CPUs The fair scheduler performs periodic load balance on every CPU to check if it can pull some tasks from other busy CPUs. The duration of this periodic load balance is set to sd->balance_interval for the idle CPUs and is calculated by multiplying the sd->balance_interval with the sd->busy_factor (set to 32 by default) for the busy CPUs. The multiplication is done for busy CPUs to avoid doing load balance too often and rather spend more time executing actual task. While that is the right thing to do for the CPUs busy with SCHED_OTHER or SCHED_BATCH tasks, it may not be the optimal thing for CPUs running only SCHED_IDLE tasks. With the recent enhancements in the fair scheduler around SCHED_IDLE CPUs, we now prefer to enqueue a newly-woken task to a SCHED_IDLE CPU instead of other busy or idle CPUs. The same reasoning should be applied to the load balancer as well to make it migrate tasks more aggressively to a SCHED_IDLE CPU, as that will reduce the scheduling latency of the migrated (SCHED_OTHER) tasks. This patch makes minimal changes to the fair scheduler to do the next load balance soon after the last non SCHED_IDLE task is dequeued from a runqueue, i.e. making the CPU SCHED_IDLE. Also the sd->busy_factor is ignored while calculating the balance_interval for such CPUs. This is done to avoid delaying the periodic load balance by few hundred milliseconds for SCHED_IDLE CPUs. This is tested on ARM64 Hikey620 platform (octa-core) with the help of rt-app and it is verified, using kernel traces, that the newly SCHED_IDLE CPU does load balancing shortly after it becomes SCHED_IDLE and pulls tasks from other busy CPUs. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/e485827eb8fe7db0943d6f3f6e0f5a4a70272781.1578471925.git.viresh.kumar@linaro.org --- kernel/sched/fair.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35c105759dfa..d292883694b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5210,6 +5210,18 @@ static inline void update_overutilized_status(struct rq *rq) static inline void update_overutilized_status(struct rq *rq) { } #endif +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5324,6 +5336,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5370,6 +5383,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5392,15 +5409,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* CPU only has SCHED_IDLE tasks enqueued */ -static int sched_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && - rq->nr_running); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -9546,6 +9554,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -9582,7 +9591,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -9598,9 +9607,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); -- cgit From 7226017ad37a888915628e59a84a2d1e57b40707 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 24 Dec 2019 11:54:04 +0000 Subject: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups When a new cgroup is created, the effective uclamp value wasn't updated with a call to cpu_util_update_eff() that looks at the hierarchy and update to the most restrictive values. Fix it by ensuring to call cpu_util_update_eff() when a new cgroup becomes online. Without this change, the newly created cgroup uses the default root_task_group uclamp values, which is 1024 for both uclamp_{min, max}, which will cause the rq to to be clamped to max, hence cause the system to run at max frequency. The problem was observed on Ubuntu server and was reproduced on Debian and Buildroot rootfs. By default, Ubuntu and Debian create a cpu controller cgroup hierarchy and add all tasks to it - which creates enough noise to keep the rq uclamp value at max most of the time. Imitating this behavior makes the problem visible in Buildroot too which otherwise looks fine since it's a minimal userspace. Fixes: 0b60ba2dd342 ("sched/uclamp: Propagate parent clamps") Reported-by: Doug Smythies Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Doug Smythies Link: https://lore.kernel.org/lkml/000701d5b965$361b6c60$a2524520$@net/ --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7b08d52db93..d0270b14c132 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7099,6 +7099,12 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) if (parent) sched_online_group(tg, parent); + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* Propagate the effective uclamp value for the new group */ + cpu_util_update_eff(css); +#endif + return 0; } -- cgit From dcd6dffb0a75741471297724640733fa4e958d72 Mon Sep 17 00:00:00 2001 From: Li Guanglei Date: Wed, 25 Dec 2019 15:44:04 +0800 Subject: sched/core: Fix size of rq::uclamp initialization rq::uclamp is an array of struct uclamp_rq, make sure we clear the whole thing. Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcountinga") Signed-off-by: Li Guanglei Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Link: https://lkml.kernel.org/r/1577259844-12677-1-git-send-email-guangleix.li@gmail.com --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0270b14c132..fc1dfc007604 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1253,7 +1253,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + memset(&cpu_rq(cpu)->uclamp, 0, + sizeof(struct uclamp_rq)*UCLAMP_CNT); cpu_rq(cpu)->uclamp_flags = 0; } -- cgit From 02d4ac5885a18d326b500b94808f0956dcce2832 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 26 Dec 2019 16:52:24 +0800 Subject: sched/debug: Reset watchdog on all CPUs while processing sysrq-t Lengthy output of sysrq-t may take a lot of time on slow serial console with lots of processes and CPUs. So we need to reset NMI-watchdog to avoid spurious lockup messages, and we also reset softlockup watchdogs on all other CPUs since another CPU might be blocked waiting for us to process an IPI or stop_machine. Add to sysrq_sched_debug_show() as what we did in show_state_filter(). Signed-off-by: Wei Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20191226085224.48942-1-liwei391@huawei.com --- kernel/sched/debug.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..879d3ccf3806 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -751,9 +751,16 @@ void sysrq_sched_debug_show(void) int cpu; sched_debug_header(NULL); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + /* + * Need to reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI or stop_machine. + */ + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); print_cpu(NULL, cpu); - + } } /* -- cgit From 35f4cd96f5551dc1b2641159e7bb7bf91de6600f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 28 Dec 2019 16:19:12 +0000 Subject: stop_machine: Make stop_cpus() static The function stop_cpus() is only used internally by the stop_machine for stop multiple cpus. Make it static. Signed-off-by: Yangtao Li Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191228161912.24082-1-tiny.windzz@gmail.com --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5d68ec4c4015..865bb0228ab6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -442,7 +442,7 @@ static int __stop_cpus(const struct cpumask *cpumask, * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. */ -int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +static int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; -- cgit From 9dec1b6949ae9509cdc3edb2d75fda39c9db9fa2 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 2 Jan 2020 18:07:52 +0800 Subject: sched/cputime: move rq parameter in irqtime_account_process_tick Every time we call irqtime_account_process_tick() is in a interrupt, Every caller will get and assign a parameter rq = this_rq(), This is unnecessary and increase the code size a little bit. Move the rq getting action to irqtime_account_process_tick internally is better. base with this patch cputime.o 578792 bytes 577888 bytes Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1577959674-255537-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/sched/cputime.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d43318a489f2..cff3e656566d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -355,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int ticks) + int ticks) { u64 other, cputime = TICK_NSEC * ticks; @@ -381,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); - } else if (p == rq->idle) { + } else if (p == this_rq()->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); @@ -392,14 +392,12 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, static void irqtime_account_idle_ticks(int ticks) { - struct rq *rq = this_rq(); - - irqtime_account_process_tick(current, 0, rq, ticks); + irqtime_account_process_tick(current, 0, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq, int nr_ticks) { } + int nr_ticks) { } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -473,13 +471,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) void account_process_tick(struct task_struct *p, int user_tick) { u64 cputime, steal; - struct rq *rq = this_rq(); if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq, 1); + irqtime_account_process_tick(p, user_tick, 1); return; } @@ -493,7 +490,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, cputime); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); -- cgit From fe71bbb21ee14160f73f81b113d71145327a1c0d Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Fri, 3 Jan 2020 19:44:00 +0800 Subject: sched/fair: calculate delta runnable load only when it's needed Move the code of calculation for delta_sum/delta_avg to where it is really needed to be done. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200103114400.17668-1-rocking@linux.alibaba.com --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d292883694b7..32c5421b6a25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { + delta_sum = runnable_load_sum - + se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } + + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) -- cgit From 4c58f57fa6e93318a0899f70d8b99fe6bac22ce8 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Sat, 4 Jan 2020 21:08:28 +0800 Subject: sched/fair: Fix sgc->{min,max}_capacity calculation for SD_OVERLAP commit bf475ce0a3dd ("sched/fair: Add per-CPU min capacity to sched_group_capacity") introduced per-cpu min_capacity. commit e3d6d0cb66f2 ("sched/fair: Add sched_group per-CPU max capacity") introduced per-cpu max_capacity. In the SD_OVERLAP case, the local variable 'capacity' represents the sum of CPU capacity of all CPUs in the first sched group (sg) of the sched domain (sd). It is erroneously used to calculate sg's min and max CPU capacity. To fix this use capacity_of(cpu) instead of 'capacity'. The code which achieves this via cpu_rq(cpu)->sd->groups->sgc->capacity (for rq->sd != NULL) can be removed since it delivers the same value as capacity_of(cpu) which is currently only used for the (!rq->sd) case (see update_cpu_capacity()). An sg of the lowest sd (rq->sd or sd->child == NULL) represents a single CPU (and hence sg->sgc->capacity == capacity_of(cpu)). Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200104130828.GA7718@iZj6chx1xj0e0buvshuecpZ --- kernel/sched/fair.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 32c5421b6a25..e84723c5c661 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7802,29 +7802,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); + unsigned long cpu_cap = capacity_of(cpu); - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. - */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } - - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* -- cgit From 3d817689a62cf71bbb290af18cd26cf9764f38fe Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 18 Dec 2019 20:38:18 +0800 Subject: sched/psi: create /proc/pressure and /proc/pressure/{io|memory|cpu} only when psi enabled when CONFIG_PSI_DEFAULT_DISABLED set to N or the command line set psi=0, I think we should not create /proc/pressure and /proc/pressure/{io|memory|cpu}. In the future, user maybe determine whether the psi feature is enabled by checking the existence of the /proc/pressure dir or /proc/pressure/{io|memory|cpu} files. Signed-off-by: Wang Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/1576672698-32504-1-git-send-email-w@laoqinren.net --- kernel/sched/psi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ce8f6748678a..db7b50bba3f1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1280,10 +1280,12 @@ static const struct file_operations psi_cpu_fops = { static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + } return 0; } module_init(psi_proc_init); -- cgit From a4f9a0e51bbf89cb461b1985a1a570e6b87da3b5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 15 Jan 2020 11:20:20 +0100 Subject: sched/fair: Remove redundant call to cpufreq_update_util() With commit bef69dd87828 ("sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util()") update_load_avg() has become the central point for calling cpufreq (not including the update of blocked load). This change helps to simplify further the number of calls to cpufreq_update_util() and to remove last redundant ones. With update_load_avg(), we are now sure that cpufreq_update_util() will be called after every task attachment to a cfs_rq and especially after propagating this event down to the util_avg of the root cfs_rq, which is the level that is used by cpufreq governors like schedutil to set the frequency of a CPU. The SCHED_CPUFREQ_MIGRATION flag forces an early call to cpufreq when the migration happens in a cgroup whereas util_avg of root cfs_rq is not yet updated and this call is duplicated with the one that happens immediately after when the migration event reaches the root cfs_rq. The dedicated flag SCHED_CPUFREQ_MIGRATION is now useless and can be removed. The interface of attach_entity_load_avg() can also be simplified accordingly. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/1579083620-24943-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e84723c5c661..ebf50955fe8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p) * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the @@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3521,7 +3521,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; @@ -3557,7 +3557,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); trace_pelt_cfs_tp(cfs_rq); } @@ -3615,7 +3615,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * * IOW we're enqueueing a task on a new CPU. */ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); } else if (decayed) { @@ -3872,7 +3872,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} @@ -10436,7 +10436,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); + attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); } -- cgit From 3e0de271fff77abb933f1b69c213854c3eda9125 Mon Sep 17 00:00:00 2001 From: Hewenliang Date: Thu, 9 Jan 2020 21:56:04 -0500 Subject: idle: fix spelling mistake "iterrupts" -> "interrupts" There is a spelling misake in comments of cpuidle_idle_call. Fix it. Signed-off-by: Hewenliang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200110025604.34373-1-hewenliang4@huawei.com --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ffa959e91227..b743bf38f08f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) /* * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only - * activity happens here and in iterrupts (if any). In that case bypass + * activity happens here and in interrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle -- cgit From ccf74128d66ce937876184ad55db2e0276af08d3 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 15 Jan 2020 16:09:15 +0000 Subject: sched/topology: Assert non-NUMA topology masks don't (partially) overlap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit topology.c::get_group() relies on the assumption that non-NUMA domains do not partially overlap. Zeng Tao pointed out in [1] that such topology descriptions, while completely bogus, can end up being exposed to the scheduler. In his example (8 CPUs, 2-node system), we end up with: MC span for CPU3 == 3-7 MC span for CPU4 == 4-7 The first pass through get_group(3, sdd@MC) will result in the following sched_group list: 3 -> 4 -> 5 -> 6 -> 7 ^ / `----------------' And a later pass through get_group(4, sdd@MC) will "corrupt" that to: 3 -> 4 -> 5 -> 6 -> 7 ^ / `-----------' which will completely break things like 'while (sg != sd->groups)' when using CPU3's base sched_domain. There already are some architecture-specific checks in place such as x86/kernel/smpboot.c::topology.sane(), but this is something we can detect in the core scheduler, so it seems worthwhile to do so. Warn and abort the construction of the sched domains if such a broken topology description is detected. Note that this is somewhat expensive (O(t.c²), 't' non-NUMA topology levels and 'c' CPUs) and could be gated under SCHED_DEBUG if deemed necessary. Testing ======= Dietmar managed to reproduce this using the following qemu incantation: $ qemu-system-aarch64 -kernel ./Image -hda ./qemu-image-aarch64.img \ -append 'root=/dev/vda console=ttyAMA0 loglevel=8 sched_debug' -smp \ cores=8 --nographic -m 512 -cpu cortex-a53 -machine virt -numa \ node,cpus=0-2,nodeid=0 -numa node,cpus=3-7,nodeid=1 alongside the following drivers/base/arch_topology.c hack (AIUI wouldn't be needed if '-smp cores=X, sockets=Y' would work with qemu): 8<--- @@ -465,6 +465,9 @@ void update_siblings_masks(unsigned int cpuid) if (cpuid_topo->package_id != cpu_topo->package_id) continue; + if ((cpu < 4 && cpuid > 3) || (cpu > 3 && cpuid < 4)) + continue; + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); 8<--- [1]: https://lkml.kernel.org/r/1577088979-8545-1-git-send-email-prime.zeng@hisilicon.com Reported-by: Zeng Tao Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200115160915.22575-1-valentin.schneider@arm.com --- kernel/sched/topology.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6ec1e595b1d4..dfb64c08a407 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1879,6 +1879,42 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for + * any two given CPUs at this (non-NUMA) topology level. + */ +static bool topology_span_sane(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, int cpu) +{ + int i; + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + return true; + + /* + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. + */ + for_each_cpu(i, cpu_map) { + if (i == cpu) + continue; + /* + * We should 'and' all those masks with 'cpu_map' to exactly + * match the topology we're about to build, but that can only + * remove CPUs, which only lessens our ability to detect + * overlaps + */ + if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && + cpumask_intersects(tl->mask(cpu), tl->mask(i))) + return false; + } + + return true; +} + /* * Find the sched_domain_topology_level where all CPU capacities are visible * for all CPUs. @@ -1975,6 +2011,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att has_asym = true; } + if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) + goto error; + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); if (tl == sched_domain_topology) -- cgit From a030f9767da1a6bbcec840fc54770eb11c2414b6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 11 Dec 2019 16:31:39 -0500 Subject: locking/lockdep: Fix lockdep_stats indentation problem It was found that two lines in the output of /proc/lockdep_stats have indentation problem: # cat /proc/lockdep_stats : in-process chains: 25057 stack-trace entries: 137827 [max: 524288] number of stack traces: 7973 number of stack hash chains: 6355 combined max dependencies: 1356414598 hardirq-safe locks: 57 hardirq-unsafe locks: 1286 : All the numbers displayed in /proc/lockdep_stats except the two stack trace numbers are formatted with a field with of 11. To properly align all the numbers, a field width of 11 is now added to the two stack trace numbers. Fixes: 8c779229d0f4 ("locking/lockdep: Report more stack trace statistics") Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Link: https://lkml.kernel.org/r/20191211213139.29934-1-longman@redhat.com --- kernel/locking/lockdep_proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index dadb7b7fba37..9bb6d2497b04 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -286,9 +286,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - seq_printf(m, " number of stack traces: %llu\n", + seq_printf(m, " number of stack traces: %11llu\n", lockdep_stack_trace_count()); - seq_printf(m, " number of stack hash chains: %llu\n", + seq_printf(m, " number of stack hash chains: %11llu\n", lockdep_stack_hash_count()); #endif seq_printf(m, " combined max dependencies: %11u\n", -- cgit From 57097124cbbd310cc2b5884189e22e60a3c20514 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 7 Jan 2020 12:49:14 -0500 Subject: locking/qspinlock: Fix inaccessible URL of MCS lock paper It turns out that the URL of the MCS lock paper listed in the source code is no longer accessible. I did got question about where the paper was. This patch updates the URL to BZ 206115 which contains a copy of the paper from https://www.cs.rochester.edu/u/scott/papers/1991_TOCS_synch.pdf Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20200107174914.4187-1-longman@redhat.com --- kernel/locking/qspinlock.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2473f10c6956..b9515fcc9b29 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -31,14 +31,15 @@ /* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the - * MCS lock. The paper below provides a good description for this kind - * of lock. + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at * - * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 * - * This queued spinlock implementation is based on the MCS lock, however to make - * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing - * API, we must modify it somehow. + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. * * In particular; where the traditional MCS lock consists of a tail pointer * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to -- cgit From f5bfdc8e3947a7ae489cf8ae9cfd6b3fb357b952 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 13 Jan 2020 10:07:35 -0500 Subject: locking/osq: Use optimized spinning loop for arm64 Arm64 has a more optimized spinning loop (atomic_cond_read_acquire) using wfe for spinlock that can boost performance of sibling threads by putting the current cpu to a wait state that is broken only when the monitored variable changes or an external event happens. OSQ has a more complicated spinning loop. Besides the lock value, it also checks for need_resched() and vcpu_is_preempted(). The check for need_resched() is not a problem as it is only set by the tick interrupt handler. That will be detected by the spinning cpu right after iret. The vcpu_is_preempted() check, however, is a problem as changes to the preempt state of of previous node will not affect the wait state. For ARM64, vcpu_is_preempted is not currently defined and so is a no-op. Will has indicated that he is planning to para-virtualize wfe instead of defining vcpu_is_preempted for PV support. So just add a comment in arch/arm64/include/asm/spinlock.h to indicate that vcpu_is_preempted() should not be defined as suggested. On a 2-socket 56-core 224-thread ARM64 system, a kernel mutex locking microbenchmark was run for 10s with and without the patch. The performance numbers before patch were: Running locktest with mutex [runtime = 10s, load = 1] Threads = 224, Min/Mean/Max = 316/123,143/2,121,269 Threads = 224, Total Rate = 2,757 kop/s; Percpu Rate = 12 kop/s After patch, the numbers were: Running locktest with mutex [runtime = 10s, load = 1] Threads = 224, Min/Mean/Max = 334/147,836/1,304,787 Threads = 224, Total Rate = 3,311 kop/s; Percpu Rate = 15 kop/s So there was about 20% performance improvement. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20200113150735.21956-1-longman@redhat.com --- kernel/locking/osq_lock.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 6ef600aa0f47..1f7734949ac8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -134,20 +134,17 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!READ_ONCE(node->locked)) { - /* - * If we need to reschedule bail... so we can block. - * Use vcpu_is_preempted() to avoid waiting for a preempted - * lock holder: - */ - if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) - goto unqueue; - - cpu_relax(); - } - return true; + /* + * Wait to acquire the lock or cancelation. Note that need_resched() + * will come with an IPI, which will wake smp_cond_load_relaxed() if it + * is implemented with a monitor-wait. vcpu_is_preempted() relies on + * polling, be careful. + */ + if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() || + vcpu_is_preempted(node_cpu(node->prev)))) + return true; -unqueue: + /* unqueue */ /* * Step - A -- stabilize @prev * -- cgit From 87c9366e17259040a9118e06b6dc8de986e5d3d1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 4 Dec 2019 17:43:46 +0100 Subject: Revert "um: Enable CONFIG_CONSTRUCTORS" This reverts commit 786b2384bf1c ("um: Enable CONFIG_CONSTRUCTORS"). There are two issues with this commit, uncovered by Anton in tests on some (Debian) systems: 1) I completely forgot to call any constructors if CONFIG_CONSTRUCTORS isn't set. Don't recall now if it just wasn't needed on my system, or if I never tested this case. 2) With that fixed, it works - with CONFIG_CONSTRUCTORS *unset*. If I set CONFIG_CONSTRUCTORS, it fails again, which isn't totally unexpected since whatever wanted to run is likely to have to run before the kernel init etc. that calls the constructors in this case. Basically, some constructors that gcc emits (libc has?) need to run very early during init; the failure mode otherwise was that the ptrace fork test already failed: ---------------------- $ ./linux mem=512M Core dump limits : soft - 0 hard - NONE Checking that ptrace can change system call numbers...check_ptrace : child exited with exitcode 6, while expecting 0; status 0x67f Aborted ---------------------- Thinking more about this, it's clear that we simply cannot support CONFIG_CONSTRUCTORS in UML. All the cases we need now (gcov, kasan) involve not use of the __attribute__((constructor)), but instead some constructor code/entry generated by gcc. Therefore, we cannot distinguish between kernel constructors and system constructors. Thus, revert this commit. Cc: stable@vger.kernel.org [5.4+] Fixes: 786b2384bf1c ("um: Enable CONFIG_CONSTRUCTORS") Reported-by: Anton Ivanov Signed-off-by: Johannes Berg Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 060e8e726755..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS + select CONSTRUCTORS if !UML default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit From afa70d941f663c69c9a64ec1021bbcfa82f0e54a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 20 Jan 2020 11:29:05 +0530 Subject: sched/fair: Define sched_idle_cpu() only for SMP configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sched_idle_cpu() isn't used for non SMP configuration and with a recent change, we have started getting following warning: kernel/sched/fair.c:5221:12: warning: ‘sched_idle_cpu’ defined but not used [-Wunused-function] Fix that by defining sched_idle_cpu() only for SMP configurations. Fixes: 323af6deaf70 ("sched/fair: Load balance aggressively for SCHED_IDLE CPUs") Reported-by: Stephen Rothwell Signed-off-by: Viresh Kumar Signed-off-by: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/f0554f590687478b33914a4aff9f0e6a62886d44.1579499907.git.viresh.kumar@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebf50955fe8a..fe4e0d775375 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5218,10 +5218,12 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } +#endif /* * The enqueue_task method is called before nr_running is -- cgit From 2e3a94aa2bfc6de95a0700f0a868c6f5db3a9592 Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Sun, 19 Jan 2020 11:40:40 -0800 Subject: bpf: Fix memory leaks in generic update/delete batch ops Generic update/delete batch ops functions were using __bpf_copy_key without properly freeing the memory. Handle the memory allocation and copy_from_user separately. Fixes: aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Reported-by: Dan Carpenter Signed-off-by: Brian Vazquez Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200119194040.128369-1-brianvv@google.com --- kernel/bpf/syscall.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c26a71460f02..9a840c57f6df 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,12 +1239,15 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; + key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!key) + return -ENOMEM; + for (cp = 0; cp < max_count; cp++) { - key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); + err = -EFAULT; + if (copy_from_user(key, keys + cp * map->key_size, + map->key_size)) break; - } if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); @@ -1264,6 +1267,8 @@ int generic_map_delete_batch(struct bpf_map *map, } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; + + kfree(key); return err; } @@ -1294,18 +1299,21 @@ int generic_map_update_batch(struct bpf_map *map, if (!max_count) return 0; + key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + if (!key) + return -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); - if (!value) + if (!value) { + kfree(key); return -ENOMEM; + } for (cp = 0; cp < max_count; cp++) { - key = __bpf_copy_key(keys + cp * map->key_size, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); - break; - } err = -EFAULT; - if (copy_from_user(value, values + cp * value_size, value_size)) + if (copy_from_user(key, keys + cp * map->key_size, + map->key_size) || + copy_from_user(value, values + cp * value_size, value_size)) break; err = bpf_map_update_value(map, f, key, value, -- cgit From b87121dd3fa0a4d2636c0ad02a3b3fc5161fa26b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Jan 2020 23:28:58 +0000 Subject: bpf: don't bother with getname/kern_path - use user_path_at kernel/bpf/inode.c misuses kern_path...() - it's much simpler (and more efficient, on top of that) to use user_path...() counterparts rather than bothering with doing getname() manually. Signed-off-by: Al Viro Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200120232858.GF8904@ZenIV.linux.org.uk --- kernel/bpf/inode.c | 43 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index ecf42bec38c0..e11059b99f18 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -380,7 +380,7 @@ static const struct inode_operations bpf_dir_iops = { .unlink = simple_unlink, }; -static int bpf_obj_do_pin(const struct filename *pathname, void *raw, +static int bpf_obj_do_pin(const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; @@ -389,7 +389,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, umode_t mode; int ret; - dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + dentry = user_path_create(AT_FDCWD, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -422,30 +422,22 @@ out: int bpf_obj_pin_user(u32 ufd, const char __user *pathname) { - struct filename *pname; enum bpf_type type; void *raw; int ret; - pname = getname(pathname); - if (IS_ERR(pname)) - return PTR_ERR(pname); - raw = bpf_fd_probe_obj(ufd, &type); - if (IS_ERR(raw)) { - ret = PTR_ERR(raw); - goto out; - } + if (IS_ERR(raw)) + return PTR_ERR(raw); - ret = bpf_obj_do_pin(pname, raw, type); + ret = bpf_obj_do_pin(pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); -out: - putname(pname); + return ret; } -static void *bpf_obj_do_get(const struct filename *pathname, +static void *bpf_obj_do_get(const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; @@ -453,7 +445,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, void *raw; int ret; - ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); @@ -480,36 +472,27 @@ out: int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; - struct filename *pname; - int ret = -ENOENT; int f_flags; void *raw; + int ret; f_flags = bpf_get_file_flag(flags); if (f_flags < 0) return f_flags; - pname = getname(pathname); - if (IS_ERR(pname)) - return PTR_ERR(pname); - - raw = bpf_obj_do_get(pname, &type, f_flags); - if (IS_ERR(raw)) { - ret = PTR_ERR(raw); - goto out; - } + raw = bpf_obj_do_get(pathname, &type, f_flags); + if (IS_ERR(raw)) + return PTR_ERR(raw); if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else - goto out; + return -ENOENT; if (ret < 0) bpf_any_put(raw, type); -out: - putname(pname); return ret; } -- cgit From 05d57f1793fb250c85028c9952c3720010baa853 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jan 2020 19:22:31 -0800 Subject: bpf: Fix trampoline usage in preempt Though the second half of trampoline page is unused a task could be preempted in the middle of the first half of trampoline and two updates to trampoline would change the code from underneath the preempted task. Hence wait for tasks to voluntarily schedule or go to userspace. Add similar wait before freeing the trampoline. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200121032231.3292185-1-ast@kernel.org --- kernel/bpf/trampoline.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 79a04417050d..7657ede7aee2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -160,6 +160,14 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + /* Though the second half of trampoline page is unused a task could be + * preempted in the middle of the first half of trampoline and two + * updates to trampoline would change the code from underneath the + * preempted task. Hence wait for tasks to voluntarily schedule or go + * to userspace. + */ + synchronize_rcu_tasks(); + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, fentry, fentry_cnt, @@ -251,6 +259,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + /* wait for tasks to get out of trampoline before freeing it */ + synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); -- cgit From f59bbfc2f6099e8655f9e8f585e10ffde17176d0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 Jan 2020 18:41:38 -0800 Subject: bpf: Fix error path under memory pressure Restore the 'if (env->cur_state)' check that was incorrectly removed during code move. Under memory pressure env->cur_state can be freed and zeroed inside do_check(). Hence the check is necessary. Fixes: 51c39bb1d5d1 ("bpf: Introduce function-by-function verification") Reported-by: syzbot+b296579ba5015704d9fa@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200122024138.3385590-1-ast@kernel.org --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca17dccc17ba..6defbec9eb62 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9594,8 +9594,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) ret = do_check(env); out: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; + /* check for NULL is necessary, since cur_state can be freed inside + * do_check() under memory pressure. + */ + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret) -- cgit From be8704ff07d2374bcc5c675526f95e70c6459683 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jan 2020 16:53:46 -0800 Subject: bpf: Introduce dynamic program extensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce dynamic program extensions. The users can load additional BPF functions and replace global functions in previously loaded BPF programs while these programs are executing. Global functions are verified individually by the verifier based on their types only. Hence the global function in the new program which types match older function can safely replace that corresponding function. This new function/program is called 'an extension' of old program. At load time the verifier uses (attach_prog_fd, attach_btf_id) pair to identify the function to be replaced. The BPF program type is derived from the target program into extension program. Technically bpf_verifier_ops is copied from target program. The BPF_PROG_TYPE_EXT program type is a placeholder. It has empty verifier_ops. The extension program can call the same bpf helper functions as target program. Single BPF_PROG_TYPE_EXT type is used to extend XDP, SKB and all other program types. The verifier allows only one level of replacement. Meaning that the extension program cannot recursively extend an extension. That also means that the maximum stack size is increasing from 512 to 1024 bytes and maximum function nesting level from 8 to 16. The programs don't always consume that much. The stack usage is determined by the number of on-stack variables used by the program. The verifier could have enforced 512 limit for combined original plus extension program, but it makes for difficult user experience. The main use case for extensions is to provide generic mechanism to plug external programs into policy program or function call chaining. BPF trampoline is used to track both fentry/fexit and program extensions because both are using the same nop slot at the beginning of every BPF function. Attaching fentry/fexit to a function that was replaced is not allowed. The opposite is true as well. Replacing a function that currently being analyzed with fentry/fexit is not allowed. The executable page allocated by BPF trampoline is not used by program extensions. This inefficiency will be optimized in future patches. Function by function verification of global function supports scalars and pointer to context only. Hence program extensions are supported for such class of global functions only. In the future the verifier will be extended with support to pointers to structures, arrays with sizes, etc. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200121005348.2769920-2-ast@kernel.org --- kernel/bpf/btf.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 15 ++++- kernel/bpf/trampoline.c | 41 ++++++++++++- kernel/bpf/verifier.c | 85 ++++++++++++++++++++------- 4 files changed, 266 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 832b5d7fd892..32963b6d5a9c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -276,6 +276,11 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", }; +static const char *btf_type_str(const struct btf_type *t) +{ + return btf_kind_str[BTF_INFO_KIND(t->info)]; +} + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -4115,6 +4120,148 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } +/* Compare BTFs of two functions assuming only scalars and pointers to context. + * t1 points to BTF_KIND_FUNC in btf1 + * t2 points to BTF_KIND_FUNC in btf2 + * Returns: + * EINVAL - function prototype mismatch + * EFAULT - verifier bug + * 0 - 99% match. The last 1% is validated by the verifier. + */ +int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) +{ + const struct btf_param *args1, *args2; + const char *fn1, *fn2, *s1, *s2; + u32 nargs1, nargs2, i; + + fn1 = btf_name_by_offset(btf1, t1->name_off); + fn2 = btf_name_by_offset(btf2, t2->name_off); + + if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn1); + return -EINVAL; + } + if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) { + bpf_log(log, "%s() is not a global function\n", fn2); + return -EINVAL; + } + + t1 = btf_type_by_id(btf1, t1->type); + if (!t1 || !btf_type_is_func_proto(t1)) + return -EFAULT; + t2 = btf_type_by_id(btf2, t2->type); + if (!t2 || !btf_type_is_func_proto(t2)) + return -EFAULT; + + args1 = (const struct btf_param *)(t1 + 1); + nargs1 = btf_type_vlen(t1); + args2 = (const struct btf_param *)(t2 + 1); + nargs2 = btf_type_vlen(t2); + + if (nargs1 != nargs2) { + bpf_log(log, "%s() has %d args while %s() has %d args\n", + fn1, nargs1, fn2, nargs2); + return -EINVAL; + } + + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (t1->info != t2->info) { + bpf_log(log, + "Return type %s of %s() doesn't match type %s of %s()\n", + btf_type_str(t1), fn1, + btf_type_str(t2), fn2); + return -EINVAL; + } + + for (i = 0; i < nargs1; i++) { + t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL); + t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL); + + if (t1->info != t2->info) { + bpf_log(log, "arg%d in %s() is %s while %s() has %s\n", + i, fn1, btf_type_str(t1), + fn2, btf_type_str(t2)); + return -EINVAL; + } + if (btf_type_has_size(t1) && t1->size != t2->size) { + bpf_log(log, + "arg%d in %s() has size %d while %s() has %d\n", + i, fn1, t1->size, + fn2, t2->size); + return -EINVAL; + } + + /* global functions are validated with scalars and pointers + * to context only. And only global functions can be replaced. + * Hence type check only those types. + */ + if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + continue; + if (!btf_type_is_ptr(t1)) { + bpf_log(log, + "arg%d in %s() has unrecognized type\n", + i, fn1); + return -EINVAL; + } + t1 = btf_type_skip_modifiers(btf1, t1->type, NULL); + t2 = btf_type_skip_modifiers(btf2, t2->type, NULL); + if (!btf_type_is_struct(t1)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn1); + return -EINVAL; + } + if (!btf_type_is_struct(t2)) { + bpf_log(log, + "arg%d in %s() is not a pointer to context\n", + i, fn2); + return -EINVAL; + } + /* This is an optional check to make program writing easier. + * Compare names of structs and report an error to the user. + * btf_prepare_func_args() already checked that t2 struct + * is a context type. btf_prepare_func_args() will check + * later that t1 struct is a context type as well. + */ + s1 = btf_name_by_offset(btf1, t1->name_off); + s2 = btf_name_by_offset(btf2, t2->name_off); + if (strcmp(s1, s2)) { + bpf_log(log, + "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n", + i, fn1, s1, fn2, s2); + return -EINVAL; + } + } + return 0; +} + +/* Compare BTFs of given program with BTF of target program */ +int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, + struct btf *btf2, const struct btf_type *t2) +{ + struct btf *btf1 = prog->aux->btf; + const struct btf_type *t1; + u32 btf_id = 0; + + if (!prog->aux->func_info) { + bpf_log(&env->log, "Program extension requires BTF\n"); + return -EINVAL; + } + + btf_id = prog->aux->func_info[0].type_id; + if (!btf_id) + return -EFAULT; + + t1 = btf_type_by_id(btf1, btf_id); + if (!t1 || !btf_type_is_func(t1)) + return -EFAULT; + + return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); +} + /* Compare BTF of a function with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. @@ -4224,6 +4371,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; + enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t; @@ -4261,6 +4409,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Verifier bug in function %s()\n", tname); return -EFAULT; } + if (prog_type == BPF_PROG_TYPE_EXT) + prog_type = prog->aux->linked_prog->type; t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { @@ -4296,7 +4446,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, continue; } if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { + btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { reg[i + 1].type = PTR_TO_CTX; continue; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a840c57f6df..a91ad518c050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1932,13 +1932,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_EXT: break; default: return -EINVAL; } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; switch (prog_type) { @@ -1981,6 +1983,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_EXT: + if (expected_attach_type) + return -EINVAL; + /* fallthrough */ default: return 0; } @@ -2183,7 +2189,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) int tr_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT) { + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } @@ -2250,12 +2257,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_EXT && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_TRACING) { + if (prog->type == BPF_PROG_TYPE_TRACING || + prog->type == BPF_PROG_TYPE_EXT) { if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7657ede7aee2..eb64c245052b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,12 @@ #include #include +/* dummy _ops. The verifier will operate on target program's ops. */ +const struct bpf_verifier_ops bpf_extension_verifier_ops = { +}; +const struct bpf_prog_ops bpf_extension_prog_ops = { +}; + /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) @@ -194,8 +200,10 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; - default: + case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + default: + return BPF_TRAMP_REPLACE; } } @@ -204,12 +212,31 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) enum bpf_tramp_prog_type kind; struct bpf_trampoline *tr; int err = 0; + int cnt; tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); - if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT] - >= BPF_MAX_TRAMP_PROGS) { + if (tr->extension_prog) { + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + err = -EBUSY; + goto out; + } + cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. */ + if (cnt) { + err = -EBUSY; + goto out; + } + tr->extension_prog = prog; + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + prog->bpf_func); + goto out; + } + if (cnt >= BPF_MAX_TRAMP_PROGS) { err = -E2BIG; goto out; } @@ -240,9 +267,17 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog->expected_attach_type); mutex_lock(&tr->mutex); + if (kind == BPF_TRAMP_REPLACE) { + WARN_ON_ONCE(!tr->extension_prog); + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + tr->extension_prog->bpf_func, NULL); + tr->extension_prog = NULL; + goto out; + } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; err = bpf_trampoline_update(prog->aux->trampoline); +out: mutex_unlock(&tr->mutex); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6defbec9eb62..9fe94f64f0ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9564,7 +9564,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) subprog); regs = state->frame[state->curframe]->regs; - if (subprog) { + if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { ret = btf_prepare_func_args(env, subprog, regs); if (ret) goto out; @@ -9742,6 +9742,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; @@ -9757,7 +9758,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); - if (prog->type != BPF_PROG_TYPE_TRACING) + if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) return 0; if (!btf_id) { @@ -9793,8 +9794,59 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; + if (prog_extension) { + if (conservative) { + verbose(env, + "Cannot replace static functions\n"); + return -EINVAL; + } + if (!prog->jit_requested) { + verbose(env, + "Extension programs should be JITed\n"); + return -EINVAL; + } + env->ops = bpf_verifier_ops[tgt_prog->type]; + } + if (!tgt_prog->jited) { + verbose(env, "Can attach to only JITed progs\n"); + return -EINVAL; + } + if (tgt_prog->type == prog->type) { + /* Cannot fentry/fexit another fentry/fexit program. + * Cannot attach program extension to another extension. + * It's ok to attach fentry/fexit to extension program. + */ + verbose(env, "Cannot recursively attach\n"); + return -EINVAL; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING && + prog_extension && + (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + /* Program extensions can extend all program types + * except fentry/fexit. The reason is the following. + * The fentry/fexit programs are used for performance + * analysis, stats and can be attached to any program + * type except themselves. When extension program is + * replacing XDP function it is necessary to allow + * performance analysis of all functions. Both original + * XDP program and its program extension. Hence + * attaching fentry/fexit to BPF_PROG_TYPE_EXT is + * allowed. If extending of fentry/fexit was allowed it + * would be possible to create long call chain + * fentry->extension->fentry->extension beyond + * reasonable stack size. Hence extending fentry is not + * allowed. + */ + verbose(env, "Cannot extend fentry/fexit\n"); + return -EINVAL; + } key = ((u64)aux->id) << 32 | btf_id; } else { + if (prog_extension) { + verbose(env, "Cannot replace kernel functions\n"); + return -EINVAL; + } key = btf_id; } @@ -9832,6 +9884,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + default: + if (!prog_extension) + return -EINVAL; + /* fallthrough */ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -9839,6 +9895,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } + if (prog_extension && + btf_check_type_match(env, prog, btf, t)) + return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; @@ -9862,18 +9921,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (ret < 0) goto out; if (tgt_prog) { - if (!tgt_prog->jited) { - /* for now */ - verbose(env, "Can trace only JITed BPF progs\n"); - ret = -EINVAL; - goto out; - } - if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { - /* prevent cycles */ - verbose(env, "Cannot recursively attach\n"); - ret = -EINVAL; - goto out; - } if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -9895,8 +9942,6 @@ out: if (ret) bpf_trampoline_put(tr); return ret; - default: - return -EINVAL; } } @@ -9966,10 +10011,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; @@ -10006,6 +10047,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; -- cgit From 5576b991e9c1a11d2cc21c4b94fc75ec27603896 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Jan 2020 15:36:46 -0800 Subject: bpf: Add BPF_FUNC_jiffies64 This patch adds a helper to read the 64bit jiffies. It will be used in a later patch to implement the bpf_cubic.c. The helper is inlined for jit_requested and 64 BITS_PER_LONG as the map_gen_lookup(). Other cases could be considered together with map_gen_lookup() if needed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200122233646.903260-1-kafai@fb.com --- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29d47aae0dd1..973a20d49749 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2137,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; +const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cada974c9f4e..d8b7b110a1c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, preempt_enable(); } +BPF_CALL_0(bpf_jiffies64) +{ + return get_jiffies_64(); +} + +const struct bpf_func_proto bpf_jiffies64_proto = { + .func = bpf_jiffies64, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe94f64f0ec..734abaa02123 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9445,6 +9445,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_jiffies64) { + struct bpf_insn ld_jiffies_addr[2] = { + BPF_LD_IMM64(BPF_REG_0, + (unsigned long)&jiffies), + }; + + insn_buf[0] = ld_jiffies_addr[0]; + insn_buf[1] = ld_jiffies_addr[1]; + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, + BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed -- cgit From 485ec2ea9cf556e9c120e07961b7b459d776a115 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 23 Jan 2020 17:34:38 +0530 Subject: bpf, devmap: Pass lockdep expression to RCU lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit head is traversed using hlist_for_each_entry_rcu outside an RCU read-side critical section but under the protection of dtab->index_lock. Hence, add corresponding lockdep expression to silence false-positive lockdep warnings, and harden RCU lists. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Signed-off-by: Amol Grover Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200123120437.26506-1-frextrite@gmail.com --- kernel/bpf/devmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index de630f980282..f3a44f6ca86e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -263,7 +263,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; - hlist_for_each_entry_rcu(dev, head, index_hlist) + hlist_for_each_entry_rcu(dev, head, index_hlist, + lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; -- cgit From a35d16905efc6ad5523d864a5c6efcb1e657e386 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 5 Aug 2019 18:22:27 -0400 Subject: rcu: Add basic support for kfree_rcu() batching Recently a discussion about stability and performance of a system involving a high rate of kfree_rcu() calls surfaced on the list [1] which led to another discussion how to prepare for this situation. This patch adds basic batching support for kfree_rcu(). It is "basic" because we do none of the slab management, dynamic allocation, code moving or any of the other things, some of which previous attempts did [2]. These fancier improvements can be follow-up patches and there are different ideas being discussed in those regards. This is an effort to start simple, and build up from there. In the future, an extension to use kfree_bulk and possibly per-slab batching could be done to further improve performance due to cache-locality and slab-specific bulk free optimizations. By using an array of pointers, the worker thread processing the work would need to read lesser data since it does not need to deal with large rcu_head(s) any longer. Torture tests follow in the next patch and show improvements of around 5x reduction in number of grace periods on a 16 CPU system. More details and test data are in that patch. There is an implication with rcu_barrier() with this patch. Since the kfree_rcu() calls can be batched, and may not be handed yet to the RCU machinery in fact, the monitor may not have even run yet to do the queue_rcu_work(), there seems no easy way of implementing rcu_barrier() to wait for those kfree_rcu()s that are already made. So this means a kfree_rcu() followed by an rcu_barrier() does not imply that memory will be freed once rcu_barrier() returns. Another implication is higher active memory usage (although not run-away..) until the kfree_rcu() flooding ends, in comparison to without batching. More details about this are in the second patch which adds an rcuperf test. Finally, in the near future we will get rid of kfree_rcu() special casing within RCU such as in rcu_do_batch and switch everything to just batching. Currently we don't do that since timer subsystem is not yet up and we cannot schedule the kfree_rcu() monitor as the timer subsystem's lock are not initialized. That would also mean getting rid of kfree_call_rcu_nobatch() entirely. [1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-mst@kernel.org [2] https://lkml.org/lkml/2017/12/19/824 Cc: kernel-team@android.com Cc: kernel-team@lge.com Co-developed-by: Byungchul Park Signed-off-by: Byungchul Park Signed-off-by: Joel Fernandes (Google) [ paulmck: Applied 0day and Paul Walmsley feedback on ->monitor_todo. ] [ paulmck: Make it work during early boot. ] [ paulmck: Add a crude early boot self-test. ] [ paulmck: Style adjustments and experimental docbook structure header. ] Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.9999.1908161931110.32497@viisi.sifive.com/T/#me9956f66cb611b95d26ae92700e1d901f46e8c59 Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/update.c | 10 +++ 2 files changed, 198 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..0af016fdbf19 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2683,19 +2683,187 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (HZ / 50) + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @monitor_todo: Tracks whether a @monitor_work delayed work is pending + * @initialized: The @lock and @rcu_work fields have been initialized + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + struct rcu_work rcu_work; + struct rcu_head *head; + struct rcu_head *head_free; + spinlock_t lock; + struct delayed_work monitor_work; + int monitor_todo; + bool initialized; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); + /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->head_free. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct rcu_head *head, *next; + struct kfree_rcu_cpu *krcp; + + krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + spin_lock_irqsave(&krcp->lock, flags); + head = krcp->head_free; + krcp->head_free = NULL; + spin_unlock_irqrestore(&krcp->lock, flags); + + // List "head" is now private, so traverse locklessly. + for (; head; head = next) { + next = head->next; + // Potentially optimize with kfree_bulk in future. + __rcu_reclaim(rcu_state.name, head); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * Schedule the kfree batch RCU work to run in workqueue context after a GP. + * + * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES + * timeout has been reached. + */ +static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +{ + lockdep_assert_held(&krcp->lock); + + // If a previous RCU batch is in progress, we cannot immediately + // queue another one, so return false to tell caller to retry. + if (krcp->head_free) + return false; + + krcp->head_free = krcp->head; + krcp->head = NULL; + INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krcp->rcu_work); + return true; +} + +static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, + unsigned long flags) +{ + // Attempt to start a new batch. + if (queue_kfree_rcu_work(krcp)) { + // Success! Our job is done here. + spin_unlock_irqrestore(&krcp->lock, flags); + return; + } + + // Previous RCU batch still in progress, try again later. + if (!xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, + monitor_work.work); + + spin_lock_irqsave(&krcp->lock, flags); + if (xchg(&krcp->monitor_todo, false)) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. + * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). + */ +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, 1); } +EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); + +/* + * Queue a request for lazy invocation of kfree() after a grace period. + * + * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch + * will be kfree'd in workqueue context. This allows us to: + * + * 1. Batch requests together to reduce the number of grace periods during + * heavy kfree_rcu() load. + * + * 2. It makes it possible to use kfree_bulk() on a large number of + * kfree_rcu() requests thus reducing cache misses and the per-object + * overhead of kfree(). + */ +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + + head->func = func; + + local_irq_save(flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (krcp->initialized) + spin_lock(&krcp->lock); + + // Queue the object but don't yet schedule the batch. + head->func = func; + head->next = krcp->head; + krcp->head = head; + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + + if (krcp->initialized) + spin_unlock(&krcp->lock); + local_irq_restore(flags); +} EXPORT_SYMBOL_GPL(kfree_call_rcu); +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + unsigned long flags; + + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + spin_unlock_irqrestore(&krcp->lock, flags); + continue; + } + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); + } +} + /* * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPT. @@ -3557,12 +3725,26 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_par_gp_wq; +static void __init kfree_rcu_batch_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_init(&krcp->lock); + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + krcp->initialized = true; + } +} + void __init rcu_init(void) { int cpu; rcu_early_boot_tests(); + kfree_rcu_batch_init(); rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..196487762b96 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -40,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -218,6 +219,7 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; } @@ -853,14 +855,22 @@ static void test_callback(struct rcu_head *r) DEFINE_STATIC_SRCU(early_srcu); +struct early_boot_kfree_rcu { + struct rcu_head rh; +}; + static void early_boot_test_call_rcu(void) { static struct rcu_head head; static struct rcu_head shead; + struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); if (IS_ENABLED(CONFIG_SRCU)) call_srcu(&early_srcu, &shead, test_callback); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (!WARN_ON_ONCE(!rhp)) + kfree_rcu(rhp, rh); } void rcu_early_boot_tests(void) -- cgit From e6e78b004fa7e0ab455d46d27f218bf6ce178a18 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:29 -0400 Subject: rcuperf: Add kfree_rcu() performance Tests This test runs kfree_rcu() in a loop to measure performance of the new kfree_rcu() batching functionality. The following table shows results when booting with arguments: rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000 rcuperf.kfree_rcu_test=1 rcuperf.kfree_no_batch=X rcuperf.kfree_no_batch=X # Grace Periods Test Duration (s) X=1 (old behavior) 9133 11.5 X=0 (new behavior) 1732 12.5 On a 16 CPU system with the above boot parameters, we see that the total number of grace periods that elapse during the test drops from 9133 when not batching to 1732 when batching (a 5X improvement). The kfree_rcu() flood itself slows down a bit when batching, though, as shown. Note that the active memory consumption during the kfree_rcu() flood does increase to around 200-250MB due to the batching (from around 50MB without batching). However, this memory consumption is relatively constant. In other words, the system is able to keep up with the kfree_rcu() load. The memory consumption comes down considerably if KFREE_DRAIN_JIFFIES is increased from HZ/50 to HZ/80. A later patch will reduce memory consumption further by using multiple lists. Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and CONFIG_PROVE_RCU for realistic comparisons with/without batching. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5f884d560384..c1e25fd10f2a 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished; static wait_queue_head_t shutdown_wq; static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_perf_writer_started; -static unsigned long b_rcu_perf_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 @@ -378,10 +379,10 @@ rcu_perf_writer(void *arg) if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; if (gp_exp) { - b_rcu_perf_writer_started = + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = cur_ops->get_gp_seq(); + b_rcu_gp_test_started = cur_ops->get_gp_seq(); } } @@ -429,10 +430,10 @@ retry: PERFOUT_STRING("Test complete"); t_rcu_perf_writer_finished = t; if (gp_exp) { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_finished = + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); } if (shutdown) { @@ -515,8 +516,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - rcuperf_seq_diff(b_rcu_perf_writer_finished, - b_rcu_perf_writer_started)); + rcuperf_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -584,6 +585,167 @@ rcu_perf_shutdown(void *arg) return -EINVAL; } +/* + * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. + */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_perf_thread_started; +static atomic_t n_kfree_perf_thread_ended; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +static int +kfree_perf_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + if (!kfree_no_batch) { + kfree_rcu(alloc_ptr, rh); + } else { + rcu_callback_t cb; + + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); + kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); + } + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started)); + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_perf_thread"); + return 0; +} + +static void +kfree_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_perf_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= + kfree_nrealthreads); + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +kfree_perf_init(void) +{ + long i; + int firsterr = 0; + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. */ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + kfree_reader_tasks[i]); + if (firsterr) + goto unwind; + } + + while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_perf_cleanup(); + return firsterr; +} + static int __init rcu_perf_init(void) { @@ -616,6 +778,9 @@ rcu_perf_init(void) if (cur_ops->init) cur_ops->init(); + if (kfree_rcu_test) + return kfree_perf_init(); + nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); atomic_set(&n_rcu_perf_reader_started, 0); -- cgit From 569d767087ef56a59bc2d1d5f25ee447eeae442a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 22 Sep 2019 10:49:57 -0700 Subject: rcu: Make kfree_rcu() use a non-atomic ->monitor_todo Because the ->monitor_todo field is always protected by krcp->lock, this commit downgrades from xchg() to non-atomic unmarked assignment statements. Signed-off-by: Joel Fernandes [ paulmck: Update to include early-boot kick code. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0af016fdbf19..6106b9e0b5fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2708,7 +2708,7 @@ struct kfree_rcu_cpu { struct rcu_head *head_free; spinlock_t lock; struct delayed_work monitor_work; - int monitor_todo; + bool monitor_todo; bool initialized; }; @@ -2765,6 +2765,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { // Attempt to start a new batch. + krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. spin_unlock_irqrestore(&krcp->lock, flags); @@ -2772,8 +2773,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, } // Previous RCU batch still in progress, try again later. - if (!xchg(&krcp->monitor_todo, true)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } @@ -2788,7 +2789,7 @@ static void kfree_rcu_monitor(struct work_struct *work) monitor_work.work); spin_lock_irqsave(&krcp->lock, flags); - if (xchg(&krcp->monitor_todo, false)) + if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else spin_unlock_irqrestore(&krcp->lock, flags); @@ -2837,8 +2838,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !xchg(&krcp->monitor_todo, true)) + !krcp->monitor_todo) { + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + } if (krcp->initialized) spin_unlock(&krcp->lock); @@ -2855,10 +2858,11 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_irqsave(&krcp->lock, flags); - if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + if (!krcp->head || krcp->monitor_todo) { spin_unlock_irqrestore(&krcp->lock, flags); continue; } + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } -- cgit From 0392bebebf26f09434e6c7ca4c09c014efeef76a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 19 Sep 2019 14:58:26 -0700 Subject: rcu: Add multiple in-flight batches of kfree_rcu() work During testing, it was observed that amount of memory consumed due kfree_rcu() batching is 300-400MB. Previously we had only a single head_free pointer pointing to the list of rcu_head(s) that are to be freed after a grace period. Until this list is drained, we cannot queue any more objects on it since such objects may not be ready to be reclaimed when the worker thread eventually gets to drainin g the head_free list. We can do better by maintaining multiple lists as done by this patch. Testing shows that memory consumption came down by around 100-150MB with just adding another list. Adding more than 1 additional list did not show any improvement. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Code style and initialization handling. ] [ paulmck: Fix field name, reported by kbuild test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6106b9e0b5fb..a40fd58bd4b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2686,12 +2686,25 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_N_BATCHES 2 /** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending @@ -2703,9 +2716,8 @@ EXPORT_SYMBOL_GPL(call_rcu); * the interactions with the slab allocators. */ struct kfree_rcu_cpu { - struct rcu_work rcu_work; struct rcu_head *head; - struct rcu_head *head_free; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; @@ -2723,11 +2735,14 @@ static void kfree_rcu_work(struct work_struct *work) unsigned long flags; struct rcu_head *head, *next; struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; - krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; spin_lock_irqsave(&krcp->lock, flags); - head = krcp->head_free; - krcp->head_free = NULL; + head = krwp->head_free; + krwp->head_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); // List "head" is now private, so traverse locklessly. @@ -2747,17 +2762,25 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + int i; + struct kfree_rcu_cpu_work *krwp = NULL; + lockdep_assert_held(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + if (!krcp->krw_arr[i].head_free) { + krwp = &(krcp->krw_arr[i]); + break; + } // If a previous RCU batch is in progress, we cannot immediately // queue another one, so return false to tell caller to retry. - if (krcp->head_free) + if (!krwp) return false; - krcp->head_free = krcp->head; + krwp->head_free = krcp->head; krcp->head = NULL; - INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krcp->rcu_work); + INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krwp->rcu_work); return true; } @@ -2863,7 +2886,8 @@ void __init kfree_rcu_scheduler_running(void) continue; } krcp->monitor_todo = true; - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_work_on(cpu, &krcp->monitor_work, + KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3732,11 +3756,14 @@ struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { int cpu; + int i; for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + krcp->krw_arr[i].krcp = krcp; INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit From e99637becb2e684bee2b9117f817f4d1346b8353 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Sep 2019 13:03:17 -0700 Subject: rcu: Add support for debug_objects debugging for kfree_rcu() This commit applies RCU's debug_objects debugging to the new batched kfree_rcu() implementations. The object is queued at the kfree_rcu() call and dequeued during reclaim. Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects double kfree_rcu() calls. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix IRQ per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a40fd58bd4b6..0512221cd84b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2749,6 +2749,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; head; head = next) { next = head->next; // Potentially optimize with kfree_bulk in future. + debug_rcu_head_unqueue(head); __rcu_reclaim(rcu_state.name, head); cond_resched_tasks_rcu_qs(); } @@ -2855,6 +2856,12 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(head)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + goto unlock_return; + } head->func = func; head->next = krcp->head; krcp->head = head; @@ -2866,6 +2873,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); } +unlock_return: if (krcp->initialized) spin_unlock(&krcp->lock); local_irq_restore(flags); -- cgit From 77a40f97030b27b3fc1640a3ed203870f0817f57 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:32 -0400 Subject: rcu: Remove kfree_rcu() special casing and lazy-callback handling This commit removes kfree_rcu() special-casing and the lazy-callback handling from Tree RCU. It moves some of this special casing to Tiny RCU, the removal of which will be the subject of later commits. This results in a nice negative delta. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Add slab.h #include, thanks to kbuild test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 27 -------------------------- kernel/rcu/rcu_segcblist.c | 25 +++--------------------- kernel/rcu/rcu_segcblist.h | 25 ++---------------------- kernel/rcu/srcutree.c | 4 ++-- kernel/rcu/tiny.c | 28 ++++++++++++++++++++++++++- kernel/rcu/tree.c | 40 ++++++++++++++++++++++++++------------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 48 +++++++++++----------------------------------- kernel/rcu/tree_stall.h | 6 ++---- 9 files changed, 75 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..c30a1f7dbd15 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -void kfree(const void *); - -/* - * Reclaim the specified callback, either by invoking it (non-lazy case) - * or freeing it directly (lazy case). Return true if lazy, false otherwise. - */ -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) -{ - rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; - - rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback(rn, head, offset); - kfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } else { - trace_rcu_invoke_callback(rn, head); - f = head->func; - WRITE_ONCE(head->func, (rcu_callback_t)0L); - f(head); - rcu_lock_release(&rcu_callback_map); - return false; - } -} - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index cbc87b804db9..5f4fd3b8777c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->head = NULL; rclp->tail = &rclp->head; rclp->len = 0; - rclp->len_lazy = 0; } /* * Enqueue an rcu_head structure onto the specified callback list. - * This function assumes that the callback is non-lazy because it - * is intended for use by no-CBs CPUs, which do not distinguish - * between lazy and non-lazy RCU callbacks. */ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) { @@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, else drclp->tail = &drclp->head; drclp->len = srclp->len; - drclp->len_lazy = srclp->len_lazy; if (!rhp) { rcu_cblist_init(srclp); } else { @@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, srclp->head = rhp; srclp->tail = &rhp->next; WRITE_ONCE(srclp->len, 1); - srclp->len_lazy = 0; } } /* * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. + * list. */ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) { @@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; rcu_segcblist_set_len(rsclp, 0); - rsclp->len_lazy = 0; rsclp->enabled = 1; } @@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); rsclp->enabled = 0; } @@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) * absolutely not OK for it to ever miss posting a callback. */ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); @@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, * period. You have been warned. */ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { int i; if (rcu_segcblist_n_cbs(rsclp) == 0) return false; rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) @@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rclp->len_lazy += rsclp->len_lazy; - rsclp->len_lazy = 0; rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } @@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rsclp->len_lazy += rclp->len_lazy; rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len_lazy = 0; rclp->len = 0; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 815c2fdd3fcc..5c293afc07b8 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, @@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; -} - /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? @@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..d0a9d5b69087 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -853,7 +853,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, local_irq_save(flags); sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -1052,7 +1052,7 @@ void srcu_barrier(struct srcu_struct *ssp) sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) { + &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_barrier_cpu_cnt); } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 477b4eb44af5..dd572ce7c747 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "rcu.h" @@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user) } } +/* + * Reclaim the specified callback, either by invoking it for non-kfree cases or + * freeing it directly (for kfree). Return true if kfreeing, false otherwise. + */ +static inline bool rcu_reclaim_tiny(struct rcu_head *head) +{ + rcu_callback_t f; + unsigned long offset = (unsigned long)head->func; + + rcu_lock_acquire(&rcu_callback_map); + if (__is_kfree_rcu_offset(offset)) { + trace_rcu_invoke_kfree_callback("", head, offset); + kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); + return true; + } + + trace_rcu_invoke_callback("", head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); + rcu_lock_release(&rcu_callback_map); + return false; +} + /* Invoke the RCU callbacks whose grace period has elapsed. */ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { @@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim("", list); + rcu_reclaim_tiny(list); local_bh_enable(); list = next; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0512221cd84b..a8dd612098bf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include "../time/tick-internal.h" @@ -2146,7 +2147,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), @@ -2168,7 +2168,6 @@ static void rcu_do_batch(struct rcu_data *rdp) if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) @@ -2179,9 +2178,19 @@ static void rcu_do_batch(struct rcu_data *rdp) tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. @@ -2583,7 +2592,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct rcu_data *rdp; @@ -2618,18 +2627,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ @@ -2679,7 +2687,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 0); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(call_rcu); @@ -2747,10 +2755,18 @@ static void kfree_rcu_work(struct work_struct *work) // List "head" is now private, so traverse locklessly. for (; head; head = next) { + unsigned long offset = (unsigned long)head->func; + next = head->next; // Potentially optimize with kfree_bulk in future. debug_rcu_head_unqueue(head); - __rcu_reclaim(rcu_state.name, head); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + + /* Could be possible to optimize with kfree_bulk in future */ + kfree((void *)head - offset); + + rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); } } @@ -2825,7 +2841,7 @@ static void kfree_rcu_monitor(struct work_struct *work) */ void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 1); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); @@ -3100,7 +3116,7 @@ static void rcu_barrier_func(void *unused) debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..15405420b40c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -183,7 +183,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..d5334e49ccca 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1262,10 +1262,9 @@ static void rcu_prepare_for_idle(void) /* * This code is invoked when a CPU goes idle, at which point we want * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. + * the energy-efficient dyntick-idle mode. * - * The following three proprocessor symbols control this state machine: + * The following preprocessor symbol controls this: * * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This @@ -1274,21 +1273,15 @@ static void rcu_prepare_for_idle(void) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. * - * The values below work well in practice. If future workloads require + * The value below works well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); /* * Try to advance callbacks on the current CPU, but only if it has been @@ -1327,8 +1320,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. + * caller about what to set the timeout. * * The caller must have disabled interrupts. */ @@ -1354,25 +1346,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) } rdp->last_accelerate = jiffies; - /* Request timer delay depending on laziness, and round. */ - rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); - if (rdp->all_lazy) { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } else { - dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } + /* Request timer and round. */ + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; + *nextevt = basemono + dj * TICK_NSEC; return 0; } /* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. + * Prepare a CPU for idle from an RCU perspective. The first major task is to + * sense whether nohz mode has been enabled or disabled via sysfs. The second + * major task is to accelerate (that is, assign grace-period numbers to) any + * recently arrived callbacks. * * The caller must have disabled interrupts. */ @@ -1398,17 +1383,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { - rdp->all_lazy = false; - invoke_rcu_core(); - return; - } - /* * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..806f2ddc8f74 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!!rdp->tick_nohz_enabled_snap]); + !!rdp->tick_nohz_enabled_snap); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit From 189a6883dcf7fa70e17403ae4225c60ffc9e404b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:33 -0400 Subject: rcu: Remove kfree_call_rcu_nobatch() Now that the kfree_rcu() special-casing has been removed from tree RCU, this commit removes kfree_call_rcu_nobatch() since it is no longer needed. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 10 +--------- kernel/rcu/tree.c | 18 ++++-------------- 2 files changed, 5 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c1e25fd10f2a..da94b89cd531 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -593,7 +593,6 @@ rcu_perf_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); -torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -632,14 +631,7 @@ kfree_perf_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - if (!kfree_no_batch) { - kfree_rcu(alloc_ptr, rh); - } else { - rcu_callback_t cb; - - cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); - kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); - } + kfree_rcu(alloc_ptr, rh); } cond_resched(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a8dd612098bf..31d2d9255d95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2763,8 +2763,10 @@ static void kfree_rcu_work(struct work_struct *work) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - /* Could be possible to optimize with kfree_bulk in future */ - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { + /* Could be optimized with kfree_bulk() in future. */ + kfree((void *)head - offset); + } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2835,16 +2837,6 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } -/* - * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. - * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). - */ -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); - /* * Queue a request for lazy invocation of kfree() after a grace period. * @@ -2864,8 +2856,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unsigned long flags; struct kfree_rcu_cpu *krcp; - head->func = func; - local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) -- cgit From c130d2dc93cd03323494d82dbe7b5fb0d101ab62 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:48 +0000 Subject: rcu: Rename some instance of CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU CONFIG_PREEMPTION and CONFIG_PREEMPT_RCU are always identical, but some code depends on CONFIG_PREEMPTION to access to rcu_preempt functionality. This patch changes CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU in these cases. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_stall.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c9dbb05e4c13..5445da2326a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1934,7 +1934,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2294,7 +2294,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPTION) || + if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..a6652efedd48 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_RCU /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPTION */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPTION */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* * Dump stacks of all tasks running on stalled CPUs. First try using -- cgit From 2eeba5838fd8c5e19bb91e25624116936348e7af Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Nov 2019 04:06:22 -0700 Subject: rcu: Clear .exp_hint only when deferred quiescent state has been reported Currently, the .exp_hint flag is cleared in rcu_read_unlock_special(), which works, but which can also prevent subsequent rcu_read_unlock() calls from helping expedite the quiescent state needed by an ongoing expedited RCU grace period. This commit therefore defers clearing of .exp_hint from rcu_read_unlock_special() to rcu_preempt_deferred_qs_irqrestore(), thus ensuring that intervening calls to rcu_read_unlock() have a chance to help end the expedited grace period. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8cdce111ea73..7487c7930a47 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -444,6 +444,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.exp_hint = false; t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); @@ -610,7 +611,6 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - t->rcu_read_unlock_special.b.exp_hint = false; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); @@ -640,7 +640,6 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); return; } - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } -- cgit From 3717e1e9f25ec7059e421ab6fc602cab7063c11c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Nov 2019 05:06:21 -0700 Subject: rcu: Clear ->rcu_read_unlock_special only once In rcu_preempt_deferred_qs_irqrestore(), ->rcu_read_unlock_special is cleared one piece at a time. Given that the "if" statements in this function use the copy in "special", this commit removes the clearing of the individual pieces in favor of clearing ->rcu_read_unlock_special in one go just after it has been determined to be non-zero. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7487c7930a47..c3a32717c42d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -444,16 +444,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } - t->rcu_read_unlock_special.b.exp_hint = false; - t->rcu_read_unlock_special.b.deferred_qs = false; - if (special.b.need_qs) { + t->rcu_read_unlock_special.s = 0; + if (special.b.need_qs) rcu_qs(); - t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { - local_irq_restore(flags); - return; - } - } /* * Respond to a request by an expedited grace period for a @@ -461,17 +454,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) { + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); - if (!t->rcu_read_unlock_special.s) { - local_irq_restore(flags); - return; - } - } /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { - t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The task -- cgit From c51f83c315c392d9776c33eb16a2fe1349d65c7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 08:22:45 -0800 Subject: rcu: Use READ_ONCE() for ->expmask in rcu_read_unlock_special() The rcu_node structure's ->expmask field is updated only when holding the ->lock, but is also accessed locklessly. This means that all ->expmask updates must use WRITE_ONCE() and all reads carried out without holding ->lock must use READ_ONCE(). This commit therefore changes the lockless ->expmask read in rcu_read_unlock_special() to use READ_ONCE(). Reported-by: syzbot+99f4ddade3c22ab0cf23@syzkaller.appspotmail.com Signed-off-by: Paul E. McKenney Acked-by: Marco Elver --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c3a32717c42d..698a7f195f88 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -599,7 +599,7 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_node *rnp = rdp->mynode; exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & rnp->expmask) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. if (irqs_were_disabled && use_softirq && -- cgit From 77339e61aa309310a535bd01eb3388f7a27b36f9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 15 Nov 2019 14:08:53 -0800 Subject: rcu: Provide wrappers for uses of ->rcu_read_lock_nesting This commit provides wrapper functions for uses of ->rcu_read_lock_nesting to improve readability and to ease future changes to support inlining of __rcu_read_lock() and __rcu_read_unlock(). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 4 ++-- kernel/rcu/tree_plugin.h | 53 +++++++++++++++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 98d078cafa5a..d8da6b1a3209 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -610,7 +610,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!t->rcu_read_lock_nesting) { + if (!rcu_preempt_depth()) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -634,7 +634,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (t->rcu_read_lock_nesting > 0) { + if (rcu_preempt_depth() > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 698a7f195f88..ebdbdec5911f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); - if (t->rcu_read_lock_nesting > 0 && + WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ @@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #define RCU_NEST_NMAX (-INT_MAX / 2) #define RCU_NEST_PMAX (INT_MAX / 2) +static void rcu_preempt_read_enter(void) +{ + current->rcu_read_lock_nesting++; +} + +static void rcu_preempt_read_exit(void) +{ + current->rcu_read_lock_nesting--; +} + +static void rcu_preempt_depth_set(int val) +{ + current->rcu_read_lock_nesting = val; +} + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) */ void __rcu_read_lock(void) { - current->rcu_read_lock_nesting++; + rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) - WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); + WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -373,19 +388,19 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; + if (rcu_preempt_depth() != 1) { + rcu_preempt_read_exit(); } else { barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = -RCU_NEST_BIAS; + rcu_preempt_depth_set(-RCU_NEST_BIAS); barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; + rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - int rrln = t->rcu_read_lock_nesting; + int rrln = rcu_preempt_depth(); WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); } @@ -539,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - t->rcu_read_lock_nesting <= 0; + rcu_preempt_depth() <= 0; } /* @@ -552,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = t->rcu_read_lock_nesting >= 0; + bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; if (couldrecurse) - t->rcu_read_lock_nesting -= RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); if (couldrecurse) - t->rcu_read_lock_nesting += RCU_NEST_BIAS; + rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -672,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user) if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } - if (t->rcu_read_lock_nesting > 0 || + if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ if (rcu_preempt_need_deferred_qs(t)) { @@ -682,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!t->rcu_read_lock_nesting) { + } else if (!rcu_preempt_depth()) { rcu_qs(); /* Report immediate QS. */ return; } /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ - if (t->rcu_read_lock_nesting > 0 && + if (rcu_preempt_depth() > 0 && __this_cpu_read(rcu_data.core_needs_qs) && __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && @@ -709,11 +724,11 @@ void exit_rcu(void) struct task_struct *t = current; if (unlikely(!list_empty(¤t->rcu_node_entry))) { - t->rcu_read_lock_nesting = 1; + rcu_preempt_depth_set(1); barrier(); WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); - } else if (unlikely(t->rcu_read_lock_nesting)) { - t->rcu_read_lock_nesting = 1; + } else if (unlikely(rcu_preempt_depth())) { + rcu_preempt_depth_set(1); } else { return; } -- cgit From 5b14557b073c96a7cf79adc4d7b6c4a8c26b2a43 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Nov 2019 18:05:45 -0800 Subject: rcu: Avoid tick_dep_set_cpu() misordering In the current code, rcu_nmi_enter_common() might decide to turn on the tick using tick_dep_set_cpu(), but be delayed just before doing so. Then the grace-period kthread might notice that the CPU in question had in fact gone through a quiescent state, thus turning off the tick using tick_dep_clear_cpu(). The later invocation of tick_dep_set_cpu() would then incorrectly leave the tick on. This commit therefore enlists the aid of the leaf rcu_node structure's ->lock to ensure that decisions to enable or disable the tick are carried out before they can be reversed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5445da2326a0..b0e0612392a9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -800,8 +800,8 @@ void rcu_user_exit(void) */ static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -828,8 +828,13 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + raw_spin_lock_rcu_node(rdp->mynode); + // Recheck under lock. + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, @@ -898,6 +903,7 @@ void rcu_irq_enter_irqson(void) */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + raw_lockdep_assert_held_rcu_node(rdp->mynode); WRITE_ONCE(rdp->rcu_urgent_qs, false); WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { -- cgit From 822175e72995a9ff7eeef4f5cd3f945f2697b67d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:56 +0000 Subject: rcu: Fix harmless omission of "CONFIG_" from #if condition The C preprocessor macros SRCU and TINY_RCU should instead be CONFIG_SRCU and CONFIG_TINY_RCU, respectively in the #f in kernel/rcu/rcu.h. But there is no harm when "TINY_RCU" is wrongly used, which are always non-defined, which makes "!defined(TINY_RCU)" always true, which means the code block is always included, and the included code block doesn't cause any compilation error so far in CONFIG_TINY_RCU builds. It is also the reason this change should not be taken in -stable. This commit adds the needed "CONFIG_" prefix to both macros. Not for -stable. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..473259422547 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -281,7 +281,7 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); -#if defined(SRCU) || !defined(TINY_RCU) +#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) #include @@ -418,7 +418,7 @@ do { \ #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) -#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ #ifdef CONFIG_SRCU void srcu_init(void); -- cgit From 2488a5e695564897a0f8ea42cd3af71647cd26d2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:57 +0000 Subject: rcu: Fix tracepoint tracking RCU CPU kthread utilization In the call to trace_rcu_utilization() at the start of the loop in rcu_cpu_kthread(), "rcu_wait" is incorrect, plus this trace event needs to be hoisted above the loop to balance with either the "rcu_wait" or "rcu_yield", depending on how the loop exits. This commit therefore makes these changes. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd8cfc34f4da..ba154a3080fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2474,8 +2474,8 @@ static void rcu_cpu_kthread(unsigned int cpu) char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); -- cgit From 4778339df0ee361dbf2cbdcb87abaec9dfbc841d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:46 +0000 Subject: rcu: Remove the declaration of call_rcu() in tree.h The call_rcu() function is an external RCU API that is declared in include/linux/rcupdate.h. There is thus no point in redeclaring it in kernel/rcu/tree.h, so this commit removes that redundant declaration. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4dc5debfc84..54ff9896ae31 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -413,7 +413,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); -void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -- cgit From e2167b38c87a0c9e85c342a823dae1e6f67b11d9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:47 +0000 Subject: rcu: Move gp_state_names[] and gp_state_getname() to tree_stall.h Only tree_stall.h needs to get name from GP state, so this commit moves the gp_state_names[] array and the gp_state_getname() from kernel/rcu/tree.h and kernel/rcu/tree.c, respectively, to kernel/rcu/tree_stall.h. While moving gp_state_names[], this commit uses the GCC syntax to ensure that the right string is associated with the right CPP macro. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ---------- kernel/rcu/tree.h | 12 ------------ kernel/rcu/tree_stall.h | 22 ++++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba154a3080fb..bbb60ed310b1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void) return &rcu_state.node[0]; } -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - /* * Send along grace-period-related data for rcutorture diagnostics. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 54ff9896ae31..9d5986abfc67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -368,18 +368,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -static const char * const gp_state_names[] = { - "RCU_GP_IDLE", - "RCU_GP_WAIT_GPS", - "RCU_GP_DONE_GPS", - "RCU_GP_ONOFF", - "RCU_GP_INIT", - "RCU_GP_WAIT_FQS", - "RCU_GP_DOING_FQS", - "RCU_GP_CLEANUP", - "RCU_GP_CLEANED", -}; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..f18adaf3bf39 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -279,6 +279,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ +static const char * const gp_state_names[] = { + [RCU_GP_IDLE] = "RCU_GP_IDLE", + [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", + [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS", + [RCU_GP_ONOFF] = "RCU_GP_ONOFF", + [RCU_GP_INIT] = "RCU_GP_INIT", + [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS", + [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS", + [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP", + [RCU_GP_CLEANED] = "RCU_GP_CLEANED", +}; + +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Print out diagnostic information for the specified stalled CPU. * -- cgit From e1350e8e0ea5d959c23c5e593ff3026a67dbb049 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 14:48:22 +0100 Subject: rcu: Move rcu_{expedited,normal} definitions into rcupdate.h This commit moves the rcu_{expedited,normal} definitions from kernel/rcu/update.c to include/linux/rcupdate.h to make sure they are in sync, and also to avoid the following warning from sparse: kernel/ksysfs.c:150:5: warning: symbol 'rcu_expedited' was not declared. Should it be static? kernel/ksysfs.c:167:5: warning: symbol 'rcu_normal' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..294d357abd0c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,9 +51,7 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); -extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); -- cgit From 7441e7661d6586ae36329b7956e4d713d81e9903 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 09:37:11 -0700 Subject: rcu: Switch force_qs_rnp() to for_each_leaf_node_cpu_mask() Currently, force_qs_rnp() uses a for_each_leaf_node_possible_cpu() loop containing a check of the current CPU's bit in ->qsmask. This works, but this commit saves three lines by instead using for_each_leaf_node_cpu_mask(), which combines the functionality of for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit(). This commit also replaces the use of the local variable "bit" with rdp->grpmask. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bbb60ed310b1..d95076498488 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2298,14 +2298,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { - mask |= bit; - rcu_disable_urgency_upon_qs(rdp); - } + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } } if (mask != 0) { -- cgit From 844a378de3372c923909681706d62336d702531e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Nov 2019 08:08:30 -0800 Subject: srcu: Apply *_ONCE() to ->srcu_last_gp_end The ->srcu_last_gp_end field is accessed from any CPU at any time by synchronize_srcu(), so non-initialization references need to use READ_ONCE() and WRITE_ONCE(). This commit therefore makes that change. Reported-by: syzbot+08f3e9d26e5541e1ecf2@syzkaller.appspotmail.com Acked-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..21acdff3bd27 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(ssp); - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) @@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long flags; struct srcu_data *sdp; unsigned long t; + unsigned long tlast; /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); @@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); + tlast = READ_ONCE(ssp->srcu_last_gp_end); if (exp_holdoff == 0 || - time_in_range_open(t, ssp->srcu_last_gp_end, - ssp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ -- cgit From f6105fc2a9c0ea5be6b9e5c19b2551af0b8b4eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 11:36:07 -0800 Subject: rcu: Remove unused stop-machine #include Long ago, RCU used the stop-machine mechanism to implement expedited grace periods, but no longer does so. This commit therefore removes the no-longer-needed #includes of linux/stop_machine.h. Link: https://lwn.net/Articles/805317/ Reported-by: Viresh Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree.h | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d95076498488..878f62f218e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9d5986abfc67..ce90c68c184b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "rcu_segcblist.h" -- cgit From 84ad7a7ab69f112c0c4b878c9be91b950a1fb1f8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Jan 2020 17:15:06 +0100 Subject: bpf: Allow BTF ctx access for string pointers When accessing the context we allow access to arguments with scalar type and pointer to struct. But we deny access for pointer to scalar type, which is the case for many functions. Alexei suggested to take conservative approach and allow currently only string pointer access, which is the case for most functions now: Adding check if the pointer is to string type and allow access to it. Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200123161508.915203-2-jolsa@kernel.org --- kernel/bpf/btf.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 32963b6d5a9c..b7c1660fb594 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3669,6 +3669,19 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) } } +static bool is_string_ptr(struct btf *btf, const struct btf_type *t) +{ + /* t comes in already as a pointer */ + t = btf_type_by_id(btf, t->type); + + /* allow const */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) + t = btf_type_by_id(btf, t->type); + + /* char, signed char, unsigned char */ + return btf_type_is_int(t) && t->size == 1; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -3735,6 +3748,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, */ return true; + if (is_string_ptr(btf, t)) + return true; + /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; -- cgit From e9b4e606c2289d6610113253922bb8c9ac7f68b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Jan 2020 17:15:07 +0100 Subject: bpf: Allow to resolve bpf trampoline and dispatcher in unwind When unwinding the stack we need to identify each address to successfully continue. Adding latch tree to keep trampolines for quick lookup during the unwind. The patch uses first 48 bytes for latch tree node, leaving 4048 bytes from the rest of the page for trampoline or dispatcher generated code. It's still enough not to affect trampoline and dispatcher progs maximum counts. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200123161508.915203-3-jolsa@kernel.org --- kernel/bpf/dispatcher.c | 4 +-- kernel/bpf/trampoline.c | 80 ++++++++++++++++++++++++++++++++++++++++++++----- kernel/extable.c | 7 +++-- 3 files changed, 79 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 204ee61a3904..b3e5b214fed8 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (PAGE_SIZE / 2); + noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,7 +140,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_image_alloc(); if (!d->image) goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index eb64c245052b..6b264a92064b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -4,6 +4,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -16,11 +17,12 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; +static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table */ +/* serializes access to trampoline_table and image_tree */ static DEFINE_MUTEX(trampoline_mutex); -void *bpf_jit_alloc_exec_page(void) +static void *bpf_jit_alloc_exec_page(void) { void *image; @@ -36,6 +38,64 @@ void *bpf_jit_alloc_exec_page(void) return image; } +static __always_inline bool image_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + struct bpf_image *ia = container_of(a, struct bpf_image, tnode); + struct bpf_image *ib = container_of(b, struct bpf_image, tnode); + + return ia < ib; +} + +static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) +{ + void *image = container_of(n, struct bpf_image, tnode); + + if (addr < image) + return -1; + if (addr >= image + PAGE_SIZE) + return 1; + + return 0; +} + +static const struct latch_tree_ops image_tree_ops = { + .less = image_tree_less, + .comp = image_tree_comp, +}; + +static void *__bpf_image_alloc(bool lock) +{ + struct bpf_image *image; + + image = bpf_jit_alloc_exec_page(); + if (!image) + return NULL; + + if (lock) + mutex_lock(&trampoline_mutex); + latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); + if (lock) + mutex_unlock(&trampoline_mutex); + return image->data; +} + +void *bpf_image_alloc(void) +{ + return __bpf_image_alloc(true); +} + +bool is_bpf_image_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; + rcu_read_unlock(); + + return ret; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -56,7 +116,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec_page(); + image = __bpf_image_alloc(false); if (!image) { kfree(tr); tr = NULL; @@ -131,14 +191,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 + * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ #define BPF_MAX_TRAMP_PROGS 40 static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; @@ -174,7 +234,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) */ synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, @@ -284,6 +344,8 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { + struct bpf_image *image; + if (!tr) return; mutex_lock(&trampoline_mutex); @@ -294,9 +356,11 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + image = container_of(tr->image, struct bpf_image, data); + latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(tr->image); + bpf_jit_free_exec(image); hlist_del(&tr->hlist); kfree(tr); out: diff --git a/kernel/extable.c b/kernel/extable.c index f6920a11e28a..a0024f27d3a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -131,8 +131,9 @@ int kernel_text_address(unsigned long addr) * triggers a stack trace, or a WARN() that happens during * coming back from idle, or cpu on or offlining. * - * is_module_text_address() as well as the kprobe slots - * and is_bpf_text_address() require RCU to be watching. + * is_module_text_address() as well as the kprobe slots, + * is_bpf_text_address() and is_bpf_image_address require + * RCU to be watching. */ no_rcu = !rcu_is_watching(); @@ -148,6 +149,8 @@ int kernel_text_address(unsigned long addr) goto out; if (is_bpf_text_address(addr)) goto out; + if (is_bpf_image_address(addr)) + goto out; ret = 0; out: if (no_rcu) -- cgit From 90435a7891a2259b0f74c5a1bc5600d0d64cba8f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 25 Jan 2020 12:10:02 +0300 Subject: bpf: map_seq_next should always increase position index If seq_file .next fuction does not change position index, read after some lseek can generate an unexpected output. See also: https://bugzilla.kernel.org/show_bug.cgi?id=206283 v1 -> v2: removed missed increment in end of function Signed-off-by: Vasily Averin Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/eca84fdd-c374-a154-d874-6c7b55fc3bc4@virtuozzo.com --- kernel/bpf/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e11059b99f18..bd2fd8eab470 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,6 +196,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) void *key = map_iter(m)->key; void *prev_key; + (*pos)++; if (map_iter(m)->done) return NULL; @@ -208,8 +209,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) map_iter(m)->done = true; return NULL; } - - ++(*pos); return key; } -- cgit From 42a84a8cd0ff0cbff5a4595e1304c4567a30267d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 26 Jan 2020 16:14:00 -0800 Subject: bpf, xdp: Update devmap comments to reflect napi/rcu usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we rely on synchronize_rcu and call_rcu waiting to exit perempt-disable regions (NAPI) lets update the comments to reflect this. Fixes: 0536b85239b84 ("xdp: Simplify devmap cleanup") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: Song Liu Link: https://lore.kernel.org/bpf/1580084042-11598-2-git-send-email-john.fastabend@gmail.com --- kernel/bpf/devmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f3a44f6ca86e..373c6a30d8a5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -190,10 +190,12 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete. The rcu critical section only guarantees - * no further reads against netdev_map. It does __not__ ensure pending - * flush operations (if any) are complete. + * disconnected from events. The following synchronize_rcu() guarantees + * both rcu read critical sections complete and waits for + * preempt-disable regions (NAPI being the relevant context here) so we + * are certain there will be no further reads against the netdev_map and + * all flush operations are complete. Flush operations can only be done + * from NAPI context for this reason. */ spin_lock(&dev_map_lock); @@ -503,12 +505,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; /* Use call_rcu() here to ensure any rcu critical sections have - * completed, but this does not guarantee a flush has happened - * yet. Because driver side rcu_read_lock/unlock only protects the - * running XDP program. However, for pending flush operations the - * dev and ctx are stored in another per cpu map. And additionally, - * the driver tear down ensures all soft irqs are complete before - * removing the net device in the case of dev_put equals zero. + * completed as well as any flush operations because call_rcu + * will wait for preempt-disable region to complete, NAPI in this + * context. And additionally, the driver tear down ensures all + * soft irqs are complete before removing the net device in the + * case of dev_put equals zero. */ old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) -- cgit From b23bfa5633b19bf1db87b36a76b2225c734f794c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 26 Jan 2020 16:14:02 -0800 Subject: bpf, xdp: Remove no longer required rcu_read_{un}lock() Now that we depend on rcu_call() and synchronize_rcu() to also wait for preempt_disabled region to complete the rcu read critical section in __dev_map_flush() is no longer required. Except in a few special cases in drivers that need it for other reasons. These originally ensured the map reference was safe while a map was also being free'd. And additionally that bpf program updates via ndo_bpf did not happen while flush updates were in flight. But flush by new rules can only be called from preempt-disabled NAPI context. The synchronize_rcu from the map free path and the rcu_call from the delete path will ensure the reference there is safe. So lets remove the rcu_read_lock and rcu_read_unlock pair to avoid any confusion around how this is being protected. If the rcu_read_lock was required it would mean errors in the above logic and the original patch would also be wrong. Now that we have done above we put the rcu_read_lock in the driver code where it is needed in a driver dependent way. I think this helps readability of the code so we know where and why we are taking read locks. Most drivers will not need rcu_read_locks here and further XDP drivers already have rcu_read_locks in their code paths for reading xdp programs on RX side so this makes it symmetric where we don't have half of rcu critical sections define in driver and the other half in devmap. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/1580084042-11598-4-git-send-email-john.fastabend@gmail.com --- kernel/bpf/devmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 373c6a30d8a5..58bdca5d978a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -366,16 +366,17 @@ error: * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. + * When drivers update the bpf program they may need to ensure any flush ops + * are also complete. Using synchronize_rcu or call_rcu will suffice for this + * because both wait for napi context to exit. */ void __dev_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; - rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) bq_xmit_all(bq, XDP_XMIT_FLUSH); - rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or -- cgit From 20279420ae3a8ef4c5d9fedc360a2c37a1dbdf1b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 24 Jan 2020 10:07:42 -0500 Subject: tracing/kprobes: Have uname use __get_str() in print_fmt Thomas Richter reported: > Test case 66 'Use vfs_getname probe to get syscall args filenames' > is broken on s390, but works on x86. The test case fails with: > > [root@m35lp76 perf]# perf test -F 66 > 66: Use vfs_getname probe to get syscall args filenames > :Recording open file: > [ perf record: Woken up 1 times to write data ] > [ perf record: Captured and wrote 0.004 MB /tmp/__perf_test.perf.data.TCdYj\ > (20 samples) ] > Looking at perf.data file for vfs_getname records for the file we touched: > FAILED! > [root@m35lp76 perf]# The root cause was the print_fmt of the kprobe event that referenced the "ustring" > Setting up the kprobe event using perf command: > > # ./perf probe "vfs_getname=getname_flags:72 pathname=filename:ustring" > > generates this format file: > [root@m35lp76 perf]# cat /sys/kernel/debug/tracing/events/probe/\ > vfs_getname/format > name: vfs_getname > ID: 1172 > format: > field:unsigned short common_type; offset:0; size:2; signed:0; > field:unsigned char common_flags; offset:2; size:1; signed:0; > field:unsigned char common_preempt_count; offset:3; size:1; signed:0; > field:int common_pid; offset:4; size:4; signed:1; > > field:unsigned long __probe_ip; offset:8; size:8; signed:0; > field:__data_loc char[] pathname; offset:16; size:4; signed:1; > > print fmt: "(%lx) pathname=\"%s\"", REC->__probe_ip, REC->pathname Instead of using "__get_str(pathname)" it referenced it directly. Link: http://lkml.kernel.org/r/20200124100742.4050c15e@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 88903c464321 ("tracing/probe: Add ustring type for user-space string") Acked-by: Masami Hiramatsu Reported-by: Thomas Richter Tested-by: Thomas Richter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9ae87be422f2..ab8b6436d53f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -876,7 +876,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; if (parg->count) { - if (strcmp(parg->type->name, "string") == 0) + if ((strcmp(parg->type->name, "string") == 0) || + (strcmp(parg->type->name, "ustring") == 0)) fmt = ", __get_str(%s[%d])"; else fmt = ", REC->%s[%d]"; @@ -884,7 +885,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, pos += snprintf(buf + pos, LEN_OR_ZERO, fmt, parg->name, j); } else { - if (strcmp(parg->type->name, "string") == 0) + if ((strcmp(parg->type->name, "string") == 0) || + (strcmp(parg->type->name, "ustring") == 0)) fmt = ", __get_str(%s)"; else fmt = ", REC->%s"; -- cgit