From def98c84b6cdf2eeea19ec5736e90e316df5206b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Sep 2019 18:43:40 -0700 Subject: workqueue: Fix spurious sanity check failures in destroy_workqueue()
Before actually destroying a workqueue, destroy_workqueue() checks whether it's idle. If it isn't, it prints out a bunch of warning messages and leaves the workqueue dangling. It unfortunately has a couple of issues. * Mayday list queueing increments pwq's refcnts, which get detected as busy and fail the sanity checks. However, because mayday list queueing is asynchronous, this condition can happen without any actual work items left in the workqueue. * Sanity check failure leaves the sysfs interface behind too which can lead to init failure of newer instances of the workqueue. This patch fixes the above two by * If a workqueue has a rescuer, disable and kill the rescuer before sanity checks. Disabling and killing is guaranteed to flush the existing mayday list. * Remove sysfs interface before sanity checks.
Signed-off-by: Tejun Heo Reported-by: Marcin Pawlowski Reported-by: "Williams, Gerald S" Cc: stable@vger.kernel.org --- kernel/workqueue.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc2e09a8ea61..93e20f5330fc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4325,9 +4325,28 @@ void destroy_workqueue(struct workqueue_struct *wq) struct pool_workqueue *pwq; int node; + /* + * Remove it from sysfs first so that sanity check failure doesn't + * lead to sysfs name conflicts. + */ + workqueue_sysfs_unregister(wq); + /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ + if (wq->rescuer) { + struct worker *rescuer = wq->rescuer; + + /* this prevents new queueing */ + spin_lock_irq(&wq_mayday_lock); + wq->rescuer = NULL; + spin_unlock_irq(&wq_mayday_lock); + + /* rescuer will empty maydays list before exiting */ + kthread_stop(rescuer->task); + } + /* sanity checks */ mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { @@ -4359,11 +4378,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - workqueue_sysfs_unregister(wq); - - if (wq->rescuer) - kthread_stop(wq->rescuer->task); - if (!(wq->flags & WQ_UNBOUND)) { wq_unregister_lockdep(wq); /* -- cgit
From 8efe1223d73c218ce7e8b2e0e9aadb974b582d7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Sep 2019 13:39:57 -0700 Subject: workqueue: Fix missing kfree(rescuer) in destroy_workqueue() Signed-off-by: Tejun Heo Reported-by: Qian Cai Fixes: def98c84b6cd ("workqueue: Fix spurious sanity check failures in destroy_workqueue()") --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 93e20f5330fc..3f067f1d72e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4345,6 +4345,7 @@ void destroy_workqueue(struct workqueue_struct *wq) /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); + kfree(rescuer); } /* sanity checks */ -- cgit
From 30ae2fc0a75eb5f1a38bbee7355d8e3bc823a6e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Sep 2019 14:09:14 -0700 Subject: workqueue: Minor follow-ups to the rescuer destruction change * Now that wq->rescuer may be cleared while rescuer is still there, switch show_pwq() debug printout to test worker->rescue_wq to identify
rescuers intead of testing wq->rescuer. * Update comment on ->rescuer locking. Signed-off-by: Tejun Heo Suggested-by: Lai Jiangshan --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3f067f1d72e3..7f7aa45dac37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -248,7 +248,7 @@ struct workqueue_struct { struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ - struct worker *rescuer; /* I: rescue worker */ + struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ @@ -4672,7 +4672,7 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), - worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->rescue_wq ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work); -- cgit From 2cb80dbbbaba4f2f86f686c34cb79ea5cbfb0edb Mon Sep 17 00:00:00 2001 From: Iurii Zaikin Date: Mon, 23 Sep 2019 02:02:47 -0700 Subject: kernel/sysctl-test: Add null pointer test for sysctl.c:proc_dointvec() KUnit tests for initialized data behavior of proc_dointvec that is explicitly checked in the code. Includes basic parsing tests including int min/max overflow. Signed-off-by: Iurii Zaikin Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Acked-by: Luis Chamberlain Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- kernel/Makefile | 2 + kernel/sysctl-test.c | 392 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 394 insertions(+) create mode 100644 kernel/sysctl-test.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..f0902a7bd1b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,8 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o + obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c new file mode 100644 index 000000000000..2a63241a8453 --- /dev/null +++ b/kernel/sysctl-test.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of proc sysctl. + */ + +#include +#include + +#define KUNIT_PROC_READ 0 +#define KUNIT_PROC_WRITE 1 + +static int i_zero; +static int i_one_hundred = 100; + +/* + * Test that proc_dointvec will not try to use a NULL .data field even when the + * length is non-zero. + */ +static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test) +{ + struct ctl_table null_data_table = { + .procname = "foo", + /* + * Here we are testing that proc_dointvec behaves correctly when + * we give it a NULL .data field. Normally this would point to a + * piece of memory where the value would be stored. + */ + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + /* + * proc_dointvec expects a buffer in user space, so we allocate one. We + * also need to cast it to __user so sparse doesn't get mad. 
+ */ + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + size_t len; + loff_t pos; + + /* + * We don't care what the starting length is since proc_dointvec should + * not try to read because .data is NULL. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table, + KUNIT_PROC_READ, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + /* + * See above. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table, + KUNIT_PROC_WRITE, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Similar to the previous test, we create a struct ctrl_table that has a .data + * field that proc_dointvec cannot do anything with; however, this time it is + * because we tell proc_dointvec that the size is 0. + */ +static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test) +{ + int data = 0; + struct ctl_table data_maxlen_unset_table = { + .procname = "foo", + .data = &data, + /* + * So .data is no longer NULL, but we tell proc_dointvec its + * length is 0, so it still shouldn't try to use it. + */ + .maxlen = 0, + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + size_t len; + loff_t pos; + + /* + * As before, we don't care what buffer length is because proc_dointvec + * cannot do anything because its internal .data buffer has zero length. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table, + KUNIT_PROC_READ, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + /* + * See previous comment. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table, + KUNIT_PROC_WRITE, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Here we provide a valid struct ctl_table, but we try to read and write from + * it using a buffer of zero length, so it should still fail in a similar way as + * before. + */ +static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + /* + * However, now our read/write buffer has zero length. + */ + size_t len = 0; + loff_t pos; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Test that proc_dointvec refuses to read when the file position is non-zero. + */ +static void sysctl_test_api_dointvec_table_read_but_position_set( + struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + /* + * We don't care about our buffer length because we start off with a + * non-zero file position. 
+ */ + size_t len = 1234; + /* + * proc_dointvec should refuse to read into the buffer since the file + * pos is non-zero. + */ + loff_t pos = 1; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Test that we can read a two digit number in a sufficiently size buffer. + * Nothing fancy. + */ +static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t len = 4; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + /* Store 13 in the data field. */ + *((int *)table.data) = 13; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, (size_t)3, len); + buffer[len] = '\0'; + /* And we read 13 back out. */ + KUNIT_EXPECT_STREQ(test, "13\n", buffer); +} + +/* + * Same as previous test, just now with negative numbers. + */ +static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t len = 5; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + *((int *)table.data) = -16; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, (size_t)4, len); + buffer[len] = '\0'; + KUNIT_EXPECT_STREQ(test, "-16\n", (char *)buffer); +} + +/* + * Test that a simple positive write works. + */ +static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + char input[] = "9"; + size_t len = sizeof(input) - 1; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + + memcpy(buffer, input, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos); + KUNIT_EXPECT_EQ(test, 9, *((int *)table.data)); +} + +/* + * Same as previous test, but now with negative numbers. 
+ */ +static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + char input[] = "-9"; + size_t len = sizeof(input) - 1; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + + memcpy(buffer, input, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos); + KUNIT_EXPECT_EQ(test, -9, *((int *)table.data)); +} + +/* + * Test that writing a value smaller than the minimum possible value is not + * allowed. + */ +static void sysctl_test_api_dointvec_write_single_less_int_min( + struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t max_len = 32, len = max_len; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, max_len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + unsigned long abs_of_less_than_min = (unsigned long)INT_MAX + - (INT_MAX + INT_MIN) + 1; + + /* + * We use this rigmarole to create a string that contains a value one + * less than the minimum accepted value. + */ + KUNIT_ASSERT_LT(test, + (size_t)snprintf(buffer, max_len, "-%lu", + abs_of_less_than_min), + max_len); + + KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, max_len, len); + KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); +} + +/* + * Test that writing the maximum possible value works. 
+ */ +static void sysctl_test_api_dointvec_write_single_greater_int_max( + struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t max_len = 32, len = max_len; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, max_len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + unsigned long greater_than_max = (unsigned long)INT_MAX + 1; + + KUNIT_ASSERT_GT(test, greater_than_max, (unsigned long)INT_MAX); + KUNIT_ASSERT_LT(test, (size_t)snprintf(buffer, max_len, "%lu", + greater_than_max), + max_len); + KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, max_len, len); + KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); +} + +static struct kunit_case sysctl_test_cases[] = { + KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), + KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), + KUNIT_CASE(sysctl_test_api_dointvec_table_len_is_zero), + KUNIT_CASE(sysctl_test_api_dointvec_table_read_but_position_set), + KUNIT_CASE(sysctl_test_dointvec_read_happy_single_positive), + KUNIT_CASE(sysctl_test_dointvec_read_happy_single_negative), + KUNIT_CASE(sysctl_test_dointvec_write_happy_single_positive), + KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), + KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), + KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), + {} +}; + +static struct kunit_suite sysctl_test_suite = { + .name = "sysctl_test", + .test_cases = sysctl_test_cases, +}; + +kunit_test_suite(sysctl_test_suite); -- cgit From 245d73698ed7abdc7e520dfa38048bb80ce89571 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Oct 2019 16:41:58 -0700 Subject: audit: Report suspicious O_CREAT usage This renames the very specific audit_log_link_denied() to audit_log_path_denied() and adds the AUDIT_* type as an argument. This allows for the creation of the new AUDIT_ANOM_CREAT that can be used to report the fifo/regular file creation restrictions that were introduced in commit 30aba6656f61 ("namei: allow restricted O_CREAT of FIFOs and regular files"). Signed-off-by: Kees Cook Signed-off-by: Paul Moore --- kernel/audit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index da8dc0db5bd3..d75485aa25ff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2155,18 +2155,19 @@ void audit_log_task_info(struct audit_buffer *ab) EXPORT_SYMBOL(audit_log_task_info); /** - * audit_log_link_denied - report a link restriction denial - * @operation: specific link operation + * audit_log_path_denied - report a path restriction denial + * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) + * @operation: specific operation name */ -void audit_log_link_denied(const char *operation) +void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; - /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK); + /* Generate log with subject, operation, outcome. 
*/ + ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); -- cgit From c29eb85386880750130a01aabf288408a6614d65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Sep 2019 11:08:58 -0700 Subject: workqueue: more destroy_workqueue() fixes destroy_workqueue() warnings still, at a lower frequency, trigger spuriously. The problem seems to be in-flight operations which haven't reached put_pwq() yet. * Make sanity check grab all the related locks so that it's synchronized against operations which puts pwq at the end. * Always print out the offending pwq. Signed-off-by: Tejun Heo Cc: "Williams, Gerald S" --- kernel/workqueue.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7f7aa45dac37..4a3c30177b94 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -355,6 +355,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); +static void show_pwq(struct pool_workqueue *pwq); #define CREATE_TRACE_POINTS #include @@ -4314,6 +4315,22 @@ err_destroy: } EXPORT_SYMBOL_GPL(alloc_workqueue); +static bool pwq_busy(struct pool_workqueue *pwq) +{ + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + if (pwq->nr_in_flight[i]) + return true; + + if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) + return true; + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + return true; + + return false; +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -4348,28 +4365,28 @@ void destroy_workqueue(struct workqueue_struct *wq) kfree(rescuer); } - /* sanity checks */ + /* + * Sanity checks - grab all the locks so that we wait for all + * in-flight operations which may do put_pwq(). + */ + mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - int i; - - for (i = 0; i < WORK_NR_COLORS; i++) { - if (WARN_ON(pwq->nr_in_flight[i])) { - mutex_unlock(&wq->mutex); - show_workqueue_state(); - return; - } - } - - if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || - WARN_ON(pwq->nr_active) || - WARN_ON(!list_empty(&pwq->delayed_works))) { + spin_lock_irq(&pwq->pool->lock); + if (WARN_ON(pwq_busy(pwq))) { + pr_warning("%s: %s has the following busy pwq (refcnt=%d)\n", + __func__, wq->name, pwq->refcnt); + show_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); + mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } + spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); + mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after -- cgit From e66b39af00f426b3356b96433d620cb3367ba1ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 06:59:15 -0700 Subject: workqueue: Fix pwq ref leak in rescuer_thread() 008847f66c3 ("workqueue: allow rescuer thread to do more work.") made the rescuer worker requeue the pwq immediately if there may be more work items which need rescuing instead of waiting for the next mayday timer expiration. Unfortunately, it doesn't check whether the pwq is already on the mayday list and unconditionally gets the ref and moves it onto the list. This doesn't corrupt the list but creates an additional reference to the pwq. It got queued twice but will only be removed once. 
This leak later can trigger pwq refcnt warning on workqueue destruction and prevent freeing of the workqueue. Signed-off-by: Tejun Heo Cc: "Williams, Gerald S" Cc: NeilBrown Cc: stable@vger.kernel.org # v3.19+ --- kernel/workqueue.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4a3c30177b94..4dc8270326d7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2533,8 +2533,14 @@ repeat: */ if (need_to_create_worker(pool)) { spin_lock(&wq_mayday_lock); - get_pwq(pwq); - list_move_tail(&pwq->mayday_node, &wq->maydays); + /* + * Queue iff we aren't racing destruction + * and somebody else hasn't queued it already. + */ + if (wq->rescuer && list_empty(&pwq->mayday_node)) { + get_pwq(pwq); + list_add_tail(&pwq->mayday_node, &wq->maydays); + } spin_unlock(&wq_mayday_lock); } } @@ -4374,8 +4380,8 @@ void destroy_workqueue(struct workqueue_struct *wq) for_each_pwq(pwq, wq) { spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { - pr_warning("%s: %s has the following busy pwq (refcnt=%d)\n", - __func__, wq->name, pwq->refcnt); + pr_warning("%s: %s has the following busy pwq\n", + __func__, wq->name); show_pwq(pwq); spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); @@ -4670,7 +4676,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + pr_cont(" active=%d/%d refcnt=%d%s\n", + pwq->nr_active, pwq->max_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { -- cgit From 01b4c39901e087ceebae2733857248de81476bd8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Jul 2019 15:22:59 +0200 Subject: nohz: Add TICK_DEP_BIT_RCU If a nohz_full CPU is looping in the kernel, the scheduling-clock tick might nevertheless remain disabled. In !PREEMPT kernels, this can prevent RCU's attempts to enlist the aid of that CPU's executions of cond_resched(), which can in turn result in an arbitrarily delayed grace period and thus an OOM. RCU therefore needs a way to enable a holdout nohz_full CPU's scheduler-clock interrupt. This commit therefore provides a new TICK_DEP_BIT_RCU value which RCU can pass to tick_dep_set_cpu() and friends to force on the scheduler-clock interrupt for a specified CPU or task. In some cases, rcutorture needs to turn on the scheduler-clock tick, so this commit also exports the relevant symbols to GPL-licensed modules. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 955851748dc3..d1b0a84b6112 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -198,6 +198,11 @@ static bool check_tick_dependency(atomic_t *dep) return true; } + if (val & TICK_DEP_MASK_RCU) { + trace_tick_stop(0, TICK_DEP_MASK_RCU); + return true; + } + return false; } @@ -324,6 +329,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) preempt_enable(); } } +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { @@ -331,6 +337,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) atomic_andnot(BIT(bit), &ts->tick_dep_mask); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. 
Posix CPU timers need this in order to elapse -- cgit From ae9e557b5be2e285f48ee945d9c8faf75d4f6a66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Aug 2019 16:29:42 -0700 Subject: time: Export tick start/stop functions for rcutorture It turns out that rcutorture needs to ensure that the scheduling-clock interrupt is enabled in CONFIG_NO_HZ_FULL kernels before starting on CPU-bound in-kernel processing. This commit therefore exports tick_nohz_dep_set_task(), tick_nohz_dep_clear_task(), and tick_nohz_full_setup() to GPL kernel modules. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d1b0a84b6112..1ffdb4ba1ded 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -172,6 +172,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; bool tick_nohz_full_running; +EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) @@ -351,11 +352,13 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) */ tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } +EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse @@ -404,6 +407,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } +EXPORT_SYMBOL_GPL(tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { -- cgit From 6a949b7af82db7eb1e52caaed122eab1cf63acee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Jul 2019 11:50:56 -0700 Subject: rcu: Force on tick when invoking lots of callbacks Callback invocation can run for a significant time period, and within CONFIG_NO_HZ_FULL=y kernels, this period will be devoid of scheduler-clock interrupts. In-kernel execution without such interrupts can cause all manner of malfunction, with RCU CPU stall warnings being but one result. This commit therefore forces scheduling-clock interrupts on whenever more than a few RCU callbacks are invoked. Because offloaded callback invocation can be preempted, this forcing is withdrawn on each context switch. This in turn requires that the loop invoking RCU callbacks reiterate the forcing periodically. [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] [ paulmck: Remove NO_HZ_FULL check per Frederic Weisbecker feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81105141b6a8..238f93b4b0a4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2151,6 +2151,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ + tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { debug_rcu_head_unqueue(rhp); @@ -2217,6 +2218,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Re-invoke RCU core processing if there are callbacks remaining. 
*/ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); } /* -- cgit From d38e6dc6ed0dfef8d323354031a1ee1a7cfdedc1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Jul 2019 12:00:48 -0700 Subject: rcutorture: Force on tick for readers and callback flooders Readers and callback flooders in the rcutorture stress-test suite run for extended time periods by design. They do take pains to relinquish the CPU from time to time, but in some cases this relies on the scheduler being active, which in turn relies on the scheduler-clock interrupt firing from time to time. This commit therefore forces scheduling-clock interrupts within these loops. While in the area, this commit also prevents rcu_torture_reader()'s occasional timed sleeps from delaying shutdown. [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3c9feca1eab1..ab61f5c1353b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "rcu.h" @@ -1363,15 +1364,15 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - + tick_dep_set_task(current, TICK_DEP_BIT_RCU); do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - if (!rcu_torture_one_read(&rand)) + if (!rcu_torture_one_read(&rand) && !torture_must_stop()) schedule_timeout_interruptible(HZ); - if (time_after(jiffies, lastsleep)) { + if (time_after(jiffies, lastsleep) && !torture_must_stop()) { schedule_timeout_interruptible(1); lastsleep = jiffies + 10; } @@ -1383,6 +1384,7 @@ rcu_torture_reader(void *arg) del_timer_sync(&t); destroy_timer_on_stack(&t); } + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); return 0; } @@ -1729,10 +1731,10 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); - } else { - // No userspace emulation: CB invocation throttles call_rcu() - cond_resched(); + return; } + // No userspace emulation: CB invocation throttles call_rcu() + cond_resched(); } /* @@ -1865,6 +1867,7 @@ static void rcu_torture_fwd_prog_cr(void) cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); rcu_launder_gp_seq_start = gps; + tick_dep_set_task(current, TICK_DEP_BIT_RCU); while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -1911,6 +1914,7 @@ static void rcu_torture_fwd_prog_cr(void) rcu_torture_fwd_cb_hist(); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); WRITE_ONCE(rcu_fwd_cb_nodelay, false); } -- cgit From 366237e7b0833faa2d8da7a8d7d7da8c3ca802e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jul 2019 08:01:01 -0700 Subject: stop_machine: Provide RCU quiescent state in multi_cpu_stop() When multi_cpu_stop() loops waiting for other tasks, it can trigger an RCU CPU stall warning. This can be misleading because what is instead needed is information on whatever task is blocking multi_cpu_stop(). 
This commit therefore inserts an RCU quiescent state into the multi_cpu_stop() function's waitloop. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 238f93b4b0a4..a5c296d202ae 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -364,7 +364,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -static void __maybe_unused rcu_momentary_dyntick_idle(void) +void rcu_momentary_dyntick_idle(void) { int special; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c7031a22aa7b..34c4f117d8c7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -233,6 +233,7 @@ static int multi_cpu_stop(void *data) */ touch_nmi_watchdog(); } + rcu_momentary_dyntick_idle(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); -- cgit From 96926686deab853bcacf887501f4ed958e38666b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Aug 2019 15:12:47 -0700 Subject: rcu: Make CPU-hotplug removal operations enable tick CPU-hotplug removal operations run the multi_cpu_stop() function, which relies on the scheduler to gain control from whatever is running on the various online CPUs, including any nohz_full CPUs running long loops in kernel-mode code. Lack of the scheduler-clock interrupt on such CPUs can delay multi_cpu_stop() for several minutes and can also result in RCU CPU stall warnings. This commit therefore causes CPU-hotplug removal operations to enable the scheduler-clock interrupt on all online CPUs. [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] [ paulmck: Apply simplifications suggested by Frederic Weisbecker. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a5c296d202ae..7c67ea561b36 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2101,6 +2101,9 @@ int rcutree_dead_cpu(unsigned int cpu) rcu_boost_kthread_setaffinity(rnp, -1); /* Do any needed no-CB deferred wakeups from this CPU. */ do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); + + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); return 0; } @@ -3085,6 +3088,9 @@ int rcutree_online_cpu(unsigned int cpu) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); return 0; } @@ -3105,6 +3111,9 @@ int rcutree_offline_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); return 0; } -- cgit From 79ba7ff5a9925f5c170f51ed7a96d1475eb6c27f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Aug 2019 13:17:35 -0700 Subject: rcutorture: Emulate dyntick aspect of userspace nohz_full sojourn During an actual call_rcu() flood, there would be frequent trips to userspace (in-kernel call_rcu() floods must be otherwise housebroken). Userspace execution on nohz_full CPUs implies an RCU dyntick idle/not-idle transition pair, so this commit adds emulation of that pair. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 11 +++++++++++ kernel/rcu/tree.c | 1 + 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ab61f5c1353b..49ad88765ed2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1761,6 +1761,11 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) kfree(rfcp); freed++; rcu_torture_fwd_prog_cond_resched(freed); + if (tick_nohz_full_enabled()) { + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } } return freed; } @@ -1835,6 +1840,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) static void rcu_torture_fwd_prog_cr(void) { unsigned long cver; + unsigned long flags; unsigned long gps; int i; long n_launders; @@ -1894,6 +1900,11 @@ static void rcu_torture_fwd_prog_cr(void) } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); + if (tick_nohz_full_enabled()) { + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c67ea561b36..66354ef776aa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -375,6 +375,7 @@ void rcu_momentary_dyntick_idle(void) WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); rcu_preempt_deferred_qs(current); } +EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle -- cgit From ac5f636130c2014eb535f30951460b75db6cbe04 Mon Sep 17 00:00:00 2001 From: Ethan Hansen <1ethanhansen@gmail.com> Date: Thu, 1 Aug 2019 14:00:40 -0700 Subject: rcu: Remove unused function rcutorture_record_progress() The function rcutorture_record_progress() is declared in rcu.h, but is never used. This commit therefore removes rcutorture_record_progress() to clean code. Signed-off-by: Ethan Hansen <1ethanhansen@gmail.com> Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 8fd4f82c9b3d..aeec70fda82c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -455,7 +455,6 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); -void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, @@ -468,7 +467,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *gp_seq = 0; } -static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, -- cgit From c5d3c8ca22d487ad9a18da9a7722a76e9bdbf4fd Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Fri, 2 Aug 2019 09:46:56 +0800 Subject: locktorture: Replace strncmp() with str_has_prefix() The strncmp() function is error-prone because it is easy to get the length wrong, especially if the string is subject to change, especially given the need to account for the terminating nul byte. This commit therefore substitutes the newly introduced str_has_prefix(), which does not require a separately specified length. Signed-off-by: Chuhong Yuan Signed-off-by: Paul E. 
McKenney --- kernel/locking/locktorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index c513031cd7e3..8dd900247205 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -889,16 +889,16 @@ static int __init lock_torture_init(void) cxt.nrealwriters_stress = 2 * num_online_cpus(); #ifdef CONFIG_DEBUG_MUTEXES - if (strncmp(torture_type, "mutex", 5) == 0) + if (str_has_prefix(torture_type, "mutex")) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_RT_MUTEXES - if (strncmp(torture_type, "rtmutex", 7) == 0) + if (str_has_prefix(torture_type, "rtmutex")) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_SPINLOCK - if ((strncmp(torture_type, "spin", 4) == 0) || - (strncmp(torture_type, "rw_lock", 7) == 0)) + if ((str_has_prefix(torture_type, "spin")) || + (str_has_prefix(torture_type, "rw_lock"))) cxt.debug_lock = true; #endif -- cgit From b3ffb206ddd7f07d83bafd10e1b403df57055af4 Mon Sep 17 00:00:00 2001 From: Ethan Hansen <1ethanhansen@gmail.com> Date: Wed, 7 Aug 2019 17:27:32 -0700 Subject: rcu: Remove unused variable rcu_perf_writer_state The variable rcu_perf_writer_state is declared and initialized, but is never actually referenced. Remove it to clean code. Signed-off-by: Ethan Hansen <1ethanhansen@gmail.com> [ paulmck: Also removed unused macros assigned to that variable. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5a879d073c1c..5f884d560384 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -109,15 +109,6 @@ static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); -static int rcu_perf_writer_state; -#define RTWS_INIT 0 -#define RTWS_ASYNC 1 -#define RTWS_BARRIER 2 -#define RTWS_EXP_SYNC 3 -#define RTWS_SYNC 4 -#define RTWS_IDLE 5 -#define RTWS_STOPPING 6 - #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -404,25 +395,20 @@ retry: if (!rhp) rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - rcu_perf_writer_state = RTWS_ASYNC; atomic_inc(this_cpu_ptr(&n_async_inflight)); cur_ops->async(rhp, rcu_perf_async_cb); rhp = NULL; } else if (!kthread_should_stop()) { - rcu_perf_writer_state = RTWS_BARRIER; cur_ops->gp_barrier(); goto retry; } else { kfree(rhp); /* Because we are stopping. */ } } else if (gp_exp) { - rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { - rcu_perf_writer_state = RTWS_SYNC; cur_ops->sync(); } - rcu_perf_writer_state = RTWS_IDLE; t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; i_max = i; @@ -463,10 +449,8 @@ retry: rcu_perf_wait_shutdown(); } while (!torture_must_stop()); if (gp_async) { - rcu_perf_writer_state = RTWS_BARRIER; cur_ops->gp_barrier(); } - rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); return 0; -- cgit From 8b5ddf8b99dc42241d1d413c6685bce18275c40e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Aug 2019 12:02:40 -0700 Subject: rcutorture: Separate warnings for each failure type Currently, each of six different types of failure triggers a single WARN_ON_ONCE(), and it is then necessary to stare at the rcu_torture_stats(), Reader Pipe, and Reader Batch lines looking for inappropriately non-zero values. 
This can be annoying and error-prone, so this commit provides a separate WARN_ON_ONCE() for each of the six error conditions and adds short comments to each to ease error identification. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3c9feca1eab1..5ac467293803 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1442,15 +1442,18 @@ rcu_torture_stats_print(void) n_rcu_torture_barrier_error); pr_alert("%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || + if (atomic_read(&n_rcu_torture_mberror) || + n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || + n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); + WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() + WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread + WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio + WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed + WARN_ON_ONCE(i > 1); // Too-short grace period } pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) -- cgit From fbbd5e358cecb5fa490550ace66463517a7577e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Aug 2019 11:43:53 -0700 Subject: rcutorture: Make in-kernel-loop testing more brutal The rcu_torture_fwd_prog_nr() tests the ability of RCU to tolerate in-kernel busy loops. It invokes rcu_torture_fwd_prog_cond_resched() within its delay loop, which, in PREEMPT && NO_HZ_FULL kernels results in the occasional direct call to schedule(). Now, this direct call to schedule() is appropriate for call_rcu() flood testing, in which either the kernel should restrain itself or userspace transitions will supply the needed restraint. But in pure in-kernel loops, the occasional cond_resched() should do the job. This commit therefore makes rcu_torture_fwd_prog_nr() use cond_resched() instead of rcu_torture_fwd_prog_cond_resched() in order to increase the brutality of this aspect of rcutorture testing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5ac467293803..df1caa93ee63 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1806,7 +1806,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - rcu_torture_fwd_prog_cond_resched(1); + cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && -- cgit From 67d64918a163fd62cf3b668d69133b723c48ed96 Mon Sep 17 00:00:00 2001 From: "Wolfgang M. Reimer" Date: Mon, 16 Sep 2019 16:54:04 +0200 Subject: locking: locktorture: Do not include rwlock.h directly Including rwlock.h directly will cause kernel builds to fail if CONFIG_PREEMPT_RT is defined. The correct header file (rwlock_rt.h OR rwlock.h) will be included by spinlock.h which is included by locktorture.c anyway. Remove the include of linux/rwlock.h. Signed-off-by: Wolfgang M. 
Reimer Signed-off-by: Sebastian Andrzej Siewior Acked-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8dd900247205..99475a66c94f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include -- cgit From 61e867fde21ea94f2166899f24f16cad85cc7b24 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 29 Sep 2019 16:06:58 +0800 Subject: cgroup: short-circuit current_cgns_cgroup_from_root() on the default hierarchy Like commit 13d82fb77abb ("cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy"), short-circuit current_cgns_cgroup_from_root() on the default hierarchy. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 080561bb8a4b..f6cba23290a1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1374,6 +1374,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; -- cgit From e7c7b1d85dc1646c874096dac3cf01537c1fd6f1 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 4 Oct 2019 12:57:39 +0200 Subject: cgroup: Update comments about task exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We no longer take cgroup_mutex in cgroup_exit and the exiting tasks are not moved to init_css_set, reflect that in several comments to prevent confusion. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f6cba23290a1..01fc24aeac71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1432,9 +1431,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * No need to lock the task - since we hold css_set_lock the + * task can't change groups. 
*/ return cset_cgroup_from_root(task_css_set(task), root); } @@ -6030,7 +6028,7 @@ void cgroup_post_fork(struct task_struct *child) struct css_set *cset; spin_lock_irq(&css_set_lock); - cset = task_css_set(current); + cset = task_css_set(current); /* current is @child's parent */ if (list_empty(&child->cg_list)) { get_css_set(cset); cset->nr_tasks++; @@ -6073,20 +6071,8 @@ void cgroup_post_fork(struct task_struct *child) * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * - * Description: Detach cgroup from @tsk and release it. - * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. + * Description: Detach cgroup from @tsk. * - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We - * call cgroup_exit() while the task is still competent to handle - * notify_on_release(), then leave the task attached to the root cgroup in - * each hierarchy for the remainder of its exit. No need to bother with - * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk) { @@ -6096,7 +6082,8 @@ void cgroup_exit(struct task_struct *tsk) /* * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. + * with us (thanks to cgroup_threadgroup_rwsem), we can check css_set + * and cg_list without synchronization. */ cset = task_css_set(tsk); @@ -6112,6 +6099,8 @@ void cgroup_exit(struct task_struct *tsk) spin_unlock_irq(&css_set_lock); } else { + /* Take reference to avoid freeing init_css_set in cgroup_free, + * see cgroup_fork(). */ get_css_set(cset); } -- cgit From 9a3284fad42f66bb43629c6716709ff791aaa457 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 4 Oct 2019 12:57:40 +0200 Subject: cgroup: Optimize single thread migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are reports of users who use thread migrations between cgroups and they report performance drop after d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"). The effect is pronounced on machines with more CPUs. The migration is affected by forking noise happening in the background, after the mentioned commit a migrating thread must wait for all (forking) processes on the system, not only of its threadgroup. There are several places that need to synchronize with migration: a) do_exit, b) de_thread, c) copy_process, d) cgroup_update_dfl_csses, e) parallel migration (cgroup_{proc,thread}s_write). In the case of self-migrating thread, we relax the synchronization on cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are excluded with cgroup_mutex, c) does not matter in case of single thread migration and the executing thread cannot exec(2) or exit(2) while it is writing into cgroup.threads. In case of do_exit because of signal delivery, we either exit before the migration or finish the migration (of not yet PF_EXITING thread) and die afterwards. This patch handles only the case of self-migration by writing "0" into cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem with numeric PIDs. 
This change improves migration dependent workload performance similar to per-signal_struct state. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 5 +++-- kernel/cgroup/cgroup-v1.c | 5 +++-- kernel/cgroup/cgroup.c | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 36 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 809e34a3c017..90d1710fef6c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7f83f4121d8d..09f3a413f6f8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; + bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 01fc24aeac71..8b1c4fd47a7a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2824,7 +2824,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; @@ -2833,7 +2834,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write + * callers by cgroup_mutex. + * Therefore, we can skip the global lock. 
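For illustration, a minimal userspace sketch of the self-migration case this fast path targets is shown below; the function name and the cgroupfs path are assumptions for the example and not part of the patch. Writing "0" into cgroup.threads means "move the calling thread", which with this change no longer has to wait on cgroup_threadgroup_rwsem:

#include <fcntl.h>
#include <unistd.h>

/* Move the calling thread into the (hypothetical) cgroup owning @path. */
static int move_self_to_cgroup(const char *path)
{
	/* e.g. path = "/sys/fs/cgroup/workers/cgroup.threads" */
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	/* "0" is interpreted by the kernel as the writing thread itself. */
	n = write(fd, "0", 1);
	close(fd);
	return n == 1 ? 0 : -1;
}

Writes of an explicit TID, and any write to cgroup.procs, still take cgroup_threadgroup_rwsem as before.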
+ */ + lockdep_assert_held(&cgroup_mutex); + if (pid || threadgroup) { + percpu_down_write(&cgroup_threadgroup_rwsem); + *locked = true; + } else { + *locked = false; + } rcu_read_lock(); if (pid) { @@ -2864,13 +2879,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + if (*locked) { + percpu_up_write(&cgroup_threadgroup_rwsem); + *locked = false; + } out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; @@ -2879,7 +2897,8 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (locked) + percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -4754,12 +4773,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true); + task = cgroup_procs_write_start(buf, true, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4777,7 +4797,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -4795,6 +4815,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; buf = strstrip(buf); @@ -4802,7 +4823,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, false); + task = cgroup_procs_write_start(buf, false, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4826,7 +4847,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); -- cgit From f83eeb1a01689b2691f6f56629ac9f66de8d41c2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 3 Oct 2019 18:17:44 +0200 Subject: sched/cputime: Rename vtime_account_system() to vtime_account_kernel() vtime_account_system() decides if we need to account the time to the system (__vtime_account_system()) or to the guest (vtime_account_guest()). So this function is a misnomer as we are on a higher level than "system". All we know when we call that function is that we are accounting kernel cputime. Whether it belongs to guest or system time is a lower level detail. Rename this function to vtime_account_kernel(). This will clarify things and avoid too many underscored vtime_account_system() versions. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191003161745.28464-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 46ed4e1383e2..b45932e27857 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -412,7 +412,7 @@ void vtime_common_task_switch(struct task_struct *prev) if (is_idle_task(prev)) vtime_account_idle(prev); else - vtime_account_system(prev); + vtime_account_kernel(prev); vtime_flush(prev); arch_vtime_task_switch(prev); @@ -425,7 +425,7 @@ void vtime_common_task_switch(struct task_struct *prev) /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that + * vtime_account_kernel() and vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -436,7 +436,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) if (!in_interrupt() && is_idle_task(tsk)) vtime_account_idle(tsk); else - vtime_account_system(tsk); + vtime_account_kernel(tsk); } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -711,8 +711,8 @@ static u64 get_vtime_delta(struct vtime *vtime) return delta - other; } -static void __vtime_account_system(struct task_struct *tsk, - struct vtime *vtime) +static void vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { vtime->stime += get_vtime_delta(vtime); if (vtime->stime >= TICK_NSEC) { @@ -731,7 +731,7 @@ static void vtime_account_guest(struct task_struct *tsk, } } -void vtime_account_system(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { struct vtime *vtime = &tsk->vtime; @@ -743,7 +743,7 @@ void vtime_account_system(struct task_struct *tsk) if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -752,7 +752,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -782,7 +782,7 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } -- cgit From 8d495477d62e4397207f22a432fcaa86d9f2bc2d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 3 Oct 2019 18:17:45 +0200 Subject: sched/cputime: Spare a seqcount lock/unlock cycle on context switch On context switch we are locking the vtime seqcount of the scheduling-out task twice: * On vtime_task_switch_common(), when we flush the pending vtime through vtime_account_system() * On arch_vtime_task_switch() to reset the vtime state. This is pointless as these actions can be performed without the need to unlock/lock in the middle. 
The reason these steps are separated is to consolidate a very small amount of common code between CONFIG_VIRT_CPU_ACCOUNTING_GEN and CONFIG_VIRT_CPU_ACCOUNTING_NATIVE. Performance in this fast path is definitely a priority over artificial code factorization so split the task switch code between GEN and NATIVE and mutualize the parts than can run under a single seqcount locked block. As a side effect, vtime_account_idle() becomes included in the seqcount protection. This happens to be a welcome preparation in order to properly support kcpustat under vtime in the future and fetch CPUTIME_IDLE without race. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191003161745.28464-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b45932e27857..cef23c211f41 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -405,9 +405,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ /* * Use precise platform statistics if available: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + # ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_common_task_switch(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) vtime_account_idle(prev); @@ -418,10 +419,7 @@ void vtime_common_task_switch(struct task_struct *prev) arch_vtime_task_switch(prev); } # endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -731,6 +729,16 @@ static void vtime_account_guest(struct task_struct *tsk, } } +static void __vtime_account_kernel(struct task_struct *tsk, + struct vtime *vtime) +{ + /* We might have scheduled out from guest path */ + if (tsk->flags & PF_VCPU) + vtime_account_guest(tsk, vtime); + else + vtime_account_system(tsk, vtime); +} + void vtime_account_kernel(struct task_struct *tsk) { struct vtime *vtime = &tsk->vtime; @@ -739,11 +747,7 @@ void vtime_account_kernel(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - /* We might have scheduled out from guest path */ - if (tsk->flags & PF_VCPU) - vtime_account_guest(tsk, vtime); - else - vtime_account_system(tsk, vtime); + __vtime_account_kernel(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -804,11 +808,15 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(get_vtime_delta(&tsk->vtime)); } -void arch_vtime_task_switch(struct task_struct *prev) +void vtime_task_switch_generic(struct task_struct *prev) { struct vtime *vtime = &prev->vtime; write_seqcount_begin(&vtime->seqcount); + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + __vtime_account_kernel(prev, vtime); vtime->state = VTIME_INACTIVE; write_seqcount_end(&vtime->seqcount); -- cgit From 5facae4f3549b5cf7c0e10ec312a65ffd43b5726 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 19 Sep 2019 12:09:40 -0400 Subject: locking/lockdep: Remove unused @nested argument from lock_release() Since the following commit: b4adfe8e05f1 ("locking/lockdep: Remove unused argument in __lock_release") @nested is no longer used in 
lock_release(), so remove it from all lock_release() calls and friends. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Acked-by: Daniel Vetter Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: alexander.levin@microsoft.com Cc: daniel@iogearbox.net Cc: davem@davemloft.net Cc: dri-devel@lists.freedesktop.org Cc: duyuyang@gmail.com Cc: gregkh@linuxfoundation.org Cc: hannes@cmpxchg.org Cc: intel-gfx@lists.freedesktop.org Cc: jack@suse.com Cc: jlbec@evilplan.or Cc: joonas.lahtinen@linux.intel.com Cc: joseph.qi@linux.alibaba.com Cc: jslaby@suse.com Cc: juri.lelli@redhat.com Cc: maarten.lankhorst@linux.intel.com Cc: mark@fasheh.com Cc: mhocko@kernel.org Cc: mripard@kernel.org Cc: ocfs2-devel@oss.oracle.com Cc: rodrigo.vivi@intel.com Cc: sean@poorly.run Cc: st@kernel.org Cc: tj@kernel.org Cc: tytso@mit.edu Cc: vdavydov.dev@gmail.com Cc: vincent.guittot@linaro.org Cc: viro@zeniv.linux.org.uk Link: https://lkml.kernel.org/r/1568909380-32199-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/bpf/stackmap.c | 2 +- kernel/cpu.c | 2 +- kernel/locking/lockdep.c | 3 +-- kernel/locking/mutex.c | 4 ++-- kernel/locking/rtmutex.c | 6 +++--- kernel/locking/rwsem.c | 10 +++++----- kernel/printk/printk.c | 10 +++++----- kernel/sched/core.c | 2 +- 8 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..dcfe2d37ad15 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * up_read_non_owner(). The rwsem_release() is called * here to release the lock from lockdep's perspective. 
*/ - rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_); + rwsem_release(¤t->mm->mmap_sem.dep_map, _RET_IP_); } } diff --git a/kernel/cpu.c b/kernel/cpu.c index fc28e17940e0..68f85627d909 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -336,7 +336,7 @@ static void lockdep_acquire_cpus_lock(void) static void lockdep_release_cpus_lock(void) { - rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); + rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_); } /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 233459c03b5a..8123518f9045 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4491,8 +4491,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip) +void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 468a9b8422e3..5352ce50a97e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1091,7 +1091,7 @@ err: err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); - mutex_release(&lock->dep_map, 1, ip); + mutex_release(&lock->dep_map, ip); preempt_enable(); return ret; } @@ -1225,7 +1225,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne DEFINE_WAKE_Q(wake_q); unsigned long owner; - mutex_release(&lock->dep_map, 1, ip); + mutex_release(&lock->dep_map, ip); /* * Release the lock before (potentially) taking the spinlock such that diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2874bf556162..851bbb10819d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1517,7 +1517,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); + mutex_release(&lock->dep_map, _RET_IP_); return ret; } @@ -1561,7 +1561,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) RT_MUTEX_MIN_CHAINWALK, rt_mutex_slowlock); if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); + mutex_release(&lock->dep_map, _RET_IP_); return ret; } @@ -1600,7 +1600,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); */ void __sched rt_mutex_unlock(struct rt_mutex *lock) { - mutex_release(&lock->dep_map, 1, _RET_IP_); + mutex_release(&lock->dep_map, _RET_IP_); rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index eef04551eae7..44e68761f432 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1504,7 +1504,7 @@ int __sched down_read_killable(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } @@ -1546,7 +1546,7 @@ int __sched down_write_killable(struct rw_semaphore *sem) if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } @@ -1573,7 +1573,7 @@ EXPORT_SYMBOL(down_write_trylock); */ void up_read(struct rw_semaphore *sem) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); 
__up_read(sem); } EXPORT_SYMBOL(up_read); @@ -1583,7 +1583,7 @@ EXPORT_SYMBOL(up_read); */ void up_write(struct rw_semaphore *sem) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); } EXPORT_SYMBOL(up_write); @@ -1639,7 +1639,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..c8be5a0f5259 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -248,7 +248,7 @@ static void __up_console_sem(unsigned long ip) { unsigned long flags; - mutex_release(&console_lock_dep_map, 1, ip); + mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); @@ -1679,20 +1679,20 @@ static int console_lock_spinning_disable_and_check(void) raw_spin_unlock(&console_owner_lock); if (!waiter) { - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + spin_release(&console_owner_dep_map, _THIS_IP_); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ - mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } @@ -1746,7 +1746,7 @@ static int console_trylock_spinning(void) /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd05a378631a..eb42b71faab9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3105,7 +3105,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = next; -- cgit From f49249d58abdc12067d69f15d509487171148b30 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 8 Oct 2019 11:46:46 +0100 Subject: PM: sleep: include for pm_wq Include the for the definition of pm_wq to avoid the following warning: kernel/power/main.c:890:25: warning: symbol 'pm_wq' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index e8710d179b35..e26de7af520b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "power.h" -- cgit From 0f8b5b6d56b5fa4085c06945ea3e1ee5941ecfeb Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:17 -0700 Subject: kgdb: Remove unused DCPU_SSTEP definition From doing a 'git log --patch kernel/debug', it looks as if DCPU_SSTEP has never been used. 
Presumably it used to be used back when kgdb was out of tree and nobody thought to delete the definition when the usage went away. Delete. Signed-off-by: Douglas Anderson Acked-by: Jason Wessel Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index b4a7c326d546..804b0fe5a0ba 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,7 +33,6 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ -#define DCPU_SSTEP 0x8 /* CPU is single stepping */ struct debuggerinfo_struct { void *debuggerinfo; -- cgit From 54af3e39eed7d77f0923511f3c7f446e7d477635 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:18 -0700 Subject: kdb: Remove unused "argcount" param from kdb_bt1(); make btaprompt bool The kdb_bt1() had a mysterious "argcount" parameter passed in (always the number 5, by the way) and never used. Presumably this is just old cruft. Remove it. While at it, upgrade the btaprompt parameter to a full fledged bool instead of an int. Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7e2379aa0a1e..120fc686c919 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -78,8 +78,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) */ static int -kdb_bt1(struct task_struct *p, unsigned long mask, - int argcount, int btaprompt) +kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { char buffer[2]; if (kdb_getarea(buffer[0], (unsigned long)p) || @@ -106,7 +105,6 @@ int kdb_bt(int argc, const char **argv) { int diag; - int argcount = 5; int btaprompt = 1; int nextarg; unsigned long addr; @@ -125,7 +123,7 @@ kdb_bt(int argc, const char **argv) /* Run the active tasks first */ for_each_online_cpu(cpu) { p = kdb_curr_task(cpu); - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } /* Now the inactive tasks */ @@ -134,7 +132,7 @@ kdb_bt(int argc, const char **argv) return 0; if (task_curr(p)) continue; - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } kdb_while_each_thread(g, p); } else if (strcmp(argv[0], "btp") == 0) { @@ -148,7 +146,7 @@ kdb_bt(int argc, const char **argv) p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) { kdb_set_current_task(p); - return kdb_bt1(p, ~0UL, argcount, 0); + return kdb_bt1(p, ~0UL, false); } kdb_printf("No process with pid == %ld found\n", pid); return 0; @@ -159,7 +157,7 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; kdb_set_current_task((struct task_struct *)addr); - return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + return kdb_bt1((struct task_struct *)addr, ~0UL, false); } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; @@ -211,7 +209,7 @@ kdb_bt(int argc, const char **argv) kdb_show_stack(kdb_current_task, (void *)addr); return 0; } else { - return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + return kdb_bt1(kdb_current_task, ~0UL, false); } } -- 
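For quick reference, the signature change made by the patch above boils down to the following (reconstructed from the diff, not a verbatim quote):

-static int kdb_bt1(struct task_struct *p, unsigned long mask,
-		   int argcount, int btaprompt);
+static int kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt);

so callers now pass a plain true/false for the prompt behaviour and the always-5 argcount is gone.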
cgit From 55a7e23f461fc2c321d7efcdeca1750085e9323f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:19 -0700 Subject: kdb: Fix "btc " crash if the CPU didn't round up I noticed that when I did "btc " and the CPU I passed in hadn't rounded up that I'd crash. I was going to copy the same fix from commit 162bc7f5afd7 ("kdb: Don't back trace on a cpu that didn't round up") into the "not all the CPUs" case, but decided it'd be better to clean things up a little bit. This consolidates the two code paths. It is _slightly_ wasteful in in that the checks for "cpu" being too small or being offline isn't really needed when we're iterating over all online CPUs, but that really shouldn't hurt. Better to have the same code path. While at it, eliminate at least one slightly ugly (and totally needless) recursive use of kdb_parse(). Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 61 ++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 120fc686c919..d9af139f9a31 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -101,6 +101,27 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) return 0; } +static void +kdb_bt_cpu(unsigned long cpu) +{ + struct task_struct *kdb_tsk; + + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("WARNING: no process for cpu %ld\n", cpu); + return; + } + + /* If a CPU failed to round up we could be here */ + kdb_tsk = KDB_TSK(cpu); + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", cpu); + return; + } + + kdb_set_current_task(kdb_tsk); + kdb_bt1(kdb_tsk, ~0UL, false); +} + int kdb_bt(int argc, const char **argv) { @@ -161,7 +182,6 @@ kdb_bt(int argc, const char **argv) } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; - char buf[80]; if (argc > 1) return KDB_ARGCOUNT; if (argc == 1) { @@ -169,35 +189,22 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; } - /* Recursive use of kdb_parse, do not use argv after - * this point */ - argv = NULL; if (cpu != ~0) { - if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { - kdb_printf("no process for cpu %ld\n", cpu); - return 0; - } - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); - kdb_parse(buf); - return 0; - } - kdb_printf("btc: cpu status: "); - kdb_parse("cpu\n"); - for_each_online_cpu(cpu) { - void *kdb_tsk = KDB_TSK(cpu); - - /* If a CPU failed to round up we could be here */ - if (!kdb_tsk) { - kdb_printf("WARNING: no task for cpu %ld\n", - cpu); - continue; + kdb_bt_cpu(cpu); + } else { + /* + * Recursive use of kdb_parse, do not use argv after + * this point. + */ + argv = NULL; + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + kdb_bt_cpu(cpu); + touch_nmi_watchdog(); } - - sprintf(buf, "btt 0x%px\n", kdb_tsk); - kdb_parse(buf); - touch_nmi_watchdog(); + kdb_set_current_task(save_current_task); } - kdb_set_current_task(save_current_task); return 0; } else { if (argc) { -- cgit From 2277b492582d5525244519f60da6f9daea5ef41a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:20 -0700 Subject: kdb: Fix stack crawling on 'running' CPUs that aren't the master In kdb when you do 'btc' (back trace on CPU) it doesn't necessarily give you the right info. 
Specifically on many architectures (including arm64, where I tested) you can't dump the stack of a "running" process that isn't the process running on the current CPU. This can be seen by this: echo SOFTLOCKUP > /sys/kernel/debug/provoke-crash/DIRECT # wait 2 seconds g Here's what I see now on rk3399-gru-kevin. I see the stack crawl for the CPU that handled the sysrq but everything else just shows me stuck in __switch_to() which is bogus: ====== [0]kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0, 1-3(I), 4, 5(I) Stack traceback for pid 0 0xffffff801101a9c0 0 0 1 0 R 0xffffff801101b3b0 *swapper/0 Call trace: dump_backtrace+0x0/0x138 ... kgdb_compiled_brk_fn+0x34/0x44 ... sysrq_handle_dbg+0x34/0x5c Stack traceback for pid 0 0xffffffc0f175a040 0 0 1 1 I 0xffffffc0f175aa30 swapper/1 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65616c0 Stack traceback for pid 0 0xffffffc0f175d040 0 0 1 2 I 0xffffffc0f175da30 swapper/2 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65806c0 Stack traceback for pid 0 0xffffffc0f175b040 0 0 1 3 I 0xffffffc0f175ba30 swapper/3 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f659f6c0 Stack traceback for pid 1474 0xffffffc0dde8b040 1474 727 1 4 R 0xffffffc0dde8ba30 bash Call trace: __switch_to+0x1e4/0x240 __schedule+0x464/0x618 0xffffffc0dde8b040 Stack traceback for pid 0 0xffffffc0f17b0040 0 0 1 5 I 0xffffffc0f17b0a30 swapper/5 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65dd6c0 === The problem is that 'btc' eventually boils down to show_stack(task_struct, NULL); ...and show_stack() doesn't work for "running" CPUs because their registers haven't been stashed. On x86 things might work better (I haven't tested) because kdb has a special case for x86 in kdb_show_stack() where it passes the stack pointer to show_stack(). This wouldn't work on arm64 where the stack crawling function seems needs the "fp" and "pc", not the "sp" which is presumably why arm64's show_stack() function totally ignores the "sp" parameter. NOTE: we _can_ get a good stack dump for all the cpus if we manually switch each one to the kdb master and do a back trace. AKA: cpu 4 bt ...will give the expected trace. That's because now arm64's dump_backtrace will now see that "tsk == current" and go through a different path. In this patch I fix the problems by catching a request to stack crawl a task that's running on a CPU and then I ask that CPU to do the stack crawl. NOTE: this will (presumably) change what stack crawls are printed for x86 machines. Now kdb functions will show up in the stack crawl. Presumably this is OK but if it's not we can go back and add a special case for x86 again. 
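Condensed, the handshake added here looks like the following sketch (the real code is in kdb_dump_stack_on_cpu() and the debug_core.c slave loop in the diff below):

/* master (the CPU running kdb) */
kgdb_info[cpu].exception_state |= DCPU_WANT_BT;
while (kgdb_info[cpu].exception_state & DCPU_WANT_BT)
	cpu_relax();

/* slave CPU, inside its kgdb wait loop */
} else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) {
	dump_stack();
	kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT;
}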
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/debug/debug_core.h | 2 ++ kernel/debug/kdb/kdb_bt.c | 19 +++++++------------ 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index f76d6f77dd5e..70e86b4b4932 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -441,6 +441,37 @@ setundefined: return 0; } +#ifdef CONFIG_KGDB_KDB +void kdb_dump_stack_on_cpu(int cpu) +{ + if (cpu == raw_smp_processor_id()) { + dump_stack(); + return; + } + + if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { + kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n", + cpu); + return; + } + + /* + * In general, architectures don't support dumping the stack of a + * "running" process that's not the current one. From the point of + * view of the Linux, kernel processes that are looping in the kgdb + * slave loop are still "running". There's also no API (that actually + * works across all architectures) that can do a stack crawl based + * on registers passed as a parameter. + * + * Solve this conundrum by asking slave CPUs to do the backtrace + * themselves. + */ + kgdb_info[cpu].exception_state |= DCPU_WANT_BT; + while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) + cpu_relax(); +} +#endif + /* * Return true if there is a valid kgdb I/O module. Also if no * debugger is attached a message can be printed to the console about @@ -580,6 +611,9 @@ cpu_loop: atomic_xchg(&kgdb_active, cpu); break; } + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) { + dump_stack(); + kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT; } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { if (!raw_spin_is_locked(&dbg_slave_lock)) goto return_normal; diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 804b0fe5a0ba..cd22b5f68831 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,6 +33,7 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */ struct debuggerinfo_struct { void *debuggerinfo; @@ -75,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); +extern void kdb_dump_stack_on_cpu(int cpu); #else /* ! 
CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index d9af139f9a31..0e94efe07b72 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -22,20 +22,15 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - kdb_set_current_task(p); - if (addr) { - show_stack((struct task_struct *)p, addr); - } else if (kdb_current_regs) { -#ifdef CONFIG_X86 - show_stack(p, &kdb_current_regs->sp); -#else - show_stack(p, NULL); -#endif - } else { - show_stack(p, NULL); - } + + if (!addr && kdb_task_has_cpu(p)) + kdb_dump_stack_on_cpu(kdb_process_cpu(p)); + else + show_stack(p, addr); + console_loglevel = old_lvl; kdb_trap_printk--; } -- cgit From fb3c5386b382d4097476ce9647260fc89b34afdb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 20 Sep 2019 10:30:05 +0200 Subject: seccomp: add SECCOMP_USER_NOTIF_FLAG_CONTINUE This allows the seccomp notifier to continue a syscall. A positive discussion about this feature was triggered by a post to the ksummit-discuss mailing list (cf. [3]) and took place during KSummit (cf. [1]) and again at the containers/checkpoint-restore micro-conference at Linux Plumbers. Recently we landed seccomp support for SECCOMP_RET_USER_NOTIF (cf. [4]) which enables a process (watchee) to retrieve an fd for its seccomp filter. This fd can then be handed to another (usually more privileged) process (watcher). The watcher will then be able to receive seccomp messages about the syscalls having been performed by the watchee. This feature is heavily used in some userspace workloads. For example, it is currently used to intercept mknod() syscalls in user namespaces aka in containers. The mknod() syscall can be easily filtered based on dev_t. This allows us to only intercept a very specific subset of mknod() syscalls. Furthermore, mknod() is not possible in user namespaces toto coelo and so intercepting and denying syscalls that are not in the whitelist on accident is not a big deal. The watchee won't notice a difference. In contrast to mknod(), a lot of other syscall we intercept (e.g. setxattr()) cannot be easily filtered like mknod() because they have pointer arguments. Additionally, some of them might actually succeed in user namespaces (e.g. setxattr() for all "user.*" xattrs). Since we currently cannot tell seccomp to continue from a user notifier we are stuck with performing all of the syscalls in lieu of the container. This is a huge security liability since it is extremely difficult to correctly assume all of the necessary privileges of the calling task such that the syscall can be successfully emulated without escaping other additional security restrictions (think missing CAP_MKNOD for mknod(), or MS_NODEV on a filesystem etc.). This can be solved by telling seccomp to resume the syscall. One thing that came up in the discussion was the problem that another thread could change the memory after userspace has decided to let the syscall continue which is a well known TOCTOU with seccomp which is present in other ways already. The discussion showed that this feature is already very useful for any syscall without pointer arguments. For any accidentally intercepted non-pointer syscall it is safe to continue. For syscalls with pointer arguments there is a race but for any cautious userspace and the main usec cases the race doesn't matter. 
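As a rough illustration from the watcher's side (a hypothetical userspace sketch built on the existing seccomp user notification ioctls; notify_fd and syscall_is_harmless() are made-up names, error handling omitted):

struct seccomp_notif req = {};
struct seccomp_notif_resp resp = {};

ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &req);
if (syscall_is_harmless(&req.data)) {	/* e.g. a non-pointer syscall */
	resp.id = req.id;
	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
	resp.error = 0;	/* must be zero when continuing */
	resp.val = 0;	/* must be zero when continuing */
	ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, &resp);
}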
The notifier is intended to be used in a scenario where a more privileged watcher supervises the syscalls of lesser privileged watchee to allow it to get around kernel-enforced limitations by performing the syscall for it whenever deemed save by the watcher. Hence, if a user tricks the watcher into allowing a syscall they will either get a deny based on kernel-enforced restrictions later or they will have changed the arguments in such a way that they manage to perform a syscall with arguments that they would've been allowed to do anyway. In general, it is good to point out again, that the notifier fd was not intended to allow userspace to implement a security policy but rather to work around kernel security mechanisms in cases where the watcher knows that a given action is safe to perform. /* References */ [1]: https://linuxplumbersconf.org/event/4/contributions/560 [2]: https://linuxplumbersconf.org/event/4/contributions/477 [3]: https://lore.kernel.org/r/20190719093538.dhyopljyr5ns33qx@brauner.io [4]: commit 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Co-developed-by: Kees Cook Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Andy Lutomirski Cc: Will Drewry CC: Tyler Hicks Link: https://lore.kernel.org/r/20190920083007.11475-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dba52a7db5e8..12d2227e5786 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -75,6 +75,7 @@ struct seccomp_knotif { /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ int error; long val; + u32 flags; /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ struct completion ready; @@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_do_user_notification(int this_syscall, - struct seccomp_filter *match, - const struct seccomp_data *sd) +static int seccomp_do_user_notification(int this_syscall, + struct seccomp_filter *match, + const struct seccomp_data *sd) { int err; + u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; @@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall, if (err == 0) { ret = n.val; err = n.error; + flags = n.flags; } /* @@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall, list_del(&n.list); out: mutex_unlock(&match->notify_lock); + + /* Userspace requests to continue the syscall. 
*/ + if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return 0; + syscall_set_return_value(current, task_pt_regs(current), err, ret); + return -1; } static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, @@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_USER_NOTIF: - seccomp_do_user_notification(this_syscall, match, sd); - goto skip; + if (seccomp_do_user_notification(this_syscall, match, sd)) + goto skip; + + return 0; case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); @@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (copy_from_user(&resp, buf, sizeof(resp))) return -EFAULT; - if (resp.flags) + if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return -EINVAL; + + if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) && + (resp.error || resp.val)) return -EINVAL; ret = mutex_lock_interruptible(&filter->notify_lock); @@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = resp.error; knotif->val = resp.val; + knotif->flags = resp.flags; complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); -- cgit From a23740ec43ba022dbfd139d0fe3eff193216272b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 9 Oct 2019 13:14:57 -0700 Subject: bpf: Track contents of read-only maps as scalars Maps that are read-only both from BPF program side and user space side have their contents constant, so verifier can track referenced values precisely and use that knowledge for dead code elimination, branch pruning, etc. This patch teaches BPF verifier how to do this. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191009201458.2679171-2-andriin@fb.com --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffc3e53f5300..b818fed3208d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2739,6 +2739,41 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) reg->smax_value = reg->umax_value; } +static bool bpf_map_is_rdonly(const struct bpf_map *map) +{ + return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; +} + +static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) +{ + void *ptr; + u64 addr; + int err; + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) + return err; + ptr = (void *)addr + off; + + switch (size) { + case sizeof(u8): + *val = (u64)*(u8 *)ptr; + break; + case sizeof(u16): + *val = (u64)*(u16 *)ptr; + break; + case sizeof(u32): + *val = (u64)*(u32 *)ptr; + break; + case sizeof(u64): + *val = *(u64 *)ptr; + break; + default: + return -EINVAL; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -2776,9 +2811,27 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); + if (!err && t == BPF_READ && value_regno >= 0) { + struct bpf_map *map = reg->map_ptr; + + /* if map is 
read-only, track its contents as scalars */ + if (tnum_is_const(reg->var_off) && + bpf_map_is_rdonly(map) && + map->ops->map_direct_value_addr) { + int map_off = off + reg->var_off.value; + u64 val = 0; + err = bpf_map_direct_read(map, map_off, size, + &val); + if (err) + return err; + + regs[value_regno].type = SCALAR_VALUE; + __mark_reg_known(®s[value_regno], val); + } else { + mark_reg_unknown(env, regs, value_regno); + } + } } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; -- cgit From 937c6b27c73e02cd4114f95f5c37ba2c29fadba1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 9 Oct 2019 17:02:30 +0200 Subject: cgroup: freezer: call cgroup_enter_frozen() with preemption disabled in ptrace_stop() ptrace_stop() does preempt_enable_no_resched() to avoid the preemption, but after that cgroup_enter_frozen() does spin_lock/unlock and this adds another preemption point. Reported-and-tested-by: Bruce Ashfield Fixes: 76f969e8948d ("cgroup: cgroup v2 freezer") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Oleg Nesterov Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c4da1ef56fdf..bcd46f547db3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2205,8 +2205,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t */ preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); cgroup_enter_frozen(); + preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { -- cgit From 2dedd7d2165565bafa89718eaadfc5d1a7865f66 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 11 Oct 2019 10:20:53 -0700 Subject: bpf: Fix cast to pointer from integer of different size warning Fix "warning: cast to pointer from integer of different size" when casting u64 addr to void *. Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: kbuild test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191011172053.2980619-1-andriin@fb.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b818fed3208d..d3446f018b9a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2753,7 +2753,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) err = map->ops->map_direct_value_addr(map, &addr, off); if (err) return err; - ptr = (void *)addr + off; + ptr = (void *)(long)addr + off; switch (size) { case sizeof(u8): -- cgit From ff229eee3d897f52bd001c841f2d3cce8853ecdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 10:32:04 -0700 Subject: hrtimer: Annotate lockless access to timer->base Followup to commit dd2261ed45aa ("hrtimer: Protect lockless access to timer->base") lock_hrtimer_base() fetches timer->base without lock exclusion. Compiler is allowed to read timer->base twice (even if considered dumb) which could end up trying to lock migration_base and return &migration_base. base = timer->base; if (likely(base != &migration_base)) { /* compiler reads timer->base again, and now (base == &migration_base) raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* == &migration_base ! 
*/ Similarly the write sides must use WRITE_ONCE() to avoid store tearing. Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191008173204.180879-1-edumazet@google.com --- kernel/time/hrtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0d4dc241c0fb..65605530ee34 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -164,7 +164,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, struct hrtimer_clock_base *base; for (;;) { - base = timer->base; + base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -244,7 +244,7 @@ again: return base; /* See the comment in lock_hrtimer_base() */ - timer->base = &migration_base; + WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -253,10 +253,10 @@ again: raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; - timer->base = base; + WRITE_ONCE(timer->base, base); goto again; } - timer->base = new_base; + WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { -- cgit From b67114db64adcb0aaff0fe0bb2a84ede0a99aaf2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 4 Oct 2019 13:10:09 +0200 Subject: parisc: sysctl.c: Use CONFIG_PARISC instead of __hppa_ define Signed-off-by: Helge Deller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 00fcea236eba..b6f2f35d0bcf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,7 +163,7 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC extern int pwrsw_enabled; #endif @@ -620,7 +620,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC { .procname = "soft-power", .data = &pwrsw_enabled, -- cgit From 15d42eb26bdee47c0278fbdab4198577bc6a97b5 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Mon, 14 Oct 2019 18:20:32 +0200 Subject: pidfd: add NSpid entries to fdinfo Currently, the fdinfo file contains the Pid field which shows the pid a given pidfd refers to in the pid namespace of the procfs instance. If pid namespaces are configured, also show an NSpid field for easy retrieval of the pid in all descendant pid namespaces. If the pid namespace of the process is not a descendant of the pid namespace of the procfs instance 0 will be shown as its first NSpid entry and no other entries will be shown. Add a block comment to pidfd_show_fdinfo with a detailed explanation of Pid and NSpid fields. 
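For illustration, the fdinfo of a pidfd now looks roughly like this (made-up values; per the above, the first NSpid entry always equals Pid):

# cat /proc/self/fdinfo/3
pos:	0
flags:	02000002
mnt_id:	13
Pid:	1354
NSpid:	1354	6	1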
Co-developed-by: Christian Brauner Signed-off-by: Christian Brauner Signed-off-by: Christian Kellner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191014162034.2185-1-ckellner@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..782986962d47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1695,12 +1695,63 @@ static int pidfd_release(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc//status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. where no ancestoral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/ and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; + pid_t nr = pid_nr_ns(pid, ns); + + seq_put_decimal_ull(m, "Pid:\t", nr); - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); +#ifdef CONFIG_PID_NS + seq_put_decimal_ull(m, "\nNSpid:\t", nr); + if (nr) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + } +#endif seq_putc(m, '\n'); } #endif -- cgit From bc88f85c6c09306bd21917e1ae28205e9cd775a7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 16 Oct 2019 12:24:58 +0100 Subject: kthread: make __kthread_queue_delayed_work static The __kthread_queue_delayed_work is not exported so make it static, to avoid the following sparse warning: kernel/kthread.c:869:6: warning: symbol '__kthread_queue_delayed_work' was not declared. Should it be static? 
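(For reference, warnings like the one quoted above typically come from a sparse build, e.g. something along the lines of "make C=1 kernel/kthread.o" with sparse installed.)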
Signed-off-by: Ben Dooks Signed-off-by: Linus Torvalds --- kernel/kthread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 621467c33fef..b262f47046ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -866,9 +866,9 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); -void __kthread_queue_delayed_work(struct kthread_worker *worker, - struct kthread_delayed_work *dwork, - unsigned long delay) +static void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; -- cgit From eac9153f2b584c702cea02c1f1a57d85aa9aea42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 14 Oct 2019 10:12:23 -0700 Subject: bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack() bpf stackmap with build-id lookup (BPF_F_STACK_BUILD_ID) can trigger A-A deadlock on rq_lock(): rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [...] Call Trace: try_to_wake_up+0x1ad/0x590 wake_up_q+0x54/0x80 rwsem_wake+0x8a/0xb0 bpf_get_stack+0x13c/0x150 bpf_prog_fbdaf42eded9fe46_on_event+0x5e3/0x1000 bpf_overflow_handler+0x60/0x100 __perf_event_overflow+0x4f/0xf0 perf_swevent_overflow+0x99/0xc0 ___perf_sw_event+0xe7/0x120 __schedule+0x47d/0x620 schedule+0x29/0x90 futex_wait_queue_me+0xb9/0x110 futex_wait+0x139/0x230 do_futex+0x2ac/0xa50 __x64_sys_futex+0x13c/0x180 do_syscall_64+0x42/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This can be reproduced by: 1. Start a multi-thread program that does parallel mmap() and malloc(); 2. taskset the program to 2 CPUs; 3. Attach bpf program to trace_sched_switch and gather stackmap with build-id, e.g. with trace.py from bcc tools: trace.py -U -p -s t:sched:sched_switch A sample reproducer is attached at the end. This could also trigger deadlock with other locks that are nested with rq_lock. Fix this by checking whether irqs are disabled. Since rq_lock and all other nested locks are irq safe, it is safe to do up_read() when irqs are not disable. If the irqs are disabled, postpone up_read() in irq_work. 
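In short, the deferral condition widens from in_nmi() to irqs_disabled() (sketch; the full hunk in stack_map_get_build_id_offset() is below):

	if (irqs_disabled()) {	/* was: if (in_nmi()) */
		work = this_cpu_ptr(&up_read_work);	/* up_read() will run later from irq_work */

because up_read() can wake waiters and take rq_lock, which is unsafe whenever irq-safe locks such as rq_lock may already be held, not only in NMI context.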
Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191014171223.357174-1-songliubraving@fb.com Reproducer: ============================ 8< ============================ char *filename; void *worker(void *p) { void *ptr; int fd; char *pptr; fd = open(filename, O_RDONLY); if (fd < 0) return NULL; while (1) { struct timespec ts = {0, 1000 + rand() % 2000}; ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0); usleep(1); if (ptr == MAP_FAILED) { printf("failed to mmap\n"); break; } munmap(ptr, 4096 * 64); usleep(1); pptr = malloc(1); usleep(1); pptr[0] = 1; usleep(1); free(pptr); usleep(1); nanosleep(&ts, NULL); } close(fd); return NULL; } int main(int argc, char *argv[]) { void *ptr; int i; pthread_t threads[THREAD_COUNT]; if (argc < 2) return 0; filename = argv[1]; for (i = 0; i < THREAD_COUNT; i++) { if (pthread_create(threads + i, NULL, worker, NULL)) { fprintf(stderr, "Error creating thread\n"); return 0; } } for (i = 0; i < THREAD_COUNT; i++) pthread_join(threads[i], NULL); return 0; } ============================ 8< ============================ --- kernel/bpf/stackmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..173e983619d7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -287,7 +287,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, bool irq_work_busy = false; struct stack_map_irq_work *work = NULL; - if (in_nmi()) { + if (irqs_disabled()) { work = this_cpu_ptr(&up_read_work); if (work->irq_work.flags & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ @@ -295,8 +295,9 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } /* - * We cannot do up_read() in nmi context. To do build_id lookup - * in nmi context, we need to run up_read() in irq_work. We use + * We cannot do up_read() when the irq is disabled, because of + * risk to deadlock with rq_lock. To do build_id lookup when the + * irqs are disabled, we need to run up_read() in irq_work. We use * a percpu variable to do the irq_work. If the irq_work is * already used by another lookup, we fall back to report ips. * -- cgit From 700dea5a0bea9f64eba89fae7cb2540326fdfdc1 Mon Sep 17 00:00:00 2001 From: Dmitry Goldin Date: Wed, 9 Oct 2019 13:42:14 +0000 Subject: kheaders: substituting --sort in archive creation The option --sort=ORDER was only introduced in tar 1.28 (2014), which is rather new and might not be available in some setups. This patch tries to replicate the previous behaviour as closely as possible to fix the kheaders build for older environments. It does not produce identical archives compared to the previous version due to minor sorting differences but produces reproducible results itself in my tests. 
Reported-by: Andreas Schwab Signed-off-by: Dmitry Goldin Tested-by: Andreas Schwab Tested-by: Quentin Perret Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index aff79e461fc9..5a0fc0b0403a 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,10 +71,13 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -# Create archive and try to normalize metadata for reproducibility -tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --owner=0 --group=0 --sort=name --numeric-owner \ - -Jcf $tarfile -C $cpio_dir/ . > /dev/null +# Create archive and try to normalize metadata for reproducibility. +# For compatibility with older versions of tar, files are fed to tar +# pre-sorted, as --sort=name might not be available. +find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ + tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --numeric-owner --no-recursion \ + -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 -- cgit From b1fc5833357524d5d342737913dbe32ff3557bc5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2019 11:45:36 +0100 Subject: stop_machine: Avoid potential race behaviour Both multi_cpu_stop() and set_state() access multi_stop_data::state racily using plain accesses. These are subject to compiler transformations which could break the intended behaviour of the code, and this situation is detected by KCSAN on both arm64 and x86 (splats below). Improve matters by using READ_ONCE() and WRITE_ONCE() to ensure that the compiler cannot elide, replay, or tear loads and stores. In multi_cpu_stop() the two loads of multi_stop_data::state are expected to be a consistent value, so snapshot the value into a temporary variable to ensure this. The state transitions are serialized by atomic manipulation of multi_stop_data::num_threads, and other fields in multi_stop_data are not modified while subject to concurrent reads. 
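For example, without READ_ONCE() the compiler may legally reload msdata->state between the comparison and the assignment in multi_cpu_stop(), so the two plain reads can observe different states (sketch of the old code):

	if (msdata->state != curstate) {	/* first read */
		curstate = msdata->state;	/* second read, may already differ */
		...
	}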
KCSAN splat on arm64: | BUG: KCSAN: data-race in multi_cpu_stop+0xa8/0x198 and set_state+0x80/0xb0 | | write to 0xffff00001003bd00 of 4 bytes by task 24 on cpu 3: | set_state+0x80/0xb0 | multi_cpu_stop+0x16c/0x198 | cpu_stopper_thread+0x170/0x298 | smpboot_thread_fn+0x40c/0x560 | kthread+0x1a8/0x1b0 | ret_from_fork+0x10/0x18 | | read to 0xffff00001003bd00 of 4 bytes by task 14 on cpu 1: | multi_cpu_stop+0xa8/0x198 | cpu_stopper_thread+0x170/0x298 | smpboot_thread_fn+0x40c/0x560 | kthread+0x1a8/0x1b0 | ret_from_fork+0x10/0x18 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 1 PID: 14 Comm: migration/1 Not tainted 5.3.0-00007-g67ab35a199f4-dirty #3 | Hardware name: linux,dummy-virt (DT) KCSAN splat on x86: | write to 0xffffb0bac0013e18 of 4 bytes by task 19 on cpu 2: | set_state kernel/stop_machine.c:170 [inline] | ack_state kernel/stop_machine.c:177 [inline] | multi_cpu_stop+0x1a4/0x220 kernel/stop_machine.c:227 | cpu_stopper_thread+0x19e/0x280 kernel/stop_machine.c:516 | smpboot_thread_fn+0x1a8/0x300 kernel/smpboot.c:165 | kthread+0x1b5/0x200 kernel/kthread.c:255 | ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:352 | | read to 0xffffb0bac0013e18 of 4 bytes by task 44 on cpu 7: | multi_cpu_stop+0xb4/0x220 kernel/stop_machine.c:213 | cpu_stopper_thread+0x19e/0x280 kernel/stop_machine.c:516 | smpboot_thread_fn+0x1a8/0x300 kernel/smpboot.c:165 | kthread+0x1b5/0x200 kernel/kthread.c:255 | ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:352 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 7 PID: 44 Comm: migration/7 Not tainted 5.3.0+ #1 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Acked-by: Marco Elver Link: https://lkml.kernel.org/r/20191007104536.27276-1-mark.rutland@arm.com --- kernel/stop_machine.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c7031a22aa7b..998d50ee2d9b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -7,6 +7,7 @@ * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo */ +#include #include #include #include @@ -167,7 +168,7 @@ static void set_state(struct multi_stop_data *msdata, /* Reset ack counter. */ atomic_set(&msdata->thread_ack, msdata->num_threads); smp_wmb(); - msdata->state = newstate; + WRITE_ONCE(msdata->state, newstate); } /* Last one to ack a state moves to the next state. */ @@ -186,7 +187,7 @@ void __weak stop_machine_yield(const struct cpumask *cpumask) static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; - enum multi_stop_state curstate = MULTI_STOP_NONE; + enum multi_stop_state newstate, curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; const struct cpumask *cpumask; unsigned long flags; @@ -210,8 +211,9 @@ static int multi_cpu_stop(void *data) do { /* Chill out and ensure we re-read multi_stop_state. */ stop_machine_yield(cpumask); - if (msdata->state != curstate) { - curstate = msdata->state; + newstate = READ_ONCE(msdata->state); + if (newstate != curstate) { + curstate = newstate; switch (curstate) { case MULTI_STOP_DISABLE_IRQ: local_irq_disable(); -- cgit From 3d6d8da48d0b214d65ea0227d47228abc75d7c88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:28 +0200 Subject: pidfd: check pid has attached task in fdinfo Currently, when a task is dead we still print the pid it used to use in the fdinfo files of its pidfds. 
This doesn't make much sense since the pid may have already been reused. So verify that the task is still alive by introducing the pid_has_task() helper which will be used by other callers in follow-up patches. If the task is not alive anymore, we will print -1. This allows us to differentiate between a task not being present in a given pid namespace - in which case we already print 0 - and a task having been reaped. Note that this uses PIDTYPE_PID for the check. Technically, we could've checked PIDTYPE_TGID since pidfds currently only refer to thread-group leaders but if they won't anymore in the future then this check becomes problematic without it being immediately obvious to non-experts imho. If a thread is created via clone(CLONE_THREAD) than struct pid has a single non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even though the thread-group leader might still be very much alive. So checking PIDTYPE_PID is fine and is easier to maintain should we ever allow pidfds to refer to threads. Cc: Jann Horn Cc: Christian Kellner Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-1-christian.brauner@ubuntu.com --- kernel/fork.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 782986962d47..ffa314838b43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1732,15 +1732,20 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; - pid_t nr = pid_nr_ns(pid, ns); + struct pid_namespace *ns; + pid_t nr = -1; - seq_put_decimal_ull(m, "Pid:\t", nr); + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS - seq_put_decimal_ull(m, "\nNSpid:\t", nr); - if (nr) { + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that @@ -1749,7 +1754,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); -- cgit From 1d416a113f0c0da7a3608ffe4d44bd8e9c4859fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:30 +0200 Subject: pid: use pid_has_task() in __change_pid() Replace hlist_empty() with the new pid_has_task() helper which is more idiomatic, easier to grep for, and unifies how callers perform this check. 
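The pid_has_task() helper itself is added to include/linux/pid.h, outside the kernel/ hunks quoted here; conceptually it is just a named wrapper around the open-coded hlist test, roughly:

/* Rough shape of the helper (the real definition lives in
 * include/linux/pid.h and uses the kernel's struct pid / hlist types). */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
	return !hlist_empty(&pid->tasks[type]);
}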
Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-3-christian.brauner@ubuntu.com --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 0a9f2e437217..124d40b239b1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -299,7 +299,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type, *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (!hlist_empty(&pid->tasks[tmp])) + if (pid_has_task(pid, tmp)) return; free_pid(pid); -- cgit From 1722c14a2097634a7ba37000c0ec7d9409918b64 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:31 +0200 Subject: exit: use pid_has_task() in do_wait() Replace hlist_empty() with the new pid_has_task() helper which is more idiomatic, easier to grep for, and unifies how callers perform this check. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-4-christian.brauner@ubuntu.com --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..f2d20ab74422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1457,7 +1457,7 @@ repeat: */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) + (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; set_current_state(TASK_INTERRUPTIBLE); -- cgit From 1e1d0f0b1a3e3533ea4cd4021eb251e53827c70b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:32 +0200 Subject: pid: use pid_has_task() in pidfd_open() Use the new pid_has_task() helper in pidfd_open(). This simplifies the code and avoids taking rcu_read_{lock,unlock}() and leads to overall nicer code. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-5-christian.brauner@ubuntu.com --- kernel/pid.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 124d40b239b1..7b5f6c963d72 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -497,7 +497,7 @@ static int pidfd_create(struct pid *pid) */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { - int fd, ret; + int fd; struct pid *p; if (flags) @@ -510,13 +510,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - ret = 0; - rcu_read_lock(); - if (!pid_task(p, PIDTYPE_TGID)) - ret = -EINVAL; - rcu_read_unlock(); + if (pid_has_task(p, PIDTYPE_TGID)) + fd = pidfd_create(p); + else + fd = -EINVAL; - fd = ret ?: pidfd_create(p); put_pid(p); return fd; } -- cgit From 8580ac9404f6240668a026785d7d8856f0530409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:57 -0700 Subject: bpf: Process in-kernel BTF If in-kernel BTF exists parse it and prepare 'struct btf *btf_vmlinux' for further use by the verifier. In-kernel BTF is trusted just like kallsyms and other build artifacts embedded into vmlinux. Yet run this BTF image through BTF verifier to make sure that it is valid and it wasn't mangled during the build. If in-kernel BTF is incorrect it means either gcc or pahole or kernel are buggy. In such case disallow loading BPF programs. 
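The _binary__btf_vmlinux_bin_start/_end symbols referenced by this patch follow the usual convention for objects generated from raw binary files, and the __weak annotation lets vmlinux link even when no BTF blob is built in (unresolved weak symbols resolve to zero, so the computed size is zero). A generic userspace illustration of that convention; the file name, link command and symbol names below are examples, not the kernel's actual build rule:

/* ld -r -b binary -o blob.o blob.bin
 * exposes _binary_blob_bin_start / _binary_blob_bin_end, which bound the
 * embedded bytes much like the BTF symbols bound the vmlinux BTF image. */
#include <stdio.h>

extern const char _binary_blob_bin_start[];
extern const char _binary_blob_bin_end[];

int main(void)
{
	size_t size = _binary_blob_bin_end - _binary_blob_bin_start;

	printf("embedded blob: %zu bytes\n", size);
	return 0;
}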
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-4-ast@kernel.org --- kernel/bpf/btf.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 20 +++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 29c7c06c6bd6..ddeab1e8d21e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -698,6 +698,13 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + /* btf verifier prints all types it is processing via + * btf_verifier_log_type(..., fmt = NULL). + * Skip those prints for in-kernel BTF verification. + */ + if (log->level == BPF_LOG_KERNEL && !fmt) + return; + __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], @@ -735,6 +742,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + if (log->level == BPF_LOG_KERNEL && !fmt) + return; /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in @@ -777,6 +786,8 @@ static void btf_verifier_log_vsi(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + if (log->level == BPF_LOG_KERNEL && !fmt) + return; if (env->phase != CHECK_META) btf_verifier_log_type(env, datasec_type, NULL); @@ -802,6 +813,8 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + if (log->level == BPF_LOG_KERNEL) + return; hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); @@ -2405,7 +2418,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - + if (env->log.level == BPF_LOG_KERNEL) + continue; btf_verifier_log(env, "\t%s val=%d\n", __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); @@ -3367,6 +3381,61 @@ errout: return ERR_PTR(err); } +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; + +struct btf *btf_parse_vmlinux(void) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + int err; + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->data = _binary__btf_vmlinux_bin_start; + btf->data_size = _binary__btf_vmlinux_bin_end - + _binary__btf_vmlinux_bin_start; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d3446f018b9a..466b3b19de4d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -207,6 +207,8 @@ struct bpf_call_arg_meta { int func_id; }; +struct btf *btf_vmlinux; + static 
DEFINE_MUTEX(bpf_verifier_lock); static const struct bpf_line_info * @@ -243,6 +245,10 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, n = min(log->len_total - log->len_used - 1, n); log->kbuf[n] = '\0'; + if (log->level == BPF_LOG_KERNEL) { + pr_err("BPF:%s\n", log->kbuf); + return; + } if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) log->len_used += n; else @@ -9294,6 +9300,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->ops = bpf_verifier_ops[env->prog->type]; is_priv = capable(CAP_SYS_ADMIN); + if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + mutex_lock(&bpf_verifier_lock); + if (!btf_vmlinux) + btf_vmlinux = btf_parse_vmlinux(); + mutex_unlock(&bpf_verifier_lock); + } + /* grab the mutex to protect few globals used by verifier */ if (!is_priv) mutex_lock(&bpf_verifier_lock); @@ -9313,6 +9326,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto err_unlock; } + if (IS_ERR(btf_vmlinux)) { + /* Either gcc or pahole or kernel are broken. */ + verbose(env, "in-kernel BTF is malformed\n"); + ret = PTR_ERR(btf_vmlinux); + goto err_unlock; + } + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; -- cgit From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:58 -0700 Subject: bpf: Add attach_btf_id attribute to program load Add attach_btf_id attribute to prog_load command. It's similar to existing expected_attach_type attribute which is used in several cgroup based program types. Unfortunately expected_attach_type is ignored for tracing programs and cannot be reused for new purpose. Hence introduce attach_btf_id to verify bpf programs against given in-kernel BTF type id at load time. It is strictly checked to be valid for raw_tp programs only. In a later patches it will become: btf_id == 0 semantics of existing raw_tp progs. btd_id > 0 raw_tp with BTF and additional type safety. 
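The corresponding uapi change (the attach_btf_id field of union bpf_attr) is not part of the kernel/ hunks shown here. Assuming that field name, a hedged userspace sketch of loading a raw_tracepoint program with a BTF type id could look like this; instruction and license handling is reduced to the minimum:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_raw_tp_prog(const struct bpf_insn *insns, __u32 insn_cnt,
			    __u32 attach_btf_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type     = BPF_PROG_TYPE_RAW_TRACEPOINT;
	attr.insns         = (__u64)(unsigned long)insns;
	attr.insn_cnt      = insn_cnt;
	attr.license       = (__u64)(unsigned long)"GPL";
	attr.attach_btf_id = attach_btf_id;	/* 0 keeps the old raw_tp semantics */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}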
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org --- kernel/bpf/syscall.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 82eabd4e38ad..b56c482c9760 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1565,8 +1566,9 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) } static int -bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, - enum bpf_attach_type expected_attach_type) +bpf_prog_load_check_attach(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type, + u32 btf_id) { switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: @@ -1608,13 +1610,19 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (btf_id > BTF_MAX_TYPE) + return -EINVAL; + return 0; default: + if (btf_id) + return -EINVAL; return 0; } } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt +#define BPF_PROG_LOAD_LAST_FIELD attach_btf_id static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1656,7 +1664,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return -EPERM; bpf_prog_load_fixup_attach_type(attr); - if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + if (bpf_prog_load_check_attach(type, attr->expected_attach_type, + attr->attach_btf_id)) return -EINVAL; /* plain bpf_prog allocation */ @@ -1665,6 +1674,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return -ENOMEM; prog->expected_attach_type = attr->expected_attach_type; + prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->offload_requested = !!attr->prog_ifindex; -- cgit From 9e15db66136a14cde3f35691f1d839d950118826 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:00 -0700 Subject: bpf: Implement accurate raw_tp context access via BTF libbpf analyzes bpf C program, searches in-kernel BTF for given type name and stores it into expected_attach_type. The kernel verifier expects this btf_id to point to something like: typedef void (*btf_trace_kfree_skb)(void *, struct sk_buff *skb, void *loc); which represents signature of raw_tracepoint "kfree_skb". Then btf_ctx_access() matches ctx+0 access in bpf program with 'skb' and 'ctx+8' access with 'loc' arguments of "kfree_skb" tracepoint. In first case it passes btf_id of 'struct sk_buff *' back to the verifier core and 'void *' in second case. Then the verifier tracks PTR_TO_BTF_ID as any other pointer type. Like PTR_TO_SOCKET points to 'struct bpf_sock', PTR_TO_TCP_SOCK points to 'struct bpf_tcp_sock', and so on. PTR_TO_BTF_ID points to in-kernel structs. If 1234 is btf_id of 'struct sk_buff' in vmlinux's BTF then PTR_TO_BTF_ID#1234 points to one of in kernel skbs. When PTR_TO_BTF_ID#1234 is dereferenced (like r2 = *(u64 *)r1 + 32) the btf_struct_access() checks which field of 'struct sk_buff' is at offset 32. Checks that size of access matches type definition of the field and continues to track the dereferenced type. 
If that field was a pointer to 'struct net_device' the r2's type will be PTR_TO_BTF_ID#456. Where 456 is btf_id of 'struct net_device' in vmlinux's BTF. Such verifier analysis prevents "cheating" in BPF C program. The program cannot cast arbitrary pointer to 'struct sk_buff *' and access it. C compiler would allow type cast, of course, but the verifier will notice type mismatch based on BPF assembly and in-kernel BTF. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-7-ast@kernel.org --- kernel/bpf/btf.c | 190 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 88 +++++++++++++++++++++- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 276 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ddeab1e8d21e..271d27cd427f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3436,6 +3436,196 @@ errout: return ERR_PTR(err); } +extern struct btf *btf_vmlinux; + +bool btf_ctx_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct bpf_verifier_log *log = info->log; + u32 btf_id = prog->aux->attach_btf_id; + const struct btf_param *args; + const struct btf_type *t; + const char prefix[] = "btf_trace_"; + const char *tname; + u32 nr_args, arg; + + if (!btf_id) + return true; + + if (IS_ERR(btf_vmlinux)) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return false; + } + + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { + bpf_log(log, "btf_id is invalid\n"); + return false; + } + + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (strncmp(prefix, tname, sizeof(prefix) - 1)) { + bpf_log(log, "btf_id points to wrong type name %s\n", tname); + return false; + } + tname += sizeof(prefix) - 1; + + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + return false; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return false; + + if (off % 8) { + bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n", + tname, off); + return false; + } + arg = off / 8; + args = (const struct btf_param *)(t + 1); + /* skip first 'void *__data' argument in btf_trace_##name typedef */ + args++; + nr_args = btf_type_vlen(t) - 1; + if (arg >= nr_args) { + bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n", + tname, arg); + return false; + } + + t = btf_type_by_id(btf_vmlinux, args[arg].type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf_vmlinux, t->type); + if (btf_type_is_int(t)) + /* accessing a scalar */ + return true; + if (!btf_type_is_ptr(t)) { + bpf_log(log, + "raw_tp '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", + tname, arg, + __btf_name_by_offset(btf_vmlinux, t->name_off), + btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + if (t->type == 0) + /* This is a pointer to void. + * It is the same as scalar from the verifier safety pov. + * No further pointer walking is allowed. 
+ */ + return true; + + /* this is a pointer to another type */ + info->reg_type = PTR_TO_BTF_ID; + info->btf_id = t->type; + + t = btf_type_by_id(btf_vmlinux, t->type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_struct(t)) { + bpf_log(log, + "raw_tp '%s' arg%d type %s is not a struct\n", + tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n", + tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], + __btf_name_by_offset(btf_vmlinux, t->name_off)); + return true; +} + +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id) +{ + const struct btf_member *member; + const struct btf_type *mtype; + const char *tname, *mname; + int i, moff = 0, msize; + +again: + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (!btf_type_is_struct(t)) { + bpf_log(log, "Type '%s' is not a struct", tname); + return -EINVAL; + } + + for_each_member(i, t, member) { + /* offset of the field in bits */ + moff = btf_member_bit_offset(t, member); + + if (btf_member_bitfield_size(t, member)) + /* bitfields are not supported yet */ + continue; + + if (off + size <= moff / 8) + /* won't find anything, field is already too far */ + break; + + /* type of the field */ + mtype = btf_type_by_id(btf_vmlinux, member->type); + mname = __btf_name_by_offset(btf_vmlinux, member->name_off); + + /* skip modifiers */ + while (btf_type_is_modifier(mtype)) + mtype = btf_type_by_id(btf_vmlinux, mtype->type); + + if (btf_type_is_array(mtype)) + /* array deref is not supported yet */ + continue; + + if (!btf_type_has_size(mtype) && !btf_type_is_ptr(mtype)) { + bpf_log(log, "field %s doesn't have size\n", mname); + return -EFAULT; + } + if (btf_type_is_ptr(mtype)) + msize = 8; + else + msize = mtype->size; + if (off >= moff / 8 + msize) + /* no overlap with member, keep iterating */ + continue; + /* the 'off' we're looking for is either equal to start + * of this field or inside of this struct + */ + if (btf_type_is_struct(mtype)) { + /* our field must be inside that union or struct */ + t = mtype; + + /* adjust offset we're looking for */ + off -= moff / 8; + goto again; + } + if (msize != size) { + /* field access size doesn't match */ + bpf_log(log, + "cannot access %d bytes in struct %s field %s that has size %d\n", + size, tname, mname, msize); + return -EACCES; + } + + if (btf_type_is_ptr(mtype)) { + const struct btf_type *stype; + + stype = btf_type_by_id(btf_vmlinux, mtype->type); + /* skip modifiers */ + while (btf_type_is_modifier(stype)) + stype = btf_type_by_id(btf_vmlinux, stype->type); + if (btf_type_is_struct(stype)) { + *next_btf_id = mtype->type; + return PTR_TO_BTF_ID; + } + } + /* all other fields are treated as scalars */ + return SCALAR_VALUE; + } + bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); + return -EINVAL; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 466b3b19de4d..42a463e09761 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -286,6 +286,19 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, + const char *fmt, ...) 
+{ + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + static const char *ltrim(const char *s) { while (isspace(*s)) @@ -406,6 +419,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", }; static char slot_type_char[] = { @@ -436,6 +450,12 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env, return cur->frame[reg->frameno]; } +const char *kernel_type_name(u32 id) +{ + return btf_name_by_offset(btf_vmlinux, + btf_type_by_id(btf_vmlinux, id)->name_off); +} + static void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) { @@ -460,6 +480,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { + if (t == PTR_TO_BTF_ID) + verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); @@ -2337,10 +2359,12 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type) + enum bpf_access_type t, enum bpf_reg_type *reg_type, + u32 *btf_id) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, + .log = &env->log, }; if (env->ops->is_valid_access && @@ -2354,7 +2378,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + if (*reg_type == PTR_TO_BTF_ID) + *btf_id = info.btf_id; + else + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -2780,6 +2807,53 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) return 0; } +static int check_ptr_to_btf_access(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + int regno, int off, int size, + enum bpf_access_type atype, + int value_regno) +{ + struct bpf_reg_state *reg = regs + regno; + const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); + const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); + u32 btf_id; + int ret; + + if (atype != BPF_READ) { + verbose(env, "only read is supported\n"); + return -EACCES; + } + + if (off < 0) { + verbose(env, + "R%d is ptr_%s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", + regno, tname, off, tn_buf); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (ret < 0) + return ret; + + if (ret == SCALAR_VALUE) { + mark_reg_unknown(env, regs, value_regno); + return 0; + } + mark_reg_known_zero(env, regs, value_regno); + regs[value_regno].type = PTR_TO_BTF_ID; + regs[value_regno].btf_id = btf_id; + return 0; +} + /* check whether 
memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -2840,6 +2914,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; + u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -2851,7 +2926,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id); + if (err) + verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -2870,6 +2947,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; + if (reg_type == PTR_TO_BTF_ID) + regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; } @@ -2929,6 +3008,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_BTF_ID) { + err = check_ptr_to_btf_access(env, regs, regno, off, size, t, + value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..6221e8c6ecc3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1074,7 +1074,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size, return false; if (off % size != 0) return false; - return true; + return btf_ctx_access(off, size, type, prog, info); } const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { -- cgit From ac4414b5ca47d16c8de3134cc1b868056c4a68ea Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:01 -0700 Subject: bpf: Attach raw_tp program with BTF via type name BTF type id specified at program load time has all necessary information to attach that program to raw tracepoint. Use kernel type name to find raw tracepoint. Add missing CHECK_ATTR() condition. 
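On the user-space side this means the attach step no longer supplies a name at all when the program was loaded with attach_btf_id; a minimal sketch against the existing BPF_RAW_TRACEPOINT_OPEN command (illustrative only):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_btf_raw_tp(int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.raw_tracepoint.name    = 0;	/* kernel derives it from attach_btf_id */
	attr.raw_tracepoint.prog_fd = prog_fd;

	return syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
}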
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-8-ast@kernel.org --- kernel/bpf/syscall.c | 70 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b56c482c9760..523e3ac15a08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1816,17 +1816,52 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) struct bpf_raw_tracepoint *raw_tp; struct bpf_raw_event_map *btp; struct bpf_prog *prog; - char tp_name[128]; + const char *tp_name; + char buf[128]; int tp_fd, err; - if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(tp_name) - 1) < 0) - return -EFAULT; - tp_name[sizeof(tp_name) - 1] = 0; + if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) + return -EINVAL; + + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } + + if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->aux->attach_btf_id) { + if (attr->raw_tracepoint.name) { + /* raw_tp name should not be specified in raw_tp + * programs that were verified via in-kernel BTF info + */ + err = -EINVAL; + goto out_put_prog; + } + /* raw_tp name is taken from type name instead */ + tp_name = kernel_type_name(prog->aux->attach_btf_id); + /* skip the prefix */ + tp_name += sizeof("btf_trace_") - 1; + } else { + if (strncpy_from_user(buf, + u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(buf) - 1) < 0) { + err = -EFAULT; + goto out_put_prog; + } + buf[sizeof(buf) - 1] = 0; + tp_name = buf; + } btp = bpf_get_raw_tracepoint(tp_name); - if (!btp) - return -ENOENT; + if (!btp) { + err = -ENOENT; + goto out_put_prog; + } raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); if (!raw_tp) { @@ -1834,38 +1869,27 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_btp; } raw_tp->btp = btp; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto out_free_tp; - } - if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { - err = -EINVAL; - goto out_put_prog; - } + raw_tp->prog = prog; err = bpf_probe_register(raw_tp->btp, prog); if (err) - goto out_put_prog; + goto out_free_tp; - raw_tp->prog = prog; tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, O_CLOEXEC); if (tp_fd < 0) { bpf_probe_unregister(raw_tp->btp, prog); err = tp_fd; - goto out_put_prog; + goto out_free_tp; } return tp_fd; -out_put_prog: - bpf_prog_put(prog); out_free_tp: kfree(raw_tp); out_put_btp: bpf_put_raw_tracepoint(btp); +out_put_prog: + bpf_prog_put(prog); return err; } -- cgit From 2a02759ef5f8a34792df22b41d5e10658fd7bbd3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:02 -0700 Subject: bpf: Add support for BTF pointers to interpreter Pointer to BTF object is a pointer to kernel object or NULL. The memory access in the interpreter has to be done via probe_kernel_read to avoid page faults. 
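Put differently, a load through a PTR_TO_BTF_ID pointer gets the same contract as bpf_probe_read(): on a fault the destination simply reads as zero and execution continues. A rough kernel-context sketch of that contract, using the probe_kernel_read() name of this era (the probe_load name is invented for the illustration):

/* Fault-tolerant read of 'size' bytes from a kernel address; any fault
 * is absorbed and the caller sees zero instead of an oops. */
static u64 probe_load(const void *kernel_ptr, int size)
{
	u64 dst = 0;

	if (probe_kernel_read(&dst, kernel_ptr, size))
		return 0;
	return dst;
}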
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-9-ast@kernel.org --- kernel/bpf/core.c | 19 +++++++++++++++++++ kernel/bpf/verifier.c | 8 ++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 66088a9e9b9e..8a765bbd33f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1291,6 +1291,11 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON +u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr) +{ + memset(dst, 0, size); + return -EFAULT; +} /** * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1310,6 +1315,10 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6 /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, + [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, + [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, + [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1542,6 +1551,16 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST +#define LDX_PROBE(SIZEOP, SIZE) \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read(&DST, SIZE, (const void *)(long) SRC); \ + CONT; + LDX_PROBE(B, 1) + LDX_PROBE(H, 2) + LDX_PROBE(W, 4) + LDX_PROBE(DW, 8) +#undef LDX_PROBE + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ atomic_add((u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42a463e09761..c4b6a2cfcd47 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7581,6 +7581,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: return false; default: return true; @@ -8722,6 +8723,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_XDP_SOCK: convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; + case PTR_TO_BTF_ID: + if (type == BPF_WRITE) { + verbose(env, "Writes through BTF pointers are not allowed\n"); + return -EINVAL; + } + insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); + continue; default: continue; } -- cgit From 3dec541b2e632d630fe7142ed44f0b3702ef1f8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:03 -0700 Subject: bpf: Add support for BTF pointers to x86 JIT Pointer to BTF object is a pointer to kernel object or NULL. Such pointers can only be used by BPF_LDX instructions. The verifier changed their opcode from LDX|MEM|size to LDX|PROBE_MEM|size to make JITing easier. The number of entries in extable is the number of BPF_LDX insns that access kernel memory via "pointer to BTF type". Only these load instructions can fault. Since x86 extable is relative it has to be allocated in the same memory region as JITed code. Allocate it prior to last pass of JITing and let the last pass populate it. Pointer to extable in bpf_prog_aux is necessary to make page fault handling fast. Page fault handling is done in two steps: 1. bpf_prog_kallsyms_find() finds BPF program that page faulted. It's done by walking rb tree. 2. then extable for given bpf program is binary searched. 
This process is similar to how page faulting is done for kernel modules. The exception handler skips over faulting x86 instruction and initializes destination register with zero. This mimics exact behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed). JITs for other architectures can add support in similar way. Until then they will reject unknown opcode and fallback to interpreter. Since extable should be aligned and placed near JITed code make bpf_jit_binary_alloc() return 4 byte aligned image offset, so that extable aligning formula in bpf_int_jit_compile() doesn't need to rely on internal implementation of bpf_jit_binary_alloc(). On x86 gcc defaults to 16-byte alignment for regular kernel functions due to better performance. JITed code may be aligned to 16 in the future, but it will use 4 in the meantime. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org --- kernel/bpf/core.c | 20 +++++++++++++++++++- kernel/bpf/verifier.c | 1 + kernel/extable.c | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a765bbd33f0..673f5d40a93e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -30,7 +30,7 @@ #include #include #include - +#include #include /* Registers */ @@ -712,6 +712,24 @@ bool is_bpf_text_address(unsigned long addr) return ret; } +const struct exception_table_entry *search_bpf_extables(unsigned long addr) +{ + const struct exception_table_entry *e = NULL; + struct bpf_prog *prog; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (!prog) + goto out; + if (!prog->aux->num_exentries) + goto out; + + e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); +out: + rcu_read_unlock(); + return e; +} + int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4b6a2cfcd47..fba9ef6a831b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8729,6 +8729,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -EINVAL; } insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; continue; default: continue; diff --git a/kernel/extable.c b/kernel/extable.c index f6c9406eec7d..f6920a11e28a 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -56,6 +56,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) e = search_kernel_exception_table(addr); if (!e) e = search_module_extables(addr); + if (!e) + e = search_bpf_extables(addr); return e; } -- cgit From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. 
For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf programs receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- kernel/bpf/btf.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 44 ++++++++++++++++++++----------- kernel/trace/bpf_trace.c | 4 +++ 3 files changed, 101 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 271d27cd427f..f7557af39756 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3626,6 +3626,74 @@ again: return -EINVAL; } +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +{ + char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; + const struct btf_param *args; + const struct btf_type *t; + const char *tname, *sym; + u32 btf_id, i; + + if (IS_ERR(btf_vmlinux)) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); + if (!sym) { + bpf_log(log, "kernel doesn't have kallsyms\n"); + return -EFAULT; + } + + for (i = 1; i <= btf_vmlinux->nr_types; i++) { + t = btf_type_by_id(btf_vmlinux, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) + continue; + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (!strcmp(tname, fnname)) + break; + } + if (i > btf_vmlinux->nr_types) { + bpf_log(log, "helper %s type is not found\n", fnname); + return -ENOENT; + } + + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + return -EFAULT; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EFAULT; + + args = (const struct btf_param *)(t + 1); + if (arg >= btf_type_vlen(t)) { + bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", + fnname, arg); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!btf_type_is_ptr(t) || !t->type) { + /* anything but the pointer to struct is a helper config bug */ + bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); + return -EFAULT; + } + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) { + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + } + if (!btf_type_is_struct(t)) { + bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); + return -EFAULT; + } + bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, + arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); + return btf_id; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fba9ef6a831b..556e82f8869b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,7 @@ struct bpf_call_arg_meta { u64 msize_umax_value; int ref_obj_id; int func_id; + u32 btf_id; }; struct btf *btf_vmlinux; @@ -3439,6 +3440,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; + } else if (arg_type == ARG_PTR_TO_BTF_ID) { + expected_type = PTR_TO_BTF_ID; + if (type != expected_type) + goto err_type; + 
if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -3586,6 +3603,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_skb_output && func_id != BPF_FUNC_perf_event_read_value) goto error; break; @@ -3673,6 +3691,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: + case BPF_FUNC_skb_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -4127,21 +4146,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); - if (err) - return err; + for (i = 0; i < 5; i++) { + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { + if (!fn->btf_id[i]) + fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); + meta.btf_id = fn->btf_id[i]; + } + err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (err) + return err; + } err = record_func_map(env, &meta, func_id, insn_idx); if (err) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6221e8c6ecc3..52f7e9d8c29b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -995,6 +995,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +extern const struct bpf_func_proto bpf_skb_output_proto; + BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { @@ -1053,6 +1055,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: -- cgit From 9ae7ab20b4835dbea0e5fc6a5c70171dc354a72e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 14 Oct 2019 17:44:08 +0100 Subject: sched/topology: Don't set SD_BALANCE_WAKE on cpuset domain relax As pointed out in commit 182a85f8a119 ("sched: Disable wakeup balancing") SD_BALANCE_WAKE is a tad too aggressive, and is usually left unset. However, it turns out cpuset domain relaxation will unconditionally set it on domains below the relaxation level. This made sense back when SD_BALANCE_WAKE was set unconditionally, but it no longer is the case. 
We can improve things slightly by noticing that set_domain_attribute() is always called after sd_init(), so rather than setting flags we can rely on whatever sd_init() is doing and only clear certain flags when above the relaxation level. While at it, slightly clean up the function and flip the relax level check to be more human readable. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: mingo@kernel.org Cc: vincent.guittot@linaro.org Cc: juri.lelli@redhat.com Cc: seto.hidetoshi@jp.fujitsu.com Cc: qperret@google.com Cc: Dietmar.Eggemann@arm.com Cc: morten.rasmussen@arm.com Link: https://lkml.kernel.org/r/20191014164408.32596-1-valentin.schneider@arm.com --- kernel/sched/topology.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b5667a273bf6..3623ffe85d18 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd, if (!attr || attr->relax_domain_level < 0) { if (default_relax_domain_level < 0) return; - else - request = default_relax_domain_level; + request = default_relax_domain_level; } else request = attr->relax_domain_level; - if (request < sd->level) { + + if (sd->level > request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* Turn on idle balance on this domain: */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } -- cgit From da97e18458fb42d7c00fac5fd1c56a3896ec666e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 14 Oct 2019 13:03:08 -0400 Subject: perf_event: Add support for LSM and SELinux checks In current mainline, the degree of access to perf_event_open(2) system call depends on the perf_event_paranoid sysctl. This has a number of limitations: 1. The sysctl is only a single value. Many types of accesses are controlled based on the single value thus making the control very limited and coarse grained. 2. The sysctl is global, so if the sysctl is changed, then that means all processes get access to perf_event_open(2) opening the door to security issues. This patch adds LSM and SELinux access checking which will be used in Android to access perf_event_open(2) for the purposes of attaching BPF programs to tracepoints, perf profiling and other operations from userspace. These operations are intended for production systems. 5 new LSM hooks are added: 1. perf_event_open: This controls access during the perf_event_open(2) syscall itself. The hook is called from all the places that the perf_event_paranoid sysctl is checked to keep it consistent with the systctl. The hook gets passed a 'type' argument which controls CPU, kernel and tracepoint accesses (in this context, CPU, kernel and tracepoint have the same semantics as the perf_event_paranoid sysctl). Additionally, I added an 'open' type which is similar to perf_event_paranoid sysctl == 3 patch carried in Android and several other distros but was rejected in mainline [1] in 2016. 2. perf_event_alloc: This allocates a new security object for the event which stores the current SID within the event. It will be useful when the perf event's FD is passed through IPC to another process which may try to read the FD. Appropriate security checks will limit access. 3. perf_event_free: Called when the event is closed. 4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event. 5. 
perf_event_write: Called from the ioctl(2) syscalls for the event. [1] https://lwn.net/Articles/696240/ Since Peter had suggest LSM hooks in 2016 [1], I am adding his Suggested-by tag below. To use this patch, we set the perf_event_paranoid sysctl to -1 and then apply selinux checking as appropriate (default deny everything, and then add policy rules to give access to domains that need it). In the future we can remove the perf_event_paranoid sysctl altogether. Suggested-by: Peter Zijlstra Co-developed-by: Peter Zijlstra Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: James Morris Cc: Arnaldo Carvalho de Melo Cc: rostedt@goodmis.org Cc: Yonghong Song Cc: Kees Cook Cc: Ingo Molnar Cc: Alexei Starovoitov Cc: jeffv@google.com Cc: Jiri Olsa Cc: Daniel Borkmann Cc: primiano@google.com Cc: Song Liu Cc: rsavitski@google.com Cc: Namhyung Kim Cc: Matthew Garrett Link: https://lkml.kernel.org/r/20191014170308.70668-1-joel@joelfernandes.org --- kernel/events/core.c | 57 +++++++++++++++++++++++++++++++++-------- kernel/trace/trace_event_perf.c | 15 +++++++---- 2 files changed, 56 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f9a5d4356562 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4229,8 +4229,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); + err = perf_allow_cpu(&event->attr); + if (err) + return ERR_PTR(err); cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -4539,6 +4540,8 @@ static void _free_event(struct perf_event *event) unaccount_event(event); + security_perf_event_free(event); + if (event->rb) { /* * Can happen when we close an event with re-directed output. @@ -4992,6 +4995,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct perf_event_context *ctx; int ret; + ret = security_perf_event_read(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); @@ -5256,6 +5263,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct perf_event_context *ctx; long ret; + /* Treat ioctl like writes as it is likely a mutating operation. 
*/ + ret = security_perf_event_write(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); @@ -5719,6 +5731,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; + ret = security_perf_event_read(event); + if (ret) + return ret; + vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { @@ -5844,7 +5860,7 @@ accounting: lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; @@ -10578,11 +10594,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + err = security_perf_event_alloc(event); + if (err) + goto err_callchain_buffer; + /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; +err_callchain_buffer: + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } err_addr_filters: kfree(event->addr_filter_ranges); @@ -10671,9 +10696,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { + ret = perf_allow_kernel(attr); + if (ret) + return ret; + } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -10886,13 +10913,19 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + if (err) + return err; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + err = perf_allow_kernel(&attr); + if (err) + return err; } if (attr.namespaces) { @@ -10909,9 +10942,11 @@ SYSCALL_DEFINE5(perf_event_open, } /* Only privileged users can get physical addresses */ - if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { + err = perf_allow_kernel(&attr); + if (err) + return err; + } err = security_locked_down(LOCKDOWN_PERF); if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0892e38ed6fb..0917fee6ee7c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" #include "trace_probe.h" @@ -26,8 +27,10 @@ static int total_ref_count; static int perf_trace_event_perm(struct trace_event_call *tp_event, struct perf_event *p_event) { + int ret; + if (tp_event->perf_perm) { - int ret = tp_event->perf_perm(tp_event, p_event); + ret = tp_event->perf_perm(tp_event, p_event); if (ret) return ret; } @@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. 
*/ if (ftrace_event_is_function(tp_event)) { - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + ret = perf_allow_tracepoint(&p_event->attr); + if (ret) + return ret; if (!is_sampling_event(p_event)) return 0; @@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + ret = perf_allow_tracepoint(&p_event->attr); + if (ret) + return ret; return 0; } -- cgit From 8a9f91c51ea72b126864e0db616b1bac12261200 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Mon, 14 Oct 2019 16:14:59 +0800 Subject: perf/ring_buffer: Modify the parameter type of perf_mmap_free_page() In perf_mmap_free_page(), the unsigned long type is converted to the pointer type, but where the call is made, the pointer type is converted to the unsigned long type. There is no need to do these operations. Modify the parameter type of perf_mmap_free_page() to pointer type. Signed-off-by: Yunfeng Ye Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/e6ae3f0c-d04c-50f9-544a-aee3b30330cd@huawei.com --- kernel/events/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ffb59a4ef4ff..abc145cbfedf 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -799,9 +799,9 @@ fail: return NULL; } -static void perf_mmap_free_page(unsigned long addr) +static void perf_mmap_free_page(void *addr) { - struct page *page = virt_to_page((void *)addr); + struct page *page = virt_to_page(addr); page->mapping = NULL; __free_page(page); @@ -811,9 +811,9 @@ void rb_free(struct ring_buffer *rb) { int i; - perf_mmap_free_page((unsigned long)rb->user_page); + perf_mmap_free_page(rb->user_page); for (i = 0; i < rb->nr_pages; i++) - perf_mmap_free_page((unsigned long)rb->data_pages[i]); + perf_mmap_free_page(rb->data_pages[i]); kfree(rb); } -- cgit From d7e78706e43107fa269fe34b1a69e653f5ec9f2c Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Mon, 14 Oct 2019 16:15:57 +0800 Subject: perf/ring_buffer: Matching the memory allocate and free, in rb_alloc() Currently perf_mmap_alloc_page() is used to allocate memory in rb_alloc(), but using free_page() to free memory in the failure path. It's better to use perf_mmap_free_page() instead. 
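The underlying rule is simple: every allocation helper should have a matching free helper, and the error path should call the matching one, so that any later change to the allocator (page flags, accounting, mappings) automatically covers the unwind path as well. A generic, self-contained illustration of that pattern, not kernel code:

#include <stdlib.h>

static void *buf_alloc(size_t n) { return calloc(1, n); }
static void  buf_free(void *p)   { free(p); }

static int setup_pair(void **a, void **b, size_t n)
{
	*a = buf_alloc(n);
	if (!*a)
		return -1;

	*b = buf_alloc(n);
	if (!*b) {
		buf_free(*a);	/* mirror of buf_alloc(), not a bare free() */
		*a = NULL;
		return -1;
	}
	return 0;
}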
Signed-off-by: Yunfeng Ye Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/575c7e8c-90c7-4e3a-b41d-f894d8cdbd7f@huawei.com --- kernel/events/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index abc145cbfedf..246c83ac5643 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -754,6 +754,14 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } +static void perf_mmap_free_page(void *addr) +{ + struct page *page = virt_to_page(addr); + + page->mapping = NULL; + __free_page(page); +} + struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct ring_buffer *rb; @@ -788,9 +796,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)rb->data_pages[i]); + perf_mmap_free_page(rb->data_pages[i]); - free_page((unsigned long)rb->user_page); + perf_mmap_free_page(rb->user_page); fail_user_page: kfree(rb); @@ -799,14 +807,6 @@ fail: return NULL; } -static void perf_mmap_free_page(void *addr) -{ - struct page *page = virt_to_page(addr); - - page->mapping = NULL; - __free_page(page); -} - void rb_free(struct ring_buffer *rb) { int i; -- cgit From fc65104c7c89635ecacbff6cf5724ac24232d381 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 18 Oct 2019 11:18:42 +0800 Subject: dma-debug: Use pr_warn instead of pr_warning As said in commit f2c2cbcc35d4 ("powerpc: Use pr_warn instead of pr_warning"), removing pr_warning so all logging messages use a consistent _warn style. Let's do it. Link: http://lkml.kernel.org/r/20191018031850.48498-25-wangkefeng.wang@huawei.com To: linux-kernel@vger.kernel.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..a26170469543 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -161,7 +161,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) { #ifdef CONFIG_STACKTRACE if (entry) { - pr_warning("Mapped at:\n"); + pr_warn("Mapped at:\n"); stack_trace_print(entry->stack_entries, entry->stack_len, 0); } #endif -- cgit From 3da2e1fd46a726531a1e6d621bda62821a5e559a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 18 Oct 2019 11:18:43 +0800 Subject: trace: Use pr_warn instead of pr_warning As said in commit f2c2cbcc35d4 ("powerpc: Use pr_warn instead of pr_warning"), removing pr_warning so all logging messages use a consistent _warn style. Let's do it. 
Link: http://lkml.kernel.org/r/20191018031850.48498-26-wangkefeng.wang@huawei.com To: linux-kernel@vger.kernel.org Cc: Ingo Molnar Signed-off-by: Kefeng Wang Acked-by: Steven Rostedt (VMware) Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/trace/trace_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 80e0b2aca703..2e9a4746ea85 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -178,14 +178,14 @@ static int benchmark_event_kthread(void *arg) int trace_benchmark_reg(void) { if (!ok_to_run) { - pr_warning("trace benchmark cannot be started via kernel command line\n"); + pr_warn("trace benchmark cannot be started via kernel command line\n"); return -EBUSY; } bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); if (IS_ERR(bm_event_thread)) { - pr_warning("trace benchmark failed to create kernel thread\n"); + pr_warn("trace benchmark failed to create kernel thread\n"); return PTR_ERR(bm_event_thread); } -- cgit From 9fa8c9c647be624e91b09ecffa7cd97ee0600b40 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Fri, 18 Oct 2019 09:20:34 +0800 Subject: tracing: Fix "gfp_t" format for synthetic events In the format of synthetic events, the "gfp_t" is shown as "signed:1", but in fact the "gfp_t" is "unsigned", should be shown as "signed:0". The issue can be reproduced by the following commands: echo 'memlatency u64 lat; unsigned int order; gfp_t gfp_flags; int migratetype' > /sys/kernel/debug/tracing/synthetic_events cat /sys/kernel/debug/tracing/events/synthetic/memlatency/format name: memlatency ID: 2233 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:u64 lat; offset:8; size:8; signed:0; field:unsigned int order; offset:16; size:4; signed:0; field:gfp_t gfp_flags; offset:24; size:4; signed:1; field:int migratetype; offset:32; size:4; signed:1; print fmt: "lat=%llu, order=%u, gfp_flags=%x, migratetype=%d", REC->lat, REC->order, REC->gfp_flags, REC->migratetype Link: http://lkml.kernel.org/r/20191018012034.6404-1-zhengjun.xing@linux.intel.com Reviewed-by: Tom Zanussi Signed-off-by: Zhengjun Xing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 57648c5aa679..7482a1466ebf 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -679,6 +679,8 @@ static bool synth_field_signed(char *type) { if (str_has_prefix(type, "u")) return false; + if (strcmp(type, "gfp_t") == 0) + return false; return true; } -- cgit From c108e3c1bdbd0783d7c19ee80abb0591f79029e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 17 Oct 2019 23:09:33 -0700 Subject: bpf: Fix bpf_attr.attach_btf_id check Only raw_tracepoint program type can have bpf_attr.attach_btf_id >= 0. Make sure to reject other program types that accidentally set it to non-zero. 
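A standalone sketch of the validation pattern this fix applies: only the one program type that consumes the auxiliary id may carry a non-zero value, and every other type must leave it zero. The names below are illustrative stand-ins, not the kernel's:

#include <errno.h>

enum prog_type { PROG_GENERIC, PROG_RAW_TRACEPOINT };

static int check_attach_id(enum prog_type type, unsigned int attach_id,
                           unsigned int max_id)
{
        switch (type) {
        case PROG_RAW_TRACEPOINT:
                /* the only type allowed to set the id, and it is still bounded */
                return attach_id <= max_id ? 0 : -EINVAL;
        default:
                /* reject an id set by accident for a type that cannot use it */
                return attach_id ? -EINVAL : 0;
        }
}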
Fixes: ccfe29eb29c2 ("bpf: Add attach_btf_id attribute to program load") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191018060933.2950231-1-ast@kernel.org --- kernel/bpf/syscall.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 523e3ac15a08..16ea3c0db4f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1570,6 +1570,17 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id) { + switch (prog_type) { + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (btf_id > BTF_MAX_TYPE) + return -EINVAL; + break; + default: + if (btf_id) + return -EINVAL; + break; + } + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { @@ -1610,13 +1621,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } - case BPF_PROG_TYPE_RAW_TRACEPOINT: - if (btf_id > BTF_MAX_TYPE) - return -EINVAL; - return 0; default: - if (btf_id) - return -EINVAL; return 0; } } -- cgit From 1f5343c0ae9673543055e9794362766e1f0ed163 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 18 Oct 2019 17:03:44 +0800 Subject: bpf: Fix build error without CONFIG_NET If CONFIG_NET is n, building fails: kernel/trace/bpf_trace.o: In function `raw_tp_prog_func_proto': bpf_trace.c:(.text+0x1a34): undefined reference to `bpf_skb_output_proto' Wrap it into a #ifdef to fix this. Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191018090344.26936-1-yuehaibing@huawei.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 52f7e9d8c29b..c3240898cc44 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1055,8 +1055,10 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; +#ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; +#endif case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: -- cgit From 05679ca6feebc1ef3bf743563315d9975adcf6fb Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 17 Oct 2019 12:57:02 +0200 Subject: xdp: Prevent overflow in devmap_hash cost calculation for 32-bit builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tetsuo pointed out that without an explicit cast, the cost calculation for devmap_hash type maps could overflow on 32-bit builds. This adds the missing cast. 
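A small userspace sketch of the overflow the added cast prevents; stdint types stand in for the kernel's u32/u64 and bucket_head for struct hlist_head (assumptions, not the kernel code). The (uint32_t) cast below emulates the 32-bit size_t of a 32-bit build, so the first multiply wraps modulo 2^32 before the 64-bit addition, while widening one operand first keeps the full product:

#include <stdint.h>
#include <stdio.h>

struct bucket_head { void *first; };    /* stand-in for struct hlist_head */

int main(void)
{
        uint32_t n_buckets = 1U << 30;
        uint64_t cost = 0;

        /* 32-bit by 32-bit multiply: the product wraps before it is widened */
        cost += (uint32_t)sizeof(struct bucket_head) * n_buckets;
        printf("wrapping multiply: %llu\n", (unsigned long long)cost);

        /* widen one operand first, as the patch does with the (u64) cast */
        cost = (uint64_t)sizeof(struct bucket_head) * n_buckets;
        printf("64-bit multiply:   %llu\n", (unsigned long long)cost);

        return 0;
}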
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191017105702.2807093-1-toke@redhat.com --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d27f3b60ff6d..c0a48f336997 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -128,7 +128,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += sizeof(struct hlist_head) * dtab->n_buckets; + cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; } /* if map size is larger than memlock limit, reject it */ -- cgit From aa5de305c90c995321df452f274fe75b52bd8fcc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 18 Oct 2019 20:20:40 -0700 Subject: kernel/events/uprobes.c: only do FOLL_SPLIT_PMD for uprobe register Attaching uprobe to text section in THP splits the PMD mapped page table into PTE mapped entries. On uprobe detach, we would like to regroup PMD mapped page table entry to regain performance benefit of THP. However, the regroup is broken For perf_event based trace_uprobe. This is because perf_event based trace_uprobe calls uprobe_unregister twice on close: first in TRACE_REG_PERF_CLOSE, then in TRACE_REG_PERF_UNREGISTER. The second call will split the PMD mapped page table entry, which is not the desired behavior. Fix this by only use FOLL_SPLIT_PMD for uprobe register case. Add a WARN() to confirm uprobe unregister never work on huge pages, and abort the operation when this WARN() triggers. Link: http://lkml.kernel.org/r/20191017164223.2762148-6-songliubraving@fb.com Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Signed-off-by: Song Liu Reviewed-by: Srikar Dronamraju Cc: Kirill A. Shutemov Cc: Oleg Nesterov Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 94d38a39d72e..c74761004ee5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,14 +474,17 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; + unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: + if (is_register) + gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -489,6 +492,12 @@ retry: if (ret <= 0) goto put_old; + if (WARN(!is_register && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; + } + /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 
1 : -1); -- cgit From da6043fe85eb5ec621e34a92540735dcebbea134 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 25 Sep 2019 15:39:12 +0100 Subject: PM / hibernate: memory_bm_find_bit(): Tighten node optimisation When looking for a bit by number we make use of the cached result from the preceding lookup to speed up operation. Firstly we check if the requested pfn is within the cached zone and if not lookup the new zone. We then check if the offset for that pfn falls within the existing cached node. This happens regardless of whether the node is within the zone we are now scanning. With certain memory layouts it is possible for this to false trigger creating a temporary alias for the pfn to a different bit. This leads the hibernation code to free memory which it was never allocated with the expected fallout. Ensure the zone we are scanning matches the cached zone before considering the cached node. Deep thanks go to Andrea for many, many, many hours of hacking and testing that went into cornering this bug. Reported-by: Andrea Righi Tested-by: Andrea Righi Signed-off-by: Andy Whitcroft Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 83105874f255..26b9168321e7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -734,8 +734,15 @@ zone_found: * We have found the zone. Now walk the radix tree to find the leaf node * for our PFN. */ + + /* + * If the zone we wish to scan is the the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ node = bm->cur.node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; -- cgit From 77751a466ebd1a785456556061a2db6d60ea3898 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 16 Oct 2019 12:41:24 +0200 Subject: PM: QoS: Introduce frequency QoS Introduce frequency QoS, based on the "raw" low-level PM QoS, to represent min and max frequency requests and aggregate constraints. The min and max frequency requests are to be represented by struct freq_qos_request objects and the aggregate constraints are to be represented by struct freq_constraints objects. The latter are expected to be initialized with the help of freq_constraints_init(). The freq_qos_read_value() helper is defined to retrieve the aggregate constraints values from a given struct freq_constraints object and there are the freq_qos_add_request(), freq_qos_update_request() and freq_qos_remove_request() helpers to manipulate the min and max frequency requests. It is assumed that the the helpers will not run concurrently with each other for the same struct freq_qos_request object, so if that may be the case, their uses must ensure proper synchronization between them (e.g. through locking). In addition, freq_qos_add_notifier() and freq_qos_remove_notifier() are provided to add and remove notifiers that will trigger on aggregate constraint changes to and from a given struct freq_constraints object, respectively. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar --- kernel/power/qos.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9568a2fe7c11..04e83fdfbe80 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -650,3 +650,243 @@ static int __init pm_qos_power_init(void) } late_initcall(pm_qos_power_init); + +/* Definitions related to the frequency QoS below. */ + +/** + * freq_constraints_init - Initialize frequency QoS constraints. + * @qos: Frequency QoS constraints to initialize. + */ +void freq_constraints_init(struct freq_constraints *qos) +{ + struct pm_qos_constraints *c; + + c = &qos->min_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->type = PM_QOS_MAX; + c->notifiers = &qos->min_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); + + c = &qos->max_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->type = PM_QOS_MIN; + c->notifiers = &qos->max_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); +} + +/** + * freq_qos_read_value - Get frequency QoS constraint for a given list. + * @qos: Constraints to evaluate. + * @type: QoS request type. + */ +s32 freq_qos_read_value(struct freq_constraints *qos, + enum freq_qos_req_type type) +{ + s32 ret; + + switch (type) { + case FREQ_QOS_MIN: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MIN_DEFAULT_VALUE : + pm_qos_read_value(&qos->min_freq); + break; + case FREQ_QOS_MAX: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MAX_DEFAULT_VALUE : + pm_qos_read_value(&qos->max_freq); + break; + default: + WARN_ON(1); + ret = 0; + } + + return ret; +} + +/** + * freq_qos_apply - Add/modify/remove frequency QoS request. + * @req: Constraint request to apply. + * @action: Action to perform (add/update/remove). + * @value: Value to assign to the QoS request. + */ +static int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret; + + switch(req->type) { + case FREQ_QOS_MIN: + ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, + action, value); + break; + case FREQ_QOS_MAX: + ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, + action, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * freq_qos_add_request - Insert new frequency QoS request into a given list. + * @qos: Constraints to update. + * @req: Preallocated request object. + * @type: Request type. + * @value: Request value. + * + * Insert a new entry into the @qos list of requests, recompute the effective + * QoS constraint value for that list and initialize the @req object. The + * caller needs to save that object for later use in updates and removal. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. 
+ */ +int freq_qos_add_request(struct freq_constraints *qos, + struct freq_qos_request *req, + enum freq_qos_req_type type, s32 value) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !req) + return -EINVAL; + + if (WARN(freq_qos_request_active(req), + "%s() called for active request\n", __func__)) + return -EINVAL; + + req->qos = qos; + req->type = type; + ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); + if (ret < 0) { + req->qos = NULL; + req->type = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_request); + +/** + * freq_qos_update_request - Modify existing frequency QoS request. + * @req: Request to modify. + * @new_value: New request value. + * + * Update an existing frequency QoS request along with the effective constraint + * value for the list of requests it belongs to. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + if (req->pnode.prio == new_value) + return 0; + + return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(freq_qos_update_request); + +/** + * freq_qos_remove_request - Remove frequency QoS request from its list. + * @req: Request to remove. + * + * Remove the given frequency QoS request from the list of constraints it + * belongs to and recompute the effective constraint value for that list. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_remove_request(struct freq_qos_request *req) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + return freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); +} +EXPORT_SYMBOL_GPL(freq_qos_remove_request); + +/** + * freq_qos_add_notifier - Add frequency QoS change notifier. + * @qos: List of requests to add the notifier to. + * @type: Request type. + * @notifier: Notifier block to add. + */ +int freq_qos_add_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_register(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_register(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_notifier); + +/** + * freq_qos_remove_notifier - Remove frequency QoS change notifier. + * @qos: List of requests to remove the notifier from. + * @type: Request type. + * @notifier: Notifier block to remove. 
+ */ +int freq_qos_remove_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); -- cgit From 490ba971d8b498ba3a47999ab94c6a0d1830ad41 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:28 +0200 Subject: sched/fair: Clean up asym packing Clean up asym packing to follow the default load balance behavior: - classify the group by creating a group_asym_packing field. - calculate the imbalance in calculate_imbalance() instead of bypassing it. We don't need to test twice same conditions anymore to detect asym packing and we consolidate the calculation of imbalance in calculate_imbalance(). There is no functional changes. Signed-off-by: Vincent Guittot Acked-by: Rik van Riel Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 63 ++++++++++++++--------------------------------------- 1 file changed, 16 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682a754ea3e1..5ce0f71042ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7665,6 +7665,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -8119,9 +8120,17 @@ asym_packing: * ASYM_PACKING needs to move all the work to the highest * prority CPUs in the group, therefore mark all groups * of lower priority than ourself as busy. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. */ if (sgs->sum_nr_running && sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; if (!sds->busiest) return true; @@ -8262,51 +8271,6 @@ next_group: } } -/** - * check_asym_packing - Check to see if the group is packed into the - * sched domain. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. 
Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in env->imbalance. - * - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain which is to be packed - */ -static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) -{ - int busiest_cpu; - - if (!(env->sd->flags & SD_ASYM_PACKING)) - return 0; - - if (env->idle == CPU_NOT_IDLE) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = sds->busiest->asym_prefer_cpu; - if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) - return 0; - - env->imbalance = sds->busiest_stat.group_load; - - return 1; -} - /** * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during @@ -8391,6 +8355,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; + if (busiest->group_asym_packing) { + env->imbalance = busiest->group_load; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -8495,8 +8464,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if (check_asym_packing(env, &sds)) - return sds.busiest; + if (busiest->group_asym_packing) + goto force_balance; /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || busiest->sum_nr_running == 0) -- cgit From a34983470301018324f0110791da452fee1318c2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:29 +0200 Subject: sched/fair: Rename sg_lb_stats::sum_nr_running to sum_h_nr_running Rename sum_nr_running to sum_h_nr_running because it effectively tracks cfs->h_nr_running so we can use sum_nr_running to track rq->nr_running when needed. There are no functional changes. 
Signed-off-by: Vincent Guittot Reviewed-by: Valentin Schneider Acked-by: Rik van Riel Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: srikar@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/1571405198-27570-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ce0f71042ad..ad8f16a411e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7660,7 +7660,7 @@ struct sg_lb_stats { unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ - unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; @@ -7705,7 +7705,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, - .sum_nr_running = 0, + .sum_h_nr_running = 0, .group_type = group_other, }, }; @@ -7896,7 +7896,7 @@ static inline int sg_imbalanced(struct sched_group *group) static inline bool group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running < sgs->group_weight) + if (sgs->sum_h_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > @@ -7917,7 +7917,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running <= sgs->group_weight) + if (sgs->sum_h_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < @@ -8009,7 +8009,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += cpu_runnable_load(rq); sgs->group_util += cpu_util(i); - sgs->sum_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; if (nr_running > 1) @@ -8039,8 +8039,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = group->sgc->capacity; sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; + if (sgs->sum_h_nr_running) + sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running; sgs->group_weight = group->group_weight; @@ -8097,7 +8097,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * capable CPUs may harm throughput. Maximize throughput, * power/energy consequences are not considered. */ - if (sgs->sum_nr_running <= sgs->group_weight && + if (sgs->sum_h_nr_running <= sgs->group_weight && group_smaller_min_cpu_capacity(sds->local, sg)) return false; @@ -8128,7 +8128,7 @@ asym_packing: * perform better since they share less core resources. Hence when we * have idle threads, we want them to be the higher ones. 
*/ - if (sgs->sum_nr_running && + if (sgs->sum_h_nr_running && sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { sgs->group_asym_packing = 1; if (!sds->busiest) @@ -8146,9 +8146,9 @@ asym_packing: #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->nr_numa_running) + if (sgs->sum_h_nr_running > sgs->nr_numa_running) return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) + if (sgs->sum_h_nr_running > sgs->nr_preferred_running) return remote; return all; } @@ -8223,7 +8223,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd */ if (prefer_sibling && sds->local && group_has_capacity(env, local) && - (sgs->sum_nr_running > local->sum_nr_running + 1)) { + (sgs->sum_h_nr_running > local->sum_h_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -8235,7 +8235,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_nr_running; + sds->total_running += sgs->sum_h_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -8289,7 +8289,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) local = &sds->local_stat; busiest = &sds->busiest_stat; - if (!local->sum_nr_running) + if (!local->sum_h_nr_running) local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); else if (busiest->load_per_task > local->load_per_task) imbn = 1; @@ -8387,7 +8387,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE; if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; load_above_capacity *= scale_load_down(NICE_0_LOAD); @@ -8468,7 +8468,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_nr_running == 0) + if (!sds.busiest || busiest->sum_h_nr_running == 0) goto out_balanced; /* XXX broken for overlapping NUMA groups */ -- cgit From fcf0553db6f4c79387864f6e4ab4a891601f395e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:30 +0200 Subject: sched/fair: Remove meaningless imbalance calculation Clean up load_balance() and remove meaningless calculation and fields before adding a new algorithm. 
Signed-off-by: Vincent Guittot Acked-by: Rik van Riel Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 105 +--------------------------------------------------- 1 file changed, 1 insertion(+), 104 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad8f16a411e1..a1bc04fff23c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5380,18 +5380,6 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = cpu_runnable_load(rq); - - if (nr_running) - return load_avg / nr_running; - - return 0; -} - static void record_wakee(struct task_struct *p) { /* @@ -7657,7 +7645,6 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ @@ -8039,9 +8026,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = group->sgc->capacity; sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; - if (sgs->sum_h_nr_running) - sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running; - sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); @@ -8271,76 +8255,6 @@ next_group: } } -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - */ -static inline -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) -{ - unsigned long tmp, capa_now = 0, capa_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; - struct sg_lb_stats *local, *busiest; - - local = &sds->local_stat; - busiest = &sds->busiest_stat; - - if (!local->sum_h_nr_running) - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); - else if (busiest->load_per_task > local->load_per_task) - imbn = 1; - - scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - busiest->group_capacity; - - if (busiest->avg_load + scaled_busy_load_per_task >= - local->avg_load + (scaled_busy_load_per_task * imbn)) { - env->imbalance = busiest->load_per_task; - return; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU capacity used by - * moving them. 
- */ - - capa_now += busiest->group_capacity * - min(busiest->load_per_task, busiest->avg_load); - capa_now += local->group_capacity * - min(local->load_per_task, local->avg_load); - capa_now /= SCHED_CAPACITY_SCALE; - - /* Amount of load we'd subtract */ - if (busiest->avg_load > scaled_busy_load_per_task) { - capa_move += busiest->group_capacity * - min(busiest->load_per_task, - busiest->avg_load - scaled_busy_load_per_task); - } - - /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_CAPACITY_SCALE) { - tmp = (busiest->avg_load * busiest->group_capacity) / - local->group_capacity; - } else { - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - local->group_capacity; - } - capa_move += local->group_capacity * - min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_CAPACITY_SCALE; - - /* Move if we gain throughput */ - if (capa_move > capa_now) - env->imbalance = busiest->load_per_task; -} - /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. @@ -8360,15 +8274,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } - if (busiest->group_type == group_imbalanced) { - /* - * In the group_imb case we cannot rely on group-wide averages - * to ensure CPU-load equilibrium, look at wider averages. XXX - */ - busiest->load_per_task = - min(busiest->load_per_task, sds->avg_load); - } - /* * Avg load of busiest sg can be less and avg load of local sg can * be greater than avg load across all sgs of sd because avg load @@ -8379,7 +8284,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load)) { env->imbalance = 0; - return fix_small_imbalance(env, sds); + return; } /* @@ -8417,14 +8322,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest->group_misfit_task_load); } - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (env->imbalance < busiest->load_per_task) - return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ -- cgit From 0b0695f2b34a4afa3f6e9aa1ff0e5336d8dad912 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:31 +0200 Subject: sched/fair: Rework load_balance() The load_balance() algorithm contains some heuristics which have become meaningless since the rework of the scheduler's metrics like the introduction of PELT. Furthermore, load is an ill-suited metric for solving certain task placement imbalance scenarios. For instance, in the presence of idle CPUs, we should simply try to get at least one task per CPU, whereas the current load-based algorithm can actually leave idle CPUs alone simply because the load is somewhat balanced. The current algorithm ends up creating virtual and meaningless values like the avg_load_per_task or tweaks the state of a group to make it overloaded whereas it's not, in order to try to migrate tasks. load_balance() should better qualify the imbalance of the group and clearly define what has to be moved to fix this imbalance. The type of sched_group has been extended to better reflect the type of imbalance. 
We now have: group_has_spare group_fully_busy group_misfit_task group_asym_packing group_imbalanced group_overloaded Based on the type of sched_group, load_balance now sets what it wants to move in order to fix the imbalance. It can be some load as before but also some utilization, a number of task or a type of task: migrate_task migrate_util migrate_load migrate_misfit This new load_balance() algorithm fixes several pending wrong tasks placement: - the 1 task per CPU case with asymmetric system - the case of cfs task preempted by other class - the case of tasks not evenly spread on groups with spare capacity Also the load balance decisions have been consolidated in the 3 functions below after removing the few bypasses and hacks of the current code: - update_sd_pick_busiest() select the busiest sched_group. - find_busiest_group() checks if there is an imbalance between local and busiest group. - calculate_imbalance() decides what have to be moved. Finally, the now unused field total_running of struct sd_lb_stats has been removed. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-5-git-send-email-vincent.guittot@linaro.org [ Small readability and spelling updates. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 611 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 402 insertions(+), 209 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a1bc04fff23c..76a2aa8db471 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7058,11 +7058,26 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +/* + * group_type describes the group of CPUs at the moment of the load balance. + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the groupe_type can be simply compared when selecting the busiest + * group. see update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + group_has_spare = 0, + group_fully_busy, group_misfit_task, + group_asym_packing, group_imbalanced, - group_overloaded, + group_overloaded +}; + +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit }; #define LBF_ALL_PINNED 0x01 @@ -7095,7 +7110,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; - enum group_type src_grp_type; + enum migration_type migration_type; struct list_head tasks; }; @@ -7318,7 +7333,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance runnable load from + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. 
@@ -7326,8 +7341,8 @@ static const unsigned int sched_nr_migrate_break = 32; static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; + unsigned long util, load; struct task_struct *p; - unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); @@ -7360,19 +7375,51 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + switch (env->migration_type) { + case migrate_load: + load = task_h_load(p); - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) - goto next; + if (sched_feat(LB_MIN) && + load < 16 && !env->sd->nr_balance_failed) + goto next; - if ((load / 2) > env->imbalance) - goto next; + if (load/2 > env->imbalance) + goto next; + + env->imbalance -= load; + break; + + case migrate_util: + util = task_util_est(p); + + if (util > env->imbalance) + goto next; + + env->imbalance -= util; + break; + + case migrate_task: + env->imbalance--; + break; + + case migrate_misfit: + load = task_h_load(p); + + /* + * Load of misfit task might decrease a bit since it has + * been recorded. Be conservative in the condition. + */ + if (load/2 < env->imbalance) + goto next; + + env->imbalance = 0; + break; + } detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; - env->imbalance -= load; #ifdef CONFIG_PREEMPTION /* @@ -7386,7 +7433,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * runnable load. + * load/util/tasks. */ if (env->imbalance <= 0) break; @@ -7651,7 +7698,6 @@ struct sg_lb_stats { unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_no_capacity; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING @@ -7667,10 +7713,10 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ - unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ @@ -7681,19 +7727,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. - * We must however clear busiest_stat::avg_load because - * update_sd_pick_busiest() reads this before assignment. + * We must however set busiest_stat::group_type and + * busiest_stat::idle_cpus to the worst busiest group because + * update_sd_pick_busiest() reads these before assignment. 
*/ *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, - .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { - .avg_load = 0UL, - .sum_h_nr_running = 0, - .group_type = group_other, + .idle_cpus = UINT_MAX, + .group_type = group_has_spare, }, }; } @@ -7935,19 +7980,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) } static inline enum -group_type group_classify(struct sched_group *group, +group_type group_classify(struct lb_env *env, + struct sched_group *group, struct sg_lb_stats *sgs) { - if (sgs->group_no_capacity) + if (group_is_overloaded(env, sgs)) return group_overloaded; if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_asym_packing) + return group_asym_packing; + if (sgs->group_misfit_task_load) return group_misfit_task; - return group_other; + if (!group_has_capacity(env, sgs)) + return group_fully_busy; + + return group_has_spare; } static bool update_nohz_stats(struct rq *rq, bool force) @@ -7984,10 +8036,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int i, nr_running; + int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -8012,9 +8066,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu(i)) { sgs->idle_cpus++; + /* Idle cpu can't have misfit task */ + continue; + } + + if (local_group) + continue; + /* Check for a misfit task on the cpu */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -8022,14 +8083,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } - /* Adjust by relative CPU capacity of the group */ + /* Check if dst CPU is idle and preferred to this group */ + if (env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; + } + sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; sgs->group_weight = group->group_weight; - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(env, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; } /** @@ -8052,6 +8123,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* Make sure that there is at least one task to pull */ + if (!sgs->sum_h_nr_running) + return false; + /* * Don't try to pull misfit tasks we can't help. 
* We can use max_capacity here as reduction in capacity on some @@ -8060,7 +8135,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if (sgs->group_type == group_misfit_task && (!group_smaller_max_cpu_capacity(sg, sds->local) || - !group_has_capacity(env, &sds->local_stat))) + sds->local_stat.group_type != group_has_spare)) return false; if (sgs->group_type > busiest->group_type) @@ -8069,62 +8144,80 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; - if (sgs->avg_load <= busiest->avg_load) - return false; - - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. + * The candidate and the current busiest group are the same type of + * group. Let check which one is the busiest according to the type. */ - if (sgs->sum_h_nr_running <= sgs->group_weight && - group_smaller_min_cpu_capacity(sds->local, sg)) - return false; - /* - * If we have more than one misfit sg go with the biggest misfit. - */ - if (sgs->group_type == group_misfit_task && - sgs->group_misfit_task_load < busiest->group_misfit_task_load) + switch (sgs->group_type) { + case group_overloaded: + /* Select the overloaded group with highest avg_load. */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to + * choose one more than another. + */ return false; -asym_packing: - /* This is the busiest node in its class. */ - if (!(env->sd->flags & SD_ASYM_PACKING)) - return true; + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; + break; - /* No ASYM_PACKING if target CPU is already busy */ - if (env->idle == CPU_NOT_IDLE) - return true; - /* - * ASYM_PACKING needs to move all the work to the highest - * prority CPUs in the group, therefore mark all groups - * of lower priority than ourself as busy. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - */ - if (sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { - sgs->group_asym_packing = 1; - if (!sds->busiest) - return true; + case group_misfit_task: + /* + * If we have more than one misfit sg go with the biggest + * misfit. + */ + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + break; - /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, - sg->asym_prefer_cpu)) - return true; + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In + * theory, there is no need to pull task from such kind of + * group because tasks have all compute capacity that they need + * but we can still improve the overall throughput by reducing + * contention when accessing shared HW resources. 
+ * + * XXX for now avg_load is not computed and always 0 so we + * select the 1st one. + */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_has_spare: + /* + * Select not overloaded group with lowest number of + * idle cpus. We could also compare the spare capacity + * which is more stable but it can end up that the + * group has less spare capacity but finally more idle + * CPUs which means less opportunity to pull tasks. + */ + if (sgs->idle_cpus >= busiest->idle_cpus) + return false; + break; } - return false; + /* + * Candidate sg has no more than one task per CPU and has higher + * per-CPU capacity. Migrating tasks to less capable CPUs may harm + * throughput. Maximize throughput, power/energy consequences are not + * considered. + */ + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type <= group_fully_busy) && + (group_smaller_min_cpu_capacity(sds->local, sg))) + return false; + + return true; } #ifdef CONFIG_NUMA_BALANCING @@ -8162,13 +8255,13 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON @@ -8195,22 +8288,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (local_group) goto next_group; - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks. The extra check prevents the case where - * you always pull from the heaviest group when it is already - * under-utilized (possible with a large weight task outweighs - * the tasks on the system). - */ - if (prefer_sibling && sds->local && - group_has_capacity(env, local) && - (sgs->sum_h_nr_running > local->sum_h_nr_running + 1)) { - sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); - } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -8219,13 +8296,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_h_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); + /* Tag domain that child domain prefers tasks go to siblings first */ + sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + #ifdef CONFIG_NO_HZ_COMMON if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { @@ -8263,69 +8342,149 @@ next_group: */ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long max_pull, load_above_capacity = ~0UL; struct sg_lb_stats *local, *busiest; local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_asym_packing) { - env->imbalance = busiest->group_load; + if (busiest->group_type == group_misfit_task) { + /* Set imbalance to allow misfit tasks to be balanced. 
*/ + env->migration_type = migrate_misfit; + env->imbalance = busiest->group_misfit_task_load; + return; + } + + if (busiest->group_type == group_asym_packing) { + /* + * In case of asym capacity, we will try to migrate all load to + * the preferred CPU. + */ + env->migration_type = migrate_task; + env->imbalance = busiest->sum_h_nr_running; + return; + } + + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure CPU-load equilibrium, try to move any task to fix + * the imbalance. The next load balance will take care of + * balancing back the system. + */ + env->migration_type = migrate_task; + env->imbalance = 1; return; } /* - * Avg load of busiest sg can be less and avg load of local sg can - * be greater than avg load across all sgs of sd because avg load - * factors in sg capacity and sgs with smaller group_type are - * skipped when updating the busiest sg: + * Try to use spare capacity of local group without overloading it or + * emptying busiest */ - if (busiest->group_type != group_misfit_task && - (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load)) { - env->imbalance = 0; + if (local->group_type == group_has_spare) { + if (busiest->group_type > group_fully_busy) { + /* + * If busiest is overloaded, try to fill spare + * capacity. This might end up creating spare capacity + * in busiest or busiest still being overloaded but + * there is no simple way to directly compute the + * amount of load to migrate in order to balance the + * system. + */ + env->migration_type = migrate_util; + env->imbalance = max(local->group_capacity, local->group_util) - + local->group_util; + + /* + * In some cases, the group's utilization is max or even + * higher than capacity because of migrations but the + * local CPU is (newly) idle. There is at least one + * waiting task in this overloaded busiest group. Let's + * try to pull it. + */ + if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + env->migration_type = migrate_task; + env->imbalance = 1; + } + + return; + } + + if (busiest->group_weight == 1 || sds->prefer_sibling) { + unsigned int nr_diff = busiest->sum_h_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; + lsub_positive(&nr_diff, local->sum_h_nr_running); + env->imbalance = nr_diff >> 1; + return; + } + + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - + busiest->idle_cpus) >> 1); return; } /* - * If there aren't any idle CPUs, avoid creating some. + * Local is fully busy but has to take more load to relieve the + * busiest group */ - if (busiest->group_type == group_overloaded && - local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE; - if (load_above_capacity > busiest->group_capacity) { - load_above_capacity -= busiest->group_capacity; - load_above_capacity *= scale_load_down(NICE_0_LOAD); - load_above_capacity /= busiest->group_capacity; - } else - load_above_capacity = ~0UL; + if (local->group_type < group_overloaded) { + /* + * Local will become overloaded so the avg_load metrics are + * finally needed. 
+ */ + + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* - * We're trying to get all the CPUs to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded CPU below the average load. At the same time, - * we also don't want to reduce the group load below the group - * capacity. Thus we look for the minimum possible imbalance. + * Both group are or will become overloaded and we're trying to get all + * the CPUs to the average_load, so we don't want to push ourselves + * above the average load, nor do we wish to reduce the max loaded CPU + * below the average load. At the same time, we also don't want to + * reduce the group load below the group capacity. Thus we look for + * the minimum possible imbalance. */ - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ + env->migration_type = migrate_load; env->imbalance = min( - max_pull * busiest->group_capacity, + (busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - - /* Boost imbalance to allow misfit task to be balanced. */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = max_t(long, env->imbalance, - busiest->group_misfit_task_load); - } - } /******* find_busiest_group() helpers end here *********************/ +/* + * Decision matrix according to the local and busiest group type: + * + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded + * has_spare nr_idle balanced N/A N/A balanced balanced + * fully_busy nr_idle nr_idle N/A N/A balanced balanced + * misfit_task force N/A N/A N/A force force + * asym_packing force force N/A N/A force force + * imbalanced force force N/A N/A force force + * overloaded force force N/A N/A force avg_load + * + * N/A : Not Applicable because already filtered while updating + * statistics. + * balanced : The system is balanced for these 2 groups. + * force : Calculate the imbalance as load migration is probably needed. + * avg_load : Only if imbalance is significant enough. + * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite + * different in groups. + */ + /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. 
@@ -8360,17 +8519,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; - /* ASYM feature bypasses nice load balance check */ - if (busiest->group_asym_packing) - goto force_balance; - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_h_nr_running == 0) + if (!sds.busiest) goto out_balanced; - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) - / sds.total_capacity; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; /* * If the busiest group is imbalanced the below checks don't @@ -8380,56 +8539,65 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group - * capacities from resulting in underutilization due to avg_load. - */ - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && - busiest->group_no_capacity) - goto force_balance; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ - if (local->avg_load >= busiest->avg_load) + if (local->group_type > busiest->group_type) goto out_balanced; /* - * Don't pull any tasks if this group is already above the domain - * average load. + * When groups are overloaded, use the avg_load to ensure fairness + * between tasks. */ - if (local->avg_load >= sds.avg_load) - goto out_balanced; + if (local->group_type == group_overloaded) { + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) + goto out_balanced; + + /* XXX broken for overlapping NUMA groups */ + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / + sds.total_capacity; - if (env->idle == CPU_IDLE) { /* - * This CPU is idle. If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt idle CPUs, it is balanced. The imbalance becomes - * significant if the diff is greater than 1 otherwise we - * might end up to just move the imbalance on another group + * Don't pull any tasks if this group is already above the + * domain average load. */ - if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + if (local->avg_load >= sds.avg_load) goto out_balanced; - } else { + /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. + * If the busiest group is more loaded, use imbalance_pct to be + * conservative. 
*/ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } + /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_h_nr_running > local->sum_h_nr_running + 1) + goto force_balance; + + if (busiest->group_type != group_overloaded && + (env->idle == CPU_NOT_IDLE || + local->idle_cpus <= (busiest->idle_cpus + 1))) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt. idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 otherwise + * we might end up just moving the imbalance to another + * group. + */ + goto out_balanced; + force_balance: /* Looks like there is an imbalance. Compute it */ - env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8445,11 +8613,13 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_capacity = 1; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, load; + unsigned long capacity, load, util; + unsigned int nr_running; enum fbq_type rt; rq = cpu_rq(i); @@ -8477,20 +8647,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we simply - * seek the "biggest" misfit task. - */ - if (env->src_grp_type == group_misfit_task) { - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - continue; - } - capacity = capacity_of(i); + nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -8500,35 +8658,70 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && - rq->nr_running == 1) + nr_running == 1) continue; - load = cpu_runnable_load(rq); + switch (env->migration_type) { + case migrate_load: + /* + * When comparing with load imbalance, use + * cpu_runnable_load() which is not scaled with the CPU + * capacity. + */ + load = cpu_runnable_load(rq); - /* - * When comparing with imbalance, use cpu_runnable_load() - * which is not scaled with the CPU capacity. - */ + if (nr_running == 1 && load > env->imbalance && + !check_cpu_capacity(rq, env->sd)) + break; - if (rq->nr_running == 1 && load > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - continue; + /* + * For the load comparisons with the other CPUs, + * consider the cpu_runnable_load() scaled with the CPU + * capacity, so that the load can be moved away from + * the CPU that is potentially running at a lower + * capacity. + * + * Thus we're looking for max(load_i / capacity_i), + * crosswise multiplication to rid ourselves of the + * division works out to: + * load_i * capacity_j > load_j * capacity_i; + * where j is our previous maximum. 
+ */ + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; + busiest_capacity = capacity; + busiest = rq; + } + break; + + case migrate_util: + util = cpu_util(cpu_of(rq)); + + if (busiest_util < util) { + busiest_util = util; + busiest = rq; + } + break; + + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; + } + break; + + case migrate_misfit: + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we + * simply seek the "biggest" misfit task. + */ + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + break; - /* - * For the load comparisons with the other CPU's, consider - * the cpu_runnable_load() scaled with the CPU capacity, so - * that the load can be moved away from the CPU that is - * potentially running at a lower capacity. - * - * Thus we're looking for max(load_i / capacity_i), crosswise - * multiplication to rid ourselves of the division works out - * to: load_i * capacity_j > load_j * capacity_i; where j is - * our previous maximum. - */ - if (load * busiest_capacity > busiest_load * capacity) { - busiest_load = load; - busiest_capacity = capacity; - busiest = rq; } } @@ -8574,7 +8767,7 @@ voluntary_active_balance(struct lb_env *env) return 1; } - if (env->src_grp_type == group_misfit_task) + if (env->migration_type == migrate_misfit) return 1; return 0; -- cgit From 5e23e474431529b7d1480f649ce33d0e9c1b2e48 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:32 +0200 Subject: sched/fair: Use rq->nr_running when balancing load CFS load_balance() only takes care of CFS tasks whereas CPUs can be used by other scheduling classes. Typically, a CFS task preempted by an RT or deadline task will not get a chance to be pulled by another CPU because load_balance() doesn't take into account tasks from other classes. Add sum of nr_running in the statistics and use it to detect such situations. 
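[Editorial aside, not part of this patch: a self-contained toy in plain C of the situation described above. The struct, names and numbers are invented; the kernel reads rq->cfs.h_nr_running and rq->nr_running instead. The point: a CFS task preempted by an RT task is invisible to a CFS-only count, so the group still looks like it has spare room and the waiting task never gets pulled.]

#include <stdio.h>

struct toy_cpu {
	unsigned int cfs_tasks;	/* runnable CFS tasks only */
	unsigned int all_tasks;	/* all runnable tasks, any class */
};

int main(void)
{
	/* cpu0: one CFS task waiting behind a running RT task; cpu1: idle */
	struct toy_cpu grp[2] = { { .cfs_tasks = 1, .all_tasks = 2 },
				  { .cfs_tasks = 0, .all_tasks = 0 } };
	unsigned int sum_h = 0, sum_all = 0, weight = 2, i;

	for (i = 0; i < 2; i++) {
		sum_h += grp[i].cfs_tasks;
		sum_all += grp[i].all_tasks;
	}

	/* CFS-only count: the group looks like it still has spare room */
	printf("CFS-only count %u < weight %u -> spare room: %s\n",
	       sum_h, weight, sum_h < weight ? "yes" : "no");
	/* counting every class: the RT preemption becomes visible */
	printf("all-task count %u < weight %u -> spare room: %s\n",
	       sum_all, weight, sum_all < weight ? "yes" : "no");
	return 0;
}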
Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76a2aa8db471..4e7396c97239 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7694,6 +7694,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -7928,7 +7929,7 @@ static inline int sg_imbalanced(struct sched_group *group) static inline bool group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_h_nr_running < sgs->group_weight) + if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > @@ -7949,7 +7950,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_h_nr_running <= sgs->group_weight) + if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < @@ -8053,6 +8054,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8410,13 +8413,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } if (busiest->group_weight == 1 || sds->prefer_sibling) { - unsigned int nr_diff = busiest->sum_h_nr_running; + unsigned int nr_diff = busiest->sum_nr_running; /* * When prefer sibling, evenly spread running tasks on * groups. */ env->migration_type = migrate_task; - lsub_positive(&nr_diff, local->sum_h_nr_running); + lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; return; } @@ -8580,7 +8583,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* Try to move all excess tasks to child's sibling domain */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_h_nr_running > local->sum_h_nr_running + 1) + busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; if (busiest->group_type != group_overloaded && -- cgit From b0fb1eb4f04ae4768231b9731efb1134e22053a4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:33 +0200 Subject: sched/fair: Use load instead of runnable load in load_balance() 'runnable load' was originally introduced to take into account the case where blocked load biases the load balance decision which was selecting underutilized groups with huge blocked load whereas other groups were overloaded. The load is now only used when groups are overloaded. 
In this case, it's worth being conservative and taking into account the sleeping tasks that might wake up on the CPU. Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e7396c97239..e6a3db08481c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5375,6 +5375,11 @@ static unsigned long cpu_runnable_load(struct rq *rq) return cfs_rq_runnable_load_avg(&rq->cfs); } +static unsigned long cpu_load(struct rq *rq) +{ + return cfs_rq_load_avg(&rq->cfs); +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -8049,7 +8054,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += cpu_runnable_load(rq); + sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); sgs->sum_h_nr_running += rq->cfs.h_nr_running; @@ -8507,7 +8512,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) init_sd_lb_stats(&sds); /* - * Compute the various statistics relavent for load balancing at + * Compute the various statistics relevant for load balancing at * this level. */ update_sd_lb_stats(env, &sds); @@ -8667,11 +8672,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, switch (env->migration_type) { case migrate_load: /* - * When comparing with load imbalance, use - * cpu_runnable_load() which is not scaled with the CPU - * capacity. + * When comparing with load imbalance, use cpu_load() + * which is not scaled with the CPU capacity. */ - load = cpu_runnable_load(rq); + load = cpu_load(rq); if (nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd)) @@ -8679,10 +8683,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * For the load comparisons with the other CPUs, - * consider the cpu_runnable_load() scaled with the CPU - * capacity, so that the load can be moved away from - * the CPU that is potentially running at a lower - * capacity. + * consider the cpu_load() scaled with the CPU + * capacity, so that the load can be moved away + * from the CPU that is potentially running at a + * lower capacity. * * Thus we're looking for max(load_i / capacity_i), * crosswise multiplication to rid ourselves of the -- cgit From 2ab4092fc82d6001fdd9d51dbba27d04dec967e0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:34 +0200 Subject: sched/fair: Spread out tasks evenly when not overloaded When there is only one CPU per group, using the idle CPUs to evenly spread tasks doesn't make sense and nr_running is a better metrics. 
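[Editorial aside, not part of this patch: a tiny standalone C sketch of why idle-CPU counting stops working once each group holds a single CPU. All numbers are made up; the kernel computes the analogous quantities from sg_lb_stats.]

#include <stdio.h>

int main(void)
{
	unsigned int busiest_nr = 5, local_nr = 1;	/* tasks per single-CPU group */
	unsigned int busiest_idle = 0, local_idle = 0;	/* both CPUs are busy */

	/* idle-CPU based imbalance: only ever 0 or 1 idle CPU, no gradient */
	int by_idle = ((int)local_idle - (int)busiest_idle) / 2;
	/* nr_running based imbalance: move roughly half of the difference */
	int by_nr = ((int)busiest_nr - (int)local_nr) / 2;

	printf("imbalance by idle CPUs: %d, by nr_running: %d\n", by_idle, by_nr);
	return 0;
}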
Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6a3db08481c..f489f603f317 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8591,18 +8591,34 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; - if (busiest->group_type != group_overloaded && - (env->idle == CPU_NOT_IDLE || - local->idle_cpus <= (busiest->idle_cpus + 1))) - /* - * If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt. idle CPUs, it is balanced. The imbalance - * becomes significant if the diff is greater than 1 otherwise - * we might end up just moving the imbalance to another - * group. - */ - goto out_balanced; + if (busiest->group_type != group_overloaded) { + if (env->idle == CPU_NOT_IDLE) + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; + + if (busiest->group_weight > 1 && + local->idle_cpus <= (busiest->idle_cpus + 1)) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest + * group wrt idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 + * otherwise we might end up to just move the imbalance + * on another group. Of course this applies only if + * there is more than 1 CPU per group. + */ + goto out_balanced; + + if (busiest->sum_h_nr_running == 1) + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; + } force_balance: /* Looks like there is an imbalance. Compute it */ -- cgit From c63be7be59de65d12ff7b4329acea99cf734d6de Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:35 +0200 Subject: sched/fair: Use utilization to select misfit task Utilization is used to detect a misfit task but the load is then used to select the task on the CPU which can lead to select a small task with high weight instead of the task that triggered the misfit migration. Check that task can't fit the CPU's capacity when selecting the misfit task instead of using the load. 
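[Editorial aside, not part of this patch: a rough, self-contained sketch in the spirit of task_fits_capacity(), not the kernel's exact code. The margin value and CPU capacities below are assumptions used only to show the idea: a task "fits" a CPU when its utilization leaves roughly 20% headroom, independently of its nice weight.]

#include <stdbool.h>
#include <stdio.h>

#define TOY_CAPACITY_MARGIN	1280	/* ~1.25x headroom on a 1024 scale */

static bool toy_task_fits(unsigned long task_util, unsigned long cpu_cap)
{
	return cpu_cap * 1024 > task_util * TOY_CAPACITY_MARGIN;
}

int main(void)
{
	/* little CPU (capacity 430) vs big CPU (1024), task utilization 400 */
	printf("fits little: %d, fits big: %d\n",
	       toy_task_fits(400, 430), toy_task_fits(400, 1024));
	return 0;
}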
Signed-off-by: Vincent Guittot Acked-by: Valentin Schneider Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/1571405198-27570-9-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f489f603f317..1fd6f3917fe7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7408,13 +7408,8 @@ static int detach_tasks(struct lb_env *env) break; case migrate_misfit: - load = task_h_load(p); - - /* - * Load of misfit task might decrease a bit since it has - * been recorded. Be conservative in the condition. - */ - if (load/2 < env->imbalance) + /* This is not a misfit task */ + if (task_fits_capacity(p, capacity_of(env->src_cpu))) goto next; env->imbalance = 0; @@ -8358,7 +8353,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (busiest->group_type == group_misfit_task) { /* Set imbalance to allow misfit tasks to be balanced. */ env->migration_type = migrate_misfit; - env->imbalance = busiest->group_misfit_task_load; + env->imbalance = 1; return; } -- cgit From 11f10e5420f6cecac7d4823638bff040c257aba9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:36 +0200 Subject: sched/fair: Use load instead of runnable load in wakeup path Runnable load was originally introduced to take into account the case where blocked load biases the wake up path which may end to select an overloaded CPU with a large number of runnable tasks instead of an underutilized CPU with a huge blocked load. Tha wake up path now starts looking for idle CPUs before comparing runnable load and it's worth aligning the wake up path with the load_balance() logic. 
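[Editorial aside, not part of this patch: toy numbers, not PELT math, illustrating the difference between the two signals. A CPU whose heavy task just went to sleep has no runnable load left, but the blocked contribution is still visible in load_avg, so a placement decision based on load_avg avoids stacking new work on top of a task that is about to wake up there.]

#include <stdio.h>

int main(void)
{
	/* cpu0: heavy task just blocked; cpu1: one light task running */
	unsigned long runnable_load[2] = {   0, 200 };
	unsigned long load_avg[2]      = { 800, 200 };

	int pick_runnable = runnable_load[0] < runnable_load[1] ? 0 : 1;
	int pick_load     = load_avg[0]      < load_avg[1]      ? 0 : 1;

	printf("runnable load picks cpu%d, load_avg picks cpu%d\n",
	       pick_runnable, pick_load);
	return 0;
}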
Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-10-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1fd6f3917fe7..b0703b461460 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1474,7 +1474,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long cpu_runnable_load(struct rq *rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); + +static unsigned long cpu_runnable_load(struct rq *rq) +{ + return cfs_rq_runnable_load_avg(&rq->cfs); +} /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -5370,11 +5375,6 @@ static int sched_idle_cpu(int cpu) rq->nr_running); } -static unsigned long cpu_runnable_load(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); @@ -5475,7 +5475,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5493,7 +5493,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5581,7 +5581,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -5722,7 +5722,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this continue; } - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; -- cgit From fc1273f4cefe6670d528715581c848abf64f391c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:37 +0200 Subject: sched/fair: Optimize find_idlest_group() find_idlest_group() now reads CPU's load_avg in two different ways. Consolidate the function to read and use load_avg only once and simplify the algorithm to only look for the group with lowest load_avg. 
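[Editorial aside, not part of this patch: a minimal standalone sketch of the simplified selection, with invented group values. Each group's CPU loads are summed once, normalized by the group's capacity, and the group with the lowest result wins; the kernel's find_idlest_group() additionally keeps the spare-capacity and local-group handling shown in the diff.]

#include <stdio.h>

#define TOY_SCALE 1024

struct toy_group {
	unsigned long load;	/* sum of cpu_load() over the group */
	unsigned long capacity;	/* group capacity, TOY_SCALE per big CPU */
};

int main(void)
{
	struct toy_group g[3] = { { 900, 2048 }, { 700, 1024 }, { 300, 1024 } };
	unsigned long best = (unsigned long)-1;
	int i, idlest = -1;

	for (i = 0; i < 3; i++) {
		unsigned long scaled = g[i].load * TOY_SCALE / g[i].capacity;

		if (scaled < best) {
			best = scaled;
			idlest = i;
		}
	}
	printf("idlest group: %d (scaled load %lu)\n", idlest, best);
	return 0;
}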
Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b0703b461460..95a57c789885 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5550,16 +5550,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; unsigned long imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; do { - unsigned long load, avg_load, runnable_load; + unsigned long load; unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5576,15 +5574,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * Tally up the load of all CPUs in the group and find * the group containing the CPU with most spare capacity. */ - avg_load = 0; - runnable_load = 0; + load = 0; max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - load = cpu_load(cpu_rq(i)); - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); + load += cpu_load(cpu_rq(i)); spare_cap = capacity_spare_without(i, p); @@ -5593,31 +5587,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + load = (load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; + this_load = load; this_spare = max_spare_cap; } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; + if (load < min_load) { + min_load = load; idlest = group; } @@ -5658,18 +5636,18 @@ skip_spare: * local domain to be very lightly loaded relative to the remote * domains but "imbalance" skews the comparison making remote CPUs * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. + * imbalance to the load on the remote node and consider staying + * local. 
*/ if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) + min_load + imbalance >= this_load) return NULL; - if (min_runnable_load > (this_runnable_load + imbalance)) + if (min_load >= this_load + imbalance) return NULL; - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) + if ((this_load < (min_load + imbalance)) && + (100*this_load < imbalance_scale*min_load)) return NULL; return idlest; -- cgit From 57abff067a084889b6e06137e61a3dc3458acd56 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 18 Oct 2019 15:26:38 +0200 Subject: sched/fair: Rework find_idlest_group() The slow wake up path computes per sched_group statisics to select the idlest group, which is quite similar to what load_balance() is doing for selecting busiest group. Rework find_idlest_group() to classify the sched_group and select the idlest one following the same steps as load_balance(). Signed-off-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/1571405198-27570-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 384 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 256 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95a57c789885..a81c36472822 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5531,127 +5531,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. 
- */ - load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - load += cpu_load(cpu_rq(i)); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - load = (load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_load = load; - this_spare = max_spare_cap; - } else { - if (load < min_load) { - min_load = load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the load on the remote node and consider staying - * local. - */ - if ((sd->flags & SD_NUMA) && - min_load + imbalance >= this_load) - return NULL; - - if (min_load >= this_load + imbalance) - return NULL; - - if ((this_load < (min_load + imbalance)) && - (100*this_load < imbalance_scale*min_load)) - return NULL; - - return idlest; -} + int this_cpu, int sd_flag); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5724,7 +5606,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -7905,13 +7787,13 @@ static inline int sg_imbalanced(struct sched_group *group) * any benefit for the load balance. */ static inline bool -group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running < sgs->group_weight) return true; if ((sgs->group_capacity * 100) > - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7926,13 +7808,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) * false. 
*/ static inline bool -group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -7959,11 +7841,11 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) } static inline enum -group_type group_classify(struct lb_env *env, +group_type group_classify(unsigned int imbalance_pct, struct sched_group *group, struct sg_lb_stats *sgs) { - if (group_is_overloaded(env, sgs)) + if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; if (sg_imbalanced(group)) @@ -7975,7 +7857,7 @@ group_type group_classify(struct lb_env *env, if (sgs->group_misfit_task_load) return group_misfit_task; - if (!group_has_capacity(env, sgs)) + if (!group_has_capacity(imbalance_pct, sgs)) return group_fully_busy; return group_has_spare; @@ -8076,7 +7958,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - sgs->group_type = group_classify(env, group, sgs); + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) @@ -8231,6 +8113,252 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ + +struct sg_lb_stats; + +/* + * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. + * @denv: The ched_domain level to look for idlest group. + * @group: sched_group whose statistics are to be updated. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_wakeup_stats(struct sched_domain *sd, + struct sched_group *group, + struct sg_lb_stats *sgs, + struct task_struct *p) +{ + int i, nr_running; + + memset(sgs, 0, sizeof(*sgs)); + + for_each_cpu(i, sched_group_span(group)) { + struct rq *rq = cpu_rq(i); + + sgs->group_load += cpu_load(rq); + sgs->group_util += cpu_util_without(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running; + + nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) + sgs->idle_cpus++; + + + } + + /* Check if task fits in the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + !task_fits_capacity(p, group->sgc->max_capacity)) { + sgs->group_misfit_task_load = 1; + } + + sgs->group_capacity = group->sgc->capacity; + + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); + + /* + * Computing avg_load makes sense only when group is fully busy or + * overloaded + */ + if (sgs->group_type < group_fully_busy) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; +} + +static bool update_pick_idlest(struct sched_group *idlest, + struct sg_lb_stats *idlest_sgs, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_type < idlest_sgs->group_type) + return true; + + if (sgs->group_type > idlest_sgs->group_type) + return false; + + /* + * The candidate and the current idlest group are the same type of + * group. Let check which one is the idlest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + case group_fully_busy: + /* Select the group with lowest avg_load. 
*/ + if (idlest_sgs->avg_load <= sgs->avg_load) + return false; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those types are not used in the slow wakeup path */ + return false; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (idlest->sgc->max_capacity >= group->sgc->max_capacity) + return false; + break; + + case group_has_spare: + /* Select group with most idle CPUs */ + if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + return false; + break; + } + + return true; +} + +/* + * find_idlest_group() finds and returns the least busy CPU group within the + * domain. + * + * Assumes p is allowed on at least one CPU in sd. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int sd_flag) +{ + struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; + struct sg_lb_stats local_sgs, tmp_sgs; + struct sg_lb_stats *sgs; + unsigned long imbalance; + struct sg_lb_stats idlest_sgs = { + .avg_load = UINT_MAX, + .group_type = group_overloaded, + }; + + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + int local_group; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_span(group), + p->cpus_ptr)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + + if (local_group) { + sgs = &local_sgs; + local = group; + } else { + sgs = &tmp_sgs; + } + + update_sg_wakeup_stats(sd, group, sgs, p); + + if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { + idlest = group; + idlest_sgs = *sgs; + } + + } while (group = group->next, group != sd->groups); + + + /* There is no idlest group to push tasks to */ + if (!idlest) + return NULL; + + /* + * If the local group is idler than the selected idlest group + * don't try and push the task. + */ + if (local_sgs.group_type < idlest_sgs.group_type) + return NULL; + + /* + * If the local group is busier than the selected idlest group + * try and push the task. + */ + if (local_sgs.group_type > idlest_sgs.group_type) + return idlest; + + switch (local_sgs.group_type) { + case group_overloaded: + case group_fully_busy: + /* + * When comparing groups across NUMA domains, it's possible for + * the local domain to be very lightly loaded relative to the + * remote domains but "imbalance" skews the comparison making + * remote CPUs look much more favourable. When considering + * cross-domain, add imbalance to the load on the remote node + * and consider staying local. + */ + + if ((sd->flags & SD_NUMA) && + ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) + return NULL; + + /* + * If the local group is less loaded than the selected + * idlest group don't try and push any tasks. 
+ */ + if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) + return NULL; + + if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) + return NULL; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those type are not used in the slow wakeup path */ + return NULL; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (local->sgc->max_capacity >= idlest->sgc->max_capacity) + return NULL; + break; + + case group_has_spare: + if (sd->flags & SD_NUMA) { +#ifdef CONFIG_NUMA_BALANCING + int idlest_cpu; + /* + * If there is spare capacity at NUMA, try to select + * the preferred node + */ + if (cpu_to_node(this_cpu) == p->numa_preferred_nid) + return NULL; + + idlest_cpu = cpumask_first(sched_group_span(idlest)); + if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) + return idlest; +#endif + /* + * Otherwise, keep the task on this node to stay close + * its wakeup source and improve locality. If there is + * a real need of migration, periodic load balance will + * take care of it. + */ + if (local_sgs.idle_cpus) + return NULL; + } + + /* + * Select group with highest number of idle CPUs. We could also + * compare the utilization which is more stable but it can end + * up that the group has less spare capacity but finally more + * idle CPUs which means more opportunity to run task. + */ + if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) + return NULL; + break; + } + + return idlest; +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. -- cgit From 5e6c3c7b1ec217c1c4c95d9148182302b9969b97 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 21 Oct 2019 10:33:54 +0200 Subject: perf/aux: Fix tracking of auxiliary trace buffer allocation The following commit from the v5.4 merge window: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") ... breaks auxiliary trace buffer tracking. 
If I run command 'perf record -e rbd000' to record samples and saving them in the **auxiliary** trace buffer then the value of 'locked_vm' becomes negative after all trace buffers have been allocated and released: During allocation the values increase: [52.250027] perf_mmap user->locked_vm:0x87 pinned_vm:0x0 ret:0 [52.250115] perf_mmap user->locked_vm:0x107 pinned_vm:0x0 ret:0 [52.250251] perf_mmap user->locked_vm:0x188 pinned_vm:0x0 ret:0 [52.250326] perf_mmap user->locked_vm:0x208 pinned_vm:0x0 ret:0 [52.250441] perf_mmap user->locked_vm:0x289 pinned_vm:0x0 ret:0 [52.250498] perf_mmap user->locked_vm:0x309 pinned_vm:0x0 ret:0 [52.250613] perf_mmap user->locked_vm:0x38a pinned_vm:0x0 ret:0 [52.250715] perf_mmap user->locked_vm:0x408 pinned_vm:0x2 ret:0 [52.250834] perf_mmap user->locked_vm:0x408 pinned_vm:0x83 ret:0 [52.250915] perf_mmap user->locked_vm:0x408 pinned_vm:0x103 ret:0 [52.251061] perf_mmap user->locked_vm:0x408 pinned_vm:0x184 ret:0 [52.251146] perf_mmap user->locked_vm:0x408 pinned_vm:0x204 ret:0 [52.251299] perf_mmap user->locked_vm:0x408 pinned_vm:0x285 ret:0 [52.251383] perf_mmap user->locked_vm:0x408 pinned_vm:0x305 ret:0 [52.251544] perf_mmap user->locked_vm:0x408 pinned_vm:0x386 ret:0 [52.251634] perf_mmap user->locked_vm:0x408 pinned_vm:0x406 ret:0 [52.253018] perf_mmap user->locked_vm:0x408 pinned_vm:0x487 ret:0 [52.253197] perf_mmap user->locked_vm:0x408 pinned_vm:0x508 ret:0 [52.253374] perf_mmap user->locked_vm:0x408 pinned_vm:0x589 ret:0 [52.253550] perf_mmap user->locked_vm:0x408 pinned_vm:0x60a ret:0 [52.253726] perf_mmap user->locked_vm:0x408 pinned_vm:0x68b ret:0 [52.253903] perf_mmap user->locked_vm:0x408 pinned_vm:0x70c ret:0 [52.254084] perf_mmap user->locked_vm:0x408 pinned_vm:0x78d ret:0 [52.254263] perf_mmap user->locked_vm:0x408 pinned_vm:0x80e ret:0 The value of user->locked_vm increases to a limit then the memory is tracked by pinned_vm. During deallocation the size is subtracted from pinned_vm until it hits a limit. Then a larger value is subtracted from locked_vm leading to a large number (because of type unsigned): [64.267797] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x78d [64.267826] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x70c [64.267848] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x68b [64.267869] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x60a [64.267891] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x589 [64.267911] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x508 [64.267933] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x487 [64.267952] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x406 [64.268883] perf_mmap_close mmap_user->locked_vm:0x307 pinned_vm:0x406 [64.269117] perf_mmap_close mmap_user->locked_vm:0x206 pinned_vm:0x406 [64.269433] perf_mmap_close mmap_user->locked_vm:0x105 pinned_vm:0x406 [64.269536] perf_mmap_close mmap_user->locked_vm:0x4 pinned_vm:0x404 [64.269797] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff84 pinned_vm:0x303 [64.270105] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff04 pinned_vm:0x202 [64.270374] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe84 pinned_vm:0x101 [64.270628] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe04 pinned_vm:0x0 This value sticks for the user until system is rebooted, causing follow-on system calls using locked_vm resource limit to fail. Note: There is no issue using the normal trace buffer. In fact the issue is in perf_mmap_close(). 
During allocation auxiliary trace buffer memory is either traced as 'extra' and added to 'pinned_vm' or trace as 'user_extra' and added to 'locked_vm'. This applies for normal trace buffers and auxiliary trace buffer. However in function perf_mmap_close() all auxiliary trace buffer is subtraced from 'locked_vm' and never from 'pinned_vm'. This breaks the ballance. Signed-off-by: Thomas Richter Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: gor@linux.ibm.com Cc: hechaol@fb.com Cc: heiko.carstens@de.ibm.com Cc: linux-perf-users@vger.kernel.org Cc: songliubraving@fb.com Fixes: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") Link: https://lkml.kernel.org/r/20191021083354.67868-1-tmricht@linux.ibm.com [ Minor readability edits. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f5d7950d1931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5607,8 +5607,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + if (!rb->aux_mmap_locked) + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + else + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); -- cgit From b612e5df4587c934bd056bf05f4a1deca4de4f75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:37 +0200 Subject: clone3: add CLONE_CLEAR_SIGHAND Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL. Mutually exclusive with CLONE_SIGHAND to not disturb other thread's signal handler. In the spirit of closer cooperation between glibc developers and kernel developers (cf. [2]) this patchset came out of a discussion on the glibc mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel support for this feature has been explicitly requested by glibc and I see no reason not to help them with this. The child helper process on Linux posix_spawn must ensure that no signal handlers are enabled, so the signal disposition must be either SIG_DFL or SIG_IGN. However, it requires a sigprocmask to obtain the current signal mask and at least _NSIG sigaction calls to reset the signal handlers for each posix_spawn call or complex state tracking that might lead to data corruption in glibc. Adding this flags lets glibc avoid these problems. [1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html [3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html [4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html [2]: https://lwn.net/Articles/799331/ '[...] by asking for better cooperation with the C-library projects in general. They should be copied on patches containing ABI changes, for example. I noted that there are often times where C-library developers wish the kernel community had done things differently; how could those be avoided in the future? Members of the audience suggested that more glibc developers should perhaps join the linux-api list. The other suggestion was to "copy Florian on everything".' 
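[Editorial aside, not part of this patch: a hedged userspace sketch of how a spawner might use the new flag, assuming a kernel that provides clone3. The syscall number and flag value are defined locally in case installed headers predate them, and the local struct mirrors only the first revision of clone_args; none of this is glibc API.]

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_clone3
#define __NR_clone3 435			/* assumption: generic syscall number */
#endif
#ifndef CLONE_CLEAR_SIGHAND
#define CLONE_CLEAR_SIGHAND 0x100000000ULL
#endif

struct toy_clone_args {			/* mirrors the initial clone_args layout */
	uint64_t flags;
	uint64_t pidfd;
	uint64_t child_tid;
	uint64_t parent_tid;
	uint64_t exit_signal;
	uint64_t stack;
	uint64_t stack_size;
	uint64_t tls;
};

int main(void)
{
	struct toy_clone_args args;
	long pid;

	memset(&args, 0, sizeof(args));
	/* fork-style clone3(); must not be combined with CLONE_SIGHAND */
	args.flags = CLONE_CLEAR_SIGHAND;
	args.exit_signal = SIGCHLD;

	pid = syscall(__NR_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0) {
		/* child: handlers are back to SIG_DFL (SIG_IGN is kept) */
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}
	waitpid((pid_t)pid, NULL, 0);
	return 0;
}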
Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191014104538.3096-1-christian.brauner@ubuntu.com --- kernel/fork.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ffa314838b43..954e875e72b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(¤t->sighand->siglock); + + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ + if (clone_flags & CLONE_CLEAR_SIGHAND) + flush_signal_handlers(tsk, 0); + return 0; } @@ -2619,11 +2624,8 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, static bool clone3_args_valid(const struct kernel_clone_args *kargs) { - /* - * All lower bits of the flag word are taken. - * Verify that no other unknown flags are passed along. - */ - if (kargs->flags & ~CLONE_LEGACY_FLAGS) + /* Verify that no unknown flags are passed along. */ + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) return false; /* @@ -2633,6 +2635,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) + return false; + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; -- cgit From ce197d83a9fc42795c248c90983bf05faf0f013b Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Sat, 19 Oct 2019 13:19:31 +0200 Subject: xdp: Handle device unregister for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems I forgot to add handling of devmap_hash type maps to the device unregister hook for devmaps. This omission causes devices to not be properly released, which causes hangs. Fix this by adding the missing handler. 
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191019111931.2981954-1-toke@redhat.com --- kernel/bpf/devmap.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c0a48f336997..3867864cdc2f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -719,6 +719,32 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, }; +static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, + struct net_device *netdev) +{ + unsigned long flags; + u32 i; + + spin_lock_irqsave(&dtab->index_lock, flags); + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; + + head = dev_map_index_hash(dtab, i); + + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + if (netdev != dev->dev) + continue; + + dtab->items--; + hlist_del_rcu(&dev->index_hlist); + call_rcu(&dev->rcu, __dev_map_entry_free); + } + } + spin_unlock_irqrestore(&dtab->index_lock, flags); +} + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { @@ -735,6 +761,11 @@ static int dev_map_notification(struct notifier_block *notifier, */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dev_map_hash_remove_netdev(dtab, netdev); + continue; + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; -- cgit From 6b1340cc00edeadd52ebd8a45171f38c8de2a387 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 15 Oct 2019 11:47:25 +0530 Subject: tracing: Fix race in perf_trace_buf initialization A race condition exists while initialiazing perf_trace_buf from perf_trace_init() and perf_kprobe_init(). CPU0 CPU1 perf_trace_init() mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() total_ref_count == 0 buf = alloc_percpu() perf_trace_buf[i] = buf tp_event->class->reg() //fails perf_kprobe_init() goto fail perf_trace_event_init() perf_trace_event_reg() fail: total_ref_count == 0 total_ref_count == 0 buf = alloc_percpu() perf_trace_buf[i] = buf tp_event->class->reg() total_ref_count++ free_percpu(perf_trace_buf[i]) perf_trace_buf[i] = NULL Any subsequent call to perf_trace_event_reg() will observe total_ref_count > 0, causing the perf_trace_buf to be always NULL. This can result in perf_trace_buf getting accessed from perf_trace_buf_alloc() without being initialized. Acquiring event_mutex in perf_kprobe_init() before calling perf_trace_event_init() should fix this race. 
The race caused the following bug: Unable to handle kernel paging request at virtual address 0000003106f2003c Mem abort info: ESR = 0x96000045 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000045 CM = 0, WnR = 1 user pgtable: 4k pages, 39-bit VAs, pgdp = ffffffc034b9b000 [0000003106f2003c] pgd=0000000000000000, pud=0000000000000000 Internal error: Oops: 96000045 [#1] PREEMPT SMP Process syz-executor (pid: 18393, stack limit = 0xffffffc093190000) pstate: 80400005 (Nzcv daif +PAN -UAO) pc : __memset+0x20/0x1ac lr : memset+0x3c/0x50 sp : ffffffc09319fc50 __memset+0x20/0x1ac perf_trace_buf_alloc+0x140/0x1a0 perf_trace_sys_enter+0x158/0x310 syscall_trace_enter+0x348/0x7c0 el0_svc_common+0x11c/0x368 el0_svc_handler+0x12c/0x198 el0_svc+0x8/0xc Ramdumps showed the following: total_ref_count = 3 perf_trace_buf = ( 0x0 -> NULL, 0x0 -> NULL, 0x0 -> NULL, 0x0 -> NULL) Link: http://lkml.kernel.org/r/1571120245-4186-1-git-send-email-prsood@codeaurora.org Cc: stable@vger.kernel.org Fixes: e12f03d7031a9 ("perf/core: Implement the 'perf_kprobe' PMU") Acked-by: Song Liu Signed-off-by: Prateek Sood Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0892e38ed6fb..a9dfa04ffa44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -272,9 +272,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } + mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_kprobe(tp_event); + mutex_unlock(&event_mutex); out: kfree(func); return ret; @@ -282,8 +284,10 @@ out: void perf_kprobe_destroy(struct perf_event *p_event) { + mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); } -- cgit From f3a519e4add93b7b31a6616f0b09635ff2e6a159 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 22 Oct 2019 10:39:40 +0300 Subject: perf/aux: Fix AUX output stopping Commit: 8a58ddae2379 ("perf/core: Fix exclusive events' grouping") allows CAP_EXCLUSIVE events to be grouped with other events. Since all of those also happen to be AUX events (which is not the case the other way around, because arch/s390), this changes the rules for stopping the output: the AUX event may not be on its PMU's context any more, if it's grouped with a HW event, in which case it will be on that HW event's context instead. If that's the case, munmap() of the AUX buffer can't find and stop the AUX event, potentially leaving the last reference with the atomic context, which will then end up freeing the AUX buffer. This will then trip warnings: Fix this by using the context's PMU context when looking for events to stop, instead of the event's PMU context. 
Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20191022073940.61814-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f5d7950d1931..bb3748d29b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6949,7 +6949,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->pmu; + struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, -- cgit From cd7455f1013ef96d5cbf5c05d2b7c06f273810a6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 22 Oct 2019 15:57:23 +0200 Subject: bpf: Fix use after free in subprog's jited symbol removal syzkaller managed to trigger the following crash: [...] BUG: unable to handle page fault for address: ffffc90001923030 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD aa551067 P4D aa551067 PUD aa552067 PMD a572b067 PTE 80000000a1173163 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 7982 Comm: syz-executor912 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:bpf_jit_binary_hdr include/linux/filter.h:787 [inline] RIP: 0010:bpf_get_prog_addr_region kernel/bpf/core.c:531 [inline] RIP: 0010:bpf_tree_comp kernel/bpf/core.c:600 [inline] RIP: 0010:__lt_find include/linux/rbtree_latch.h:115 [inline] RIP: 0010:latch_tree_find include/linux/rbtree_latch.h:208 [inline] RIP: 0010:bpf_prog_kallsyms_find kernel/bpf/core.c:674 [inline] RIP: 0010:is_bpf_text_address+0x184/0x3b0 kernel/bpf/core.c:709 [...] Call Trace: kernel_text_address kernel/extable.c:147 [inline] __kernel_text_address+0x9a/0x110 kernel/extable.c:102 unwind_get_return_address+0x4c/0x90 arch/x86/kernel/unwind_frame.c:19 arch_stack_walk+0x98/0xe0 arch/x86/kernel/stacktrace.c:26 stack_trace_save+0xb6/0x150 kernel/stacktrace.c:123 save_stack mm/kasan/common.c:69 [inline] set_track mm/kasan/common.c:77 [inline] __kasan_kmalloc+0x11c/0x1b0 mm/kasan/common.c:510 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:518 slab_post_alloc_hook mm/slab.h:584 [inline] slab_alloc mm/slab.c:3319 [inline] kmem_cache_alloc+0x1f5/0x2e0 mm/slab.c:3483 getname_flags+0xba/0x640 fs/namei.c:138 getname+0x19/0x20 fs/namei.c:209 do_sys_open+0x261/0x560 fs/open.c:1091 __do_sys_open fs/open.c:1115 [inline] __se_sys_open fs/open.c:1110 [inline] __x64_sys_open+0x87/0x90 fs/open.c:1110 do_syscall_64+0xf7/0x1c0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe [...] After further debugging it turns out that we walk kallsyms while in parallel we tear down a BPF program which contains subprograms that have been JITed though the program itself has not been fully exposed and is eventually bailing out with error. The bpf_prog_kallsyms_del_subprogs() in bpf_prog_load()'s error path removes the symbols, however, bpf_prog_free() tears down the JIT memory too early via scheduled work. Instead, it needs to properly respect RCU grace period as the kallsyms walk for BPF is under RCU. 
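The fix described next leans on the standard RCU deferral pattern: unpublish the object first, then hand the final kfree() to an RCU callback so that lockless readers (the kallsyms walk in this case) still inside rcu_read_lock() can never touch freed memory. A minimal, hypothetical kernel-style sketch of that pattern, with illustrative names only:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	int value;
	struct rcu_head rcu;
};

static void item_free_rcu(struct rcu_head *head)
{
	struct item *it = container_of(head, struct item, rcu);

	kfree(it);	/* runs only after all pre-existing readers finish */
}

static void item_release(struct item *it)
{
	/* unpublish first (e.g. list_del_rcu()), then defer the free */
	call_rcu(&it->rcu, item_free_rcu);
}

bpf_prog_free(), by contrast, was tearing down the JIT image through scheduled work without waiting for that grace period, which is what the refactoring below corrects.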
Fix it by refactoring __bpf_prog_put()'s tear down and reuse it in our error path where we defer final destruction when we have subprogs in the program. Fixes: 7d1982b4e335 ("bpf: fix panic in prog load calls cleanup") Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: syzbot+710043c5d1d5b5013bc7@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: syzbot+710043c5d1d5b5013bc7@syzkaller.appspotmail.com Link: https://lore.kernel.org/bpf/55f6367324c2d7e9583fa9ccf5385dcbba0d7a6e.1571752452.git.daniel@iogearbox.net --- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 66088a9e9b9e..ef0e1e3e66f4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -502,7 +502,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); } -void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 82eabd4e38ad..bcfc362de4f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1332,18 +1332,26 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } +static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) +{ + bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); + kvfree(prog->aux->func_info); + bpf_prog_free_linfo(prog); + + if (deferred) + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + else + __bpf_prog_put_rcu(&prog->aux->rcu); +} + static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - bpf_prog_kallsyms_del_all(prog); - btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); - bpf_prog_free_linfo(prog); - - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + __bpf_prog_put_noref(prog, true); } } @@ -1741,11 +1749,12 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: - bpf_prog_free_linfo(prog); - kvfree(prog->aux->func_info); - btf_put(prog->aux->btf); - bpf_prog_kallsyms_del_subprogs(prog); - free_used_maps(prog->aux); + /* In case we have subprogs, we need to wait for a grace + * period before we can tear down JIT memory since symbols + * are already exposed under kallsyms. + */ + __bpf_prog_put_noref(prog, prog->aux->func_cnt); + return err; free_prog: bpf_prog_uncharge_memlock(prog); free_prog_sec: -- cgit From 3b4d9eb2ee74dd5ea7fa36cffb0ca7f5bc4924da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 22 Oct 2019 23:30:38 +0200 Subject: bpf: Fix use after free in bpf_get_prog_name There is one more problematic case I noticed while recently fixing BPF kallsyms handling in cd7455f1013e ("bpf: Fix use after free in subprog's jited symbol removal") and that is bpf_get_prog_name(). If BTF has been attached to the prog, then we may be able to fetch the function signature type id in kallsyms through prog->aux->func_info[prog->aux->func_idx].type_id. 
However, while the BTF object itself is torn down via RCU callback, the prog's aux->func_info is immediately freed via kvfree(prog->aux->func_info) once the prog's refcount either hit zero or when subprograms were already exposed via kallsyms and we hit the error path added in 5482e9a93c83 ("bpf: Fix memleak in aux->func_info and aux->btf"). This violates RCU as well since kallsyms could be walked in parallel where we could access aux->func_info. Hence, defer kvfree() to after RCU grace period. Looking at ba64e7d85252 ("bpf: btf: support proper non-jit func info") there is no reason/dependency where we couldn't defer the kvfree(aux->func_info) into the RCU callback. Fixes: 5482e9a93c83 ("bpf: Fix memleak in aux->func_info and aux->btf") Fixes: ba64e7d85252 ("bpf: btf: support proper non-jit func info") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/875f2906a7c1a0691f2d567b4d8e4ea2739b1e88.1571779205.git.daniel@iogearbox.net --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bcfc362de4f2..0937719b87e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1326,6 +1326,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + kvfree(aux->func_info); free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); @@ -1336,7 +1337,6 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); bpf_prog_free_linfo(prog); if (deferred) -- cgit From 086ee46b08634a999bcd1707eabe3b0dc1806674 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Tue, 22 Oct 2019 14:12:26 +0100 Subject: timers/sched_clock: Include local timekeeping.h for missing declarations Include the timekeeping.h header to get the declaration of the sched_clock_{suspend,resume} functions. Fixes the following sparse warnings: kernel/time/sched_clock.c:275:5: warning: symbol 'sched_clock_suspend' was not declared. Should it be static? kernel/time/sched_clock.c:286:6: warning: symbol 'sched_clock_resume' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191022131226.11465-1-ben.dooks@codethink.co.uk --- kernel/time/sched_clock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 142b07619918..dbd69052eaa6 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -17,6 +17,8 @@ #include #include +#include "timekeeping.h" + /** * struct clock_read_data - data required to read from sched_clock() * -- cgit From 7f2cbcbcafbca5360efbd175b3320852b8f7c637 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 21 Oct 2019 15:44:12 +0800 Subject: posix-cpu-timers: Fix two trivial comments Recent changes modified the function arguments of thread_group_sample_cputime() and task_cputimers_expired(), but forgot to update the comments. Fix it up. [ tglx: Changed the argument name of task_cputimers_expired() as the pointer points to an array of samples. 
] Fixes: b7be4ef1365d ("posix-cpu-timers: Switch thread group sampling to array") Fixes: 001f7971433a ("posix-cpu-timers: Make expiry checks array based") Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571643852-21848-1-git-send-email-wang.yi59@zte.com.cn --- kernel/time/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 92a431981b1c..42d512fcfda2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -266,7 +266,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, /** * thread_group_sample_cputime - Sample cputime for a given task * @tsk: Task for which cputime needs to be started - * @iimes: Storage for time samples + * @samples: Storage for time samples * * Called from sys_getitimer() to calculate the expiry time of an active * timer. That means group cputime accounting is already active. Called @@ -1038,12 +1038,12 @@ unlock: * member of @pct->bases[CLK].nextevt. False otherwise */ static inline bool -task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) +task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct) { int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (sample[i] >= pct->bases[i].nextevt) + if (samples[i] >= pct->bases[i].nextevt) return true; } return false; -- cgit From ce4dd4429b3c7e4506870796f3b8b06d707d2928 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Oct 2019 15:13:41 +0100 Subject: Remove the nr_exclusive argument from __wake_up_sync_key() Remove the nr_exclusive argument from __wake_up_sync_key() and derived functions as everything seems to set it to 1. Note also that if it wasn't set to 1, it would clear WF_SYNC anyway. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- kernel/exit.c | 2 +- kernel/sched/wait.c | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..a1ff25ef050e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1435,7 +1435,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); + TASK_INTERRUPTIBLE, p); } static long do_wait(struct wait_opts *wo) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c1e566a114ca..b4b52361dab7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule @@ -183,26 +182,21 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * accessing the task state. 
*/ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) + void *key) { - int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!wq_head)) return; - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); + __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) { - __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -- cgit From d07ce4e32a8d68062c58a3e635619313c52d0bf7 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 21 Oct 2019 11:10:56 +0100 Subject: kdb: Avoid array subscript warnings on non-SMP builds Recent versions of gcc (reported on gcc-7.4) issue array subscript warnings for builds where SMP is not enabled. kernel/debug/debug_core.c: In function 'kdb_dump_stack_on_cpu': kernel/debug/debug_core.c:452:17: warning: array subscript is outside array +bounds [-Warray-bounds] if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { ~~~~~~~~~^~~~~ kernel/debug/debug_core.c:469:33: warning: array subscript is outside array +bounds [-Warray-bounds] kgdb_info[cpu].exception_state |= DCPU_WANT_BT; kernel/debug/debug_core.c:470:18: warning: array subscript is outside array +bounds [-Warray-bounds] while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) There is no bug here but there is scope to improve the code generation for non-SMP systems (whilst also silencing the warning). Reported-by: kbuild test robot Fixes: 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master") Signed-off-by: Daniel Thompson Link: https://lore.kernel.org/r/20191021101057.23861-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 70e86b4b4932..2b7c9b67931d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -444,7 +444,7 @@ setundefined: #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { - if (cpu == raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) { dump_stack(); return; } -- cgit From a713af394cf382a30dd28a1015cbe572f1b9ca75 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Oct 2019 02:50:01 +1100 Subject: cgroup: pids: use atomic64_t for pids->limit Because pids->limit can be changed concurrently (but we don't want to take a lock because it would be needlessly expensive), use atomic64_ts instead. Fixes: commit 49b786ea146f ("cgroup: implement the PIDs subsystem") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Aleksa Sarai Signed-off-by: Tejun Heo --- kernel/cgroup/pids.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 8e513a573fe9..138059eb730d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -45,7 +45,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). 
*/ atomic64_t counter; - int64_t limit; + atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > p->limit) + if (new > limit) goto revert; } @@ -277,7 +278,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); -- cgit From 3820729160440158a014add69cc0d371061a96b2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Oct 2019 17:18:11 -0700 Subject: bpf: Prepare btf_ctx_access for non raw_tp use case This patch makes a few changes to btf_ctx_access() to prepare it for non raw_tp use case where the attach_btf_id is not necessary a BTF_KIND_TYPEDEF. It moves the "btf_trace_" prefix check and typedef-follow logic to a new function "check_attach_btf_id()" which is called only once during bpf_check(). btf_ctx_access() only operates on a BTF_KIND_FUNC_PROTO type now. That should also be more efficient since it is done only one instead of every-time check_ctx_access() is called. "check_attach_btf_id()" needs to find the func_proto type from the attach_btf_id. It needs to store the result into the newly added prog->aux->attach_func_proto. func_proto btf type has no name, so a proper name should be stored into "attach_func_name" also. 
v2: - Move the "btf_trace_" check to an earlier verifier phase (Alexei) Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191025001811.1718491-1-kafai@fb.com --- kernel/bpf/btf.c | 73 +++++++++--------------------------------------- kernel/bpf/syscall.c | 4 +-- kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++++++++++- kernel/trace/bpf_trace.c | 2 ++ 4 files changed, 67 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7557af39756..128d89601d73 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -336,16 +336,6 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } -static bool btf_type_is_func(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; -} - -static bool btf_type_is_func_proto(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; -} - static bool btf_type_nosize(const struct btf_type *t) { return btf_type_is_void(t) || btf_type_is_fwd(t) || @@ -377,16 +367,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_ptr(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; -} - -static bool btf_type_is_int(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_INT; -} - static bool btf_type_is_var(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; @@ -3442,54 +3422,27 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + const struct btf_type *t = prog->aux->attach_func_proto; + const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; - u32 btf_id = prog->aux->attach_btf_id; const struct btf_param *args; - const struct btf_type *t; - const char prefix[] = "btf_trace_"; - const char *tname; u32 nr_args, arg; - if (!btf_id) - return true; - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return false; - } - - t = btf_type_by_id(btf_vmlinux, btf_id); - if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { - bpf_log(log, "btf_id is invalid\n"); - return false; - } - - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (strncmp(prefix, tname, sizeof(prefix) - 1)) { - bpf_log(log, "btf_id points to wrong type name %s\n", tname); - return false; - } - tname += sizeof(prefix) - 1; - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - return false; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return false; - if (off % 8) { - bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n", + bpf_log(log, "func '%s' offset %d is not multiple of 8\n", tname, off); return false; } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* skip first 'void *__data' argument in btf_trace_##name typedef */ - args++; - nr_args = btf_type_vlen(t) - 1; + nr_args = btf_type_vlen(t); + if (prog->aux->attach_btf_trace) { + /* skip first 'void *__data' argument in btf_trace_##name typedef */ + args++; + nr_args--; + } if (arg >= nr_args) { - bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n", + bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg); return false; } @@ -3503,7 +3456,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; if (!btf_type_is_ptr(t)) { bpf_log(log, - "raw_tp 
'%s' arg%d '%s' has type %s. Only pointer access is allowed\n", + "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf_vmlinux, t->name_off), btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3526,11 +3479,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, t = btf_type_by_id(btf_vmlinux, t->type); if (!btf_type_is_struct(t)) { bpf_log(log, - "raw_tp '%s' arg%d type %s is not a struct\n", + "func '%s' arg%d type %s is not a struct\n", tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } - bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n", + bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], __btf_name_by_offset(btf_vmlinux, t->name_off)); return true; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 16ea3c0db4f6..ff5225759553 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1848,9 +1848,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } /* raw_tp name is taken from type name instead */ - tp_name = kernel_type_name(prog->aux->attach_btf_id); - /* skip the prefix */ - tp_name += sizeof("btf_trace_") - 1; + tp_name = prog->aux->attach_func_name; } else { if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 556e82f8869b..c59778c0fc4d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9372,6 +9372,52 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_attach_btf_id(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + u32 btf_id = prog->aux->attach_btf_id; + const struct btf_type *t; + const char *tname; + + if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) { + const char prefix[] = "btf_trace_"; + + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t) { + verbose(env, "attach_btf_id %u is invalid\n", btf_id); + return -EINVAL; + } + if (!btf_type_is_typedef(t)) { + verbose(env, "attach_btf_id %u is not a typedef\n", + btf_id); + return -EINVAL; + } + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) { + verbose(env, "attach_btf_id %u points to wrong type name %s\n", + btf_id, tname); + return -EINVAL; + } + tname += sizeof(prefix) - 1; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + + /* remember two read only pointers that are valid for + * the life time of the kernel + */ + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + prog->aux->attach_btf_trace = true; + } + return 0; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -9435,9 +9481,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, /* Either gcc or pahole or kernel are broken. 
*/ verbose(env, "in-kernel BTF is malformed\n"); ret = PTR_ERR(btf_vmlinux); - goto err_unlock; + goto skip_full_check; } + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c3240898cc44..571c25d60710 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1080,6 +1080,8 @@ static bool raw_tp_prog_is_valid_access(int off, int size, return false; if (off % size != 0) return false; + if (!prog->aux->attach_btf_id) + return true; return btf_ctx_access(off, size, type, prog, info); } -- cgit From 5153faac18d293fc7abb19ff7034683fbcd82dc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Oct 2019 12:03:51 -0700 Subject: cgroup: remove cgroup_enable_task_cg_lists() optimization cgroup_enable_task_cg_lists() is used to lazyily initialize task cgroup associations on the first use to reduce fork / exit overheads on systems which don't use cgroup. Unfortunately, locking around it has never been actually correct and its value is dubious given how the vast majority of systems use cgroup right away from boot. This patch removes the optimization. For now, replace the cg_list based branches with WARN_ON_ONCE()'s to be on the safe side. We can simplify the logic further in the future. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 184 +++++++++++-------------------------------------- kernel/cgroup/cpuset.c | 2 - 2 files changed, 39 insertions(+), 147 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8b1c4fd47a7a..cf32c0c7a45d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1883,65 +1883,6 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static bool use_task_css_set_links __read_mostly; - -void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - do_each_thread(g, p) { - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - * - * Interrupts were already disabled while acquiring - * the css_set_lock, so we do not need to disable it - * again when acquiring the sighand->siglock here. 
- */ - spin_lock(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) { - struct css_set *cset = task_css_set(p); - - if (!css_set_populated(cset)) - css_set_update_populated(cset, true); - list_add_tail(&p->cg_list, &cset->tasks); - get_css_set(cset); - cset->nr_tasks++; - } - spin_unlock(&p->sighand->siglock); - } while_each_thread(g, p); -out_unlock: - spin_unlock_irq(&css_set_lock); - read_unlock(&tasklist_lock); -} - static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2187,13 +2128,6 @@ static int cgroup_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; @@ -2371,9 +2305,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (task->flags & PF_EXITING) return; - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - return; + /* cgroup_threadgroup_rwsem protects racing against forks */ + WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) @@ -4586,9 +4519,6 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { - /* no one should try to iterate before mounting cgroups */ - WARN_ON_ONCE(!use_task_css_set_links); - memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); @@ -6022,62 +5952,38 @@ void cgroup_cancel_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; + struct css_set *cset; int i; + spin_lock_irq(&css_set_lock); + + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + /* - * This may race against cgroup_enable_task_cg_lists(). As that - * function sets use_task_css_set_links before grabbing - * tasklist_lock and we just went through tasklist_lock to add - * @child, it's guaranteed that either we see the set - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees - * @child during its iteration. - * - * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_lock guarantees both that the - * association is stable, and, on completion of the parent's - * migration, @child is visible in the source of migration or - * already in the destination cgroup. This guarantee is necessary - * when implementing operations which need to migrate all tasks of - * a cgroup to another. - * - * Note that if we lose to cgroup_enable_task_cg_lists(), @child - * will remain in init_css_set. This is safe because all tasks are - * in the init_css_set before cg_links is enabled and there's no - * operation which transfers all tasks out of init_css_set. + * If the cgroup has to be frozen, the new task has too. Let's set + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the + * frozen state. 
*/ - if (use_task_css_set_links) { - struct css_set *cset; - - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); /* current is @child's parent */ - if (list_empty(&child->cg_list)) { - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); - } + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient switch + * from the frozen state and back. */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. - */ - } - - spin_unlock_irq(&css_set_lock); } + spin_unlock_irq(&css_set_lock); + /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() @@ -6101,29 +6007,19 @@ void cgroup_exit(struct task_struct *tsk) struct css_set *cset; int i; - /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us (thanks to cgroup_threadgroup_rwsem), we can check css_set - * and cg_list without synchronization. - */ - cset = task_css_set(tsk); + spin_lock_irq(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); - css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; + WARN_ON_ONCE(list_empty(&tsk->cg_list)); + cset = task_css_set(tsk); + css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; - WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); - } else { - /* Take reference to avoid freeing init_css_set in cgroup_free, - * see cgroup_fork(). 
*/ - get_css_set(cset); - } + spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { @@ -6140,12 +6036,10 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c52bc91f882b..faff8f99e8f2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -928,8 +928,6 @@ static void rebuild_root_domains(void) lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); - cgroup_enable_task_cg_lists(); - rcu_read_lock(); /* -- cgit From c34c78dfc1fc68a1f5403f996de8ca62f298d7b2 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Wed, 23 Oct 2019 21:27:34 +0800 Subject: audit: remove redundant condition check in kauditd_thread() Warning is found by the code analysis tool: "the condition 'if(ac && rc < 0)' is redundant: ac" The @ac variable has been checked before. It can't be a null pointer here, so remove the redundant condition check. Signed-off-by: Yunfeng Ye Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d75485aa25ff..8e09f0f55b4b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -830,7 +830,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; @@ -840,7 +840,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; -- cgit From 8c7e975667fbc3b7c816119dd56104739899f125 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 25 Oct 2019 15:16:36 +0300 Subject: perf/core: Start rejecting the syscall with attr.__reserved_2 set Commit: 1a5941312414c ("perf: Add wakeup watermark control to the AUX area") added attr.__reserved_2 padding, but forgot to add an ABI check to reject attributes with this field set. Fix that. 
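A practical consequence of the stricter check is that user space has to keep every reserved field of struct perf_event_attr zeroed; the usual idiom is to memset() the whole structure before filling it in. A minimal sketch of plain perf_event_open() usage (not part of the patch):

/* Zero the whole attr so reserved padding such as __reserved_1 and
 * __reserved_2 cannot carry garbage and trip -EINVAL. */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));		/* clears all reserved fields */
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0 /* flags */);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}

With the check in place, an attribute carrying stale bytes in __reserved_2 now fails with -EINVAL instead of being silently accepted.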
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: mathieu.poirier@linaro.org Link: https://lkml.kernel.org/r/20191025121636.75182-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bb3748d29b04..aec8dba2bea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10635,7 +10635,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1) + if (attr->__reserved_1 || attr->__reserved_2) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit From c2b98a8661514f29a44ebd0925cf4b1503beb48c Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:13:56 +0300 Subject: perf/x86: Synchronize PMU task contexts on optimized context switches Install Intel specific PMU task context synchronization adapter and extend optimized context switch path with PMU specific task context synchronization to fix LBR callstack virtualization on context switches. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/9c6445a9-bdba-ef03-3859-f1f91198f27a@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0940c8810be0..f48d38b55e7b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3204,10 +3204,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + struct pmu *pmu = ctx->pmu; + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (pmu->swap_task_ctx) + pmu->swap_task_ctx(ctx, next_ctx); + else + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); /* * RCU_INIT_POINTER here is safe because we've not -- cgit From db0503e4f6751f2c719d002ba1becd1811633e6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Oct 2019 16:02:39 +0200 Subject: perf/core: Optimize perf_install_in_event() Andi reported that when creating a lot of events, a lot of time is spent in IPIs and asked if it would be possible to elide some of that. Now when, as for example the perf-tool always does, events are created disabled, then these events will not need to be scheduled when added to the context (they're still disable) and therefore the IPI is not required -- except for the very first event, that will need to set ctx->is_active. 
( It might be possible to set ctx->is_active remotely for cpu_ctx, but we really need the IPI for task_ctx, so lets not make that distinction. ) Also use __perf_effective_state() since group events depend on the state of the leader, if the leader is OFF, the whole group is OFF. So when sibling events are created enabled (XXX check tool) then we only need a single IPI to create and enable the whole group (+ that initial IPI to initialize the context). Suggested-by: Andi Kleen Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f48d38b55e7b..ea70ca614987 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2666,6 +2666,25 @@ perf_install_in_context(struct perf_event_context *ctx, */ smp_store_release(&event->ctx, ctx); + /* + * perf_event_attr::disabled events will not run and can be initialized + * without IPI. Except when this is the first event for the context, in + * that case we need the magic of the IPI to set ctx->is_active. + * + * The IOC_ENABLE that is sure to follow the creation of a disabled + * event will issue the IPI and reprogram the hardware. + */ + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + raw_spin_lock_irq(&ctx->lock); + if (ctx->task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; -- cgit From 66d258c5b048840991de49697264af75f5b09def Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Oct 2019 20:31:03 +0200 Subject: perf/core: Optimize perf_init_event() Andi reported that he was hitting the linear search in perf_init_event() a lot. Make more agressive use of the IDR lookup to avoid hitting the linear search. With exception of PERF_TYPE_SOFTWARE (which relies on a hideous hack), we can put everything in the IDR. On top of that, we can alias TYPE_HARDWARE and TYPE_HW_CACHE to TYPE_RAW on the lookup side. This greatly reduces the chances of hitting the linear search. 
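The slightly subtle part of the perf_pmu_register() hunk below is that a single idr_alloc() call covers both the "caller wants this exact type" case and the "hand me any free type" case. A minimal, hypothetical sketch of that pattern in isolation (illustrative names, not the perf code):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDR(example_idr);

/*
 * Register @obj either under the exact id @requested_id (>= 0) or under
 * the lowest free id at or above @dynamic_floor.
 */
static int example_register(void *obj, int requested_id, int dynamic_floor)
{
	int start = (requested_id >= 0) ? requested_id : dynamic_floor;
	int id;

	id = idr_alloc(&example_idr, obj, start, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	/* An exact request must land exactly on that id. */
	if (requested_id >= 0 && id != requested_id) {
		idr_remove(&example_idr, id);
		return -EBUSY;
	}

	return id;
}

perf_pmu_register() follows the same pattern but only WARN()s on a mismatch, since a clash there indicates a kernel-side bug rather than a recoverable condition.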
Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea70ca614987..4d67c5d35c13 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10080,7 +10080,7 @@ static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { - int cpu, ret; + int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; @@ -10093,12 +10093,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto skip_type; pmu->name = name; - if (type < 0) { - type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); - if (type < 0) { - ret = type; + if (type != PERF_TYPE_SOFTWARE) { + if (type >= 0) + max = type; + + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); + if (ret < 0) goto free_pdc; - } + + WARN_ON(type >= 0 && ret != type); + + type = ret; } pmu->type = type; @@ -10188,7 +10193,7 @@ free_dev: put_device(pmu->dev); free_idr: - if (pmu->type >= PERF_TYPE_MAX) + if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); free_pdc: @@ -10210,7 +10215,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) + if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running) { if (pmu->nr_addr_filters) @@ -10280,9 +10285,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { + int idx, type, ret; struct pmu *pmu; - int idx; - int ret; idx = srcu_read_lock(&pmus_srcu); @@ -10295,12 +10299,27 @@ static struct pmu *perf_init_event(struct perf_event *event) } rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); + /* + * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * are often aliases for PERF_TYPE_RAW. + */ + type = event->attr.type; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) + type = PERF_TYPE_RAW; + +again: + pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { ret = perf_try_init_event(pmu, event); + if (ret == -ENOENT && event->attr.type != type) { + type = event->attr.type; + goto again; + } + if (ret) pmu = ERR_PTR(ret); + goto unlock; } -- cgit From d44f821b0e13275735e8f3fe4db8703b45f05d52 Mon Sep 17 00:00:00 2001 From: "Liang, Kan" Date: Tue, 22 Oct 2019 11:13:09 +0200 Subject: perf/core: Optimize perf_init_event() for TYPE_SOFTWARE Andi reported that he was hitting the linear search in perf_init_event() a lot. Now that all !TYPE_SOFTWARE events should hit the IDR, make sure the TYPE_SOFTWARE events are at the head of the list such that we'll quickly find the right PMU (provided a valid event was given). 
Signed-off-by: Liang, Kan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d67c5d35c13..cfd89b4a02d8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10180,7 +10180,16 @@ got_cpu_context: if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; - list_add_rcu(&pmu->entry, &pmus); + /* + * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, + * since these cannot be in the IDR. This way the linear search + * is fast, provided a valid software event is provided. + */ + if (type == PERF_TYPE_SOFTWARE || !name) + list_add_rcu(&pmu->entry, &pmus); + else + list_add_tail_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: -- cgit From 53b63136e81220cb2f8b541c03a1df9199896821 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:24 +0100 Subject: kdb: Tidy up code to handle escape sequences kdb_read_get_key() has extremely complex break/continue control flow managed by state variables and is very hard to review or modify. In particular the way the escape sequence handling interacts with the general control flow is hard to follow. Separate out the escape key handling, without changing the control flow. This makes the main body of the code easier to review. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-2-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 128 ++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 3a5184eb6977..cfc054fd8097 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -49,6 +49,65 @@ static int kgdb_transition_check(char *buffer) return 0; } +/** + * kdb_handle_escape() - validity check on an accumulated escape sequence. + * @buf: Accumulated escape characters to be examined. Note that buf + * is not a string, it is an array of characters and need not be + * nil terminated. + * @sz: Number of accumulated escape characters. + * + * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete, + * otherwise it returns a mapped key value to pass to the upper layers. 
+ */ +static int kdb_handle_escape(char *buf, size_t sz) +{ + char *lastkey = buf + sz - 1; + + switch (sz) { + case 1: + if (*lastkey == '\e') + return 0; + break; + + case 2: /* \e */ + if (*lastkey == '[') + return 0; + break; + + case 3: + switch (*lastkey) { + case 'A': /* \e[A, up arrow */ + return 16; + case 'B': /* \e[B, down arrow */ + return 14; + case 'C': /* \e[C, right arrow */ + return 6; + case 'D': /* \e[D, left arrow */ + return 2; + case '1': /* \e[<1,3,4>], may be home, del, end */ + case '3': + case '4': + return 0; + } + break; + + case 4: + if (*lastkey == '~') { + switch (buf[2]) { + case '1': /* \e[1~, home */ + return 1; + case '3': /* \e[3~, del */ + return 4; + case '4': /* \e[4~, end */ + return 5; + } + } + break; + } + + return -1; +} + static int kdb_read_get_key(char *buffer, size_t bufsize) { #define ESCAPE_UDELAY 1000 @@ -102,68 +161,15 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) escape_delay = 2; continue; } - if (ped - escape_data == 1) { - /* \e */ - continue; - } else if (ped - escape_data == 2) { - /* \e */ - if (key != '[') - escape_delay = 2; - continue; - } else if (ped - escape_data == 3) { - /* \e[ */ - int mapkey = 0; - switch (key) { - case 'A': /* \e[A, up arrow */ - mapkey = 16; - break; - case 'B': /* \e[B, down arrow */ - mapkey = 14; - break; - case 'C': /* \e[C, right arrow */ - mapkey = 6; - break; - case 'D': /* \e[D, left arrow */ - mapkey = 2; - break; - case '1': /* dropthrough */ - case '3': /* dropthrough */ - /* \e[<1,3,4>], may be home, del, end */ - case '4': - mapkey = -1; - break; - } - if (mapkey != -1) { - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - } - continue; - } else if (ped - escape_data == 4) { - /* \e[<1,3,4> */ - int mapkey = 0; - if (key == '~') { - switch (escape_data[2]) { - case '1': /* \e[1~, home */ - mapkey = 1; - break; - case '3': /* \e[3~, del */ - mapkey = 4; - break; - case '4': /* \e[4~, end */ - mapkey = 5; - break; - } - } - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - continue; + + key = kdb_handle_escape(escape_data, ped - escape_data); + if (key > 0) { + escape_data[0] = key; + escape_data[1] = '\0'; } + if (key) + escape_delay = 2; + continue; } break; /* A key to process */ } -- cgit From d04213af90935d8b247c1327c9ea142fc037165f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:25 +0100 Subject: kdb: Simplify code to fetch characters from console Currently kdb_read_get_key() contains complex control flow that, on close inspection, turns out to be unnecessary. In particular: 1. It is impossible to enter the branch conditioned on (escape_delay == 1) except when the loop enters with (escape_delay == 2) allowing us to combine the branches. 2. Most of the code conditioned on (escape_delay == 2) simply modifies local data and then breaks out of the loop causing the function to return escape_data[0]. 3. Based on #2 there is not actually any need to ever explicitly set escape_delay to 2 because we it is much simpler to directly return escape_data[0] instead. 4. escape_data[0] is, for all but one exit path, known to be '\e'. Simplify the code based on these observations. 
There is a subtle (and harmless) change of behaviour resulting from this simplification: instead of letting the escape timeout after ~1998 milliseconds we now timeout after ~2000 milliseconds Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-3-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index cfc054fd8097..a92ceca29637 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -124,25 +124,18 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) touch_nmi_watchdog(); f = &kdb_poll_funcs[0]; } - if (escape_delay == 2) { - *ped = '\0'; - ped = escape_data; - --escape_delay; - } - if (escape_delay == 1) { - key = *ped++; - if (!*ped) - --escape_delay; - break; - } + key = (*f)(); + if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); - --escape_delay; + if (--escape_delay == 0) + return '\e'; } continue; } + if (bufsize <= 2) { if (key == '\r') key = '\n'; @@ -150,27 +143,24 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) *buffer = '\0'; return -1; } + if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; f_escape = f; } if (escape_delay) { - *ped++ = key; - if (f_escape != f) { - escape_delay = 2; - continue; - } + if (f_escape != f) + return '\e'; + *ped++ = key; key = kdb_handle_escape(escape_data, ped - escape_data); - if (key > 0) { - escape_data[0] = key; - escape_data[1] = '\0'; - } - if (key) - escape_delay = 2; - continue; + if (key < 0) + return '\e'; + if (key == 0) + continue; } + break; /* A key to process */ } return key; -- cgit From 4f27e824bf83dfc2f6dc1a54fae419be7cd335af Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:26 +0100 Subject: kdb: Remove special case logic from kdb_read() kdb_read() contains special case logic to force it exit after reading a single character. We can remove all the special case logic by directly calling the function to read a single character instead. This also allows us to tidy up the function prototype which, because it now matches getchar(), we can also rename in order to make its role clearer. This does involve some extra code to handle btaprompt properly but we don't mind the new lines of code here because the old code had some interesting problems (bad newline handling, treating unexpected characters like ). 
Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-4-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_bt.c | 22 +++++++++------ kernel/debug/kdb/kdb_io.c | 61 +++++++++++++++++++----------------------- kernel/debug/kdb/kdb_private.h | 1 + 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 0e94efe07b72..4af48ac53625 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -75,9 +75,10 @@ static void kdb_show_stack(struct task_struct *p, void *addr) static int kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { - char buffer[2]; - if (kdb_getarea(buffer[0], (unsigned long)p) || - kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + char ch; + + if (kdb_getarea(ch, (unsigned long)p) || + kdb_getarea(ch, (unsigned long)(p+1)-1)) return KDB_BADADDR; if (!kdb_task_state(p, mask)) return 0; @@ -85,12 +86,17 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) kdb_ps1(p); kdb_show_stack(p, NULL); if (btaprompt) { - kdb_getstr(buffer, sizeof(buffer), - "Enter to end, to continue:"); - if (buffer[0] == 'q') { - kdb_printf("\n"); + kdb_printf("Enter to end, or to continue:"); + do { + ch = kdb_getchar(); + } while (!strchr("\r\n q", ch)); + kdb_printf("\n"); + + /* reset the pager */ + kdb_nextline = 1; + + if (ch == 'q') return 1; - } } touch_nmi_watchdog(); return 0; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index a92ceca29637..9b6933d585b5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -108,7 +108,22 @@ static int kdb_handle_escape(char *buf, size_t sz) return -1; } -static int kdb_read_get_key(char *buffer, size_t bufsize) +/** + * kdb_getchar() - Read a single character from a kdb console (or consoles). + * + * Other than polling the various consoles that are currently enabled, + * most of the work done in this function is dealing with escape sequences. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources. Escape sequence processing + * has to be done as states in the polling loop. + * + * Return: The key pressed or a control code derived from an escape sequence. + */ +char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ @@ -126,7 +141,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) } key = (*f)(); - if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); @@ -136,14 +150,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) continue; } - if (bufsize <= 2) { - if (key == '\r') - key = '\n'; - *buffer++ = key; - *buffer = '\0'; - return -1; - } - if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; @@ -184,17 +190,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) * function. It is not reentrant - it relies on the fact * that while kdb is running on only one "master debug" cpu. * Remarks: - * - * The buffer size must be >= 2. A buffer size of 2 means that the caller only - * wants a single key. 
- * - * An escape key could be the start of a vt100 control sequence such as \e[D - * (left arrow) or it could be a character in its own right. The standard - * method for detecting the difference is to wait for 2 seconds to see if there - * are any other characters. kdb is complicated by the lack of a timer service - * (interrupts are off), by multiple input sources and by the need to sometimes - * return after just one key. Escape sequence processing has to be done as - * states in the polling loop. + * The buffer size must be >= 2. */ static char *kdb_read(char *buffer, size_t bufsize) @@ -229,9 +225,7 @@ static char *kdb_read(char *buffer, size_t bufsize) *cp = '\0'; kdb_printf("%s", buffer); poll_again: - key = kdb_read_get_key(buffer, bufsize); - if (key == -1) - return buffer; + key = kdb_getchar(); if (key != 9) tab = 0; switch (key) { @@ -742,7 +736,7 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline >= linecount) { - char buf1[16] = ""; + char ch; /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -777,39 +771,38 @@ kdb_printit: if (logging) printk("%s", moreprompt); - kdb_read(buf1, 2); /* '2' indicates to return - * immediately after getting one key. */ + ch = kdb_getchar(); kdb_nextline = 1; /* Really set output line 1 */ /* empty and reset the buffer: */ kdb_buffer[0] = '\0'; next_avail = kdb_buffer; size_avail = sizeof(kdb_buffer); - if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + if ((ch == 'q') || (ch == 'Q')) { /* user hit q or Q */ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ KDB_STATE_CLEAR(PAGER); /* end of command output; back to normal mode */ kdb_grepping_flag = 0; kdb_printf("\n"); - } else if (buf1[0] == ' ') { + } else if (ch == ' ') { kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '\n') { + } else if (ch == '\n' || ch == '\r') { kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '/' && !kdb_grepping_flag) { + } else if (ch == '/' && !kdb_grepping_flag) { kdb_printf("\r"); kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, kdbgetenv("SEARCHPROMPT") ?: "search> "); *strchrnul(kdb_grep_string, '\n') = '\0'; kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] && buf1[0] != '\n') { - /* user hit something other than enter */ + } else if (ch) { + /* user hit something unexpected */ suspend_grep = 1; /* for this recursion */ - if (buf1[0] != '/') + if (ch != '/') kdb_printf( "\nOnly 'q', 'Q' or '/' are processed at " "more prompt, input ignored\n"); diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2118d8258b7c..55d052061ef9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -210,6 +210,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); +extern char kdb_getchar(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); -- cgit From cdca8d8900dd33ce6b8b526e247d2a6009d05de0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:27 +0100 Subject: kdb: Improve handling of characters from different input sources Currently if an escape timer is interrupted by a character from a different input source then 
the new character is discarded and the function returns '\e' (which will be discarded by the level above). It is hard to see why this would ever be the desired behaviour. Fix this to return the new character rather than the '\e'. This is a bigger refactor than might be expected because the new character needs to go through escape sequence detection. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-5-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b6933d585b5..f794c0ca4557 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -127,10 +127,10 @@ char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ - char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ - char *ped = escape_data; + char buf[4]; /* longest vt100 escape sequence is 4 bytes */ + char *pbuf = buf; int escape_delay = 0; - get_char_func *f, *f_escape = NULL; + get_char_func *f, *f_prev = NULL; int key; for (f = &kdb_poll_funcs[0]; ; ++f) { @@ -150,26 +150,26 @@ char kdb_getchar(void) continue; } - if (escape_delay == 0 && key == '\e') { + /* + * When the first character is received (or we get a change + * input source) we set ourselves up to handle an escape + * sequences (just in case). + */ + if (f_prev != f) { + f_prev = f; + pbuf = buf; escape_delay = ESCAPE_DELAY; - ped = escape_data; - f_escape = f; - } - if (escape_delay) { - if (f_escape != f) - return '\e'; - - *ped++ = key; - key = kdb_handle_escape(escape_data, ped - escape_data); - if (key < 0) - return '\e'; - if (key == 0) - continue; } - break; /* A key to process */ + *pbuf++ = key; + key = kdb_handle_escape(buf, pbuf - buf); + if (key < 0) /* no escape sequence; return first character */ + return buf[0]; + if (key > 0) + return key; } - return key; + + unreachable(); } /* -- cgit From c58ff643763c78bef12874ee39995c9f7f987bc2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:28 +0100 Subject: kdb: Tweak escape handling for vi users Currently if sequences such as "\ehelp\r" are delivered to the console then the h gets eaten by the escape handling code. Since pressing escape becomes something of a nervous twitch for vi users (and that escape doesn't have much effect at a shell prompt) it is more helpful to emit the 'h' than the '\e'. We don't simply choose to emit the final character for all escape sequences since that will do odd things for unsupported escape sequences (in other words we retain the existing behaviour once we see '\e['). Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-6-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index f794c0ca4557..8bcdded5d61f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -163,8 +163,8 @@ char kdb_getchar(void) *pbuf++ = key; key = kdb_handle_escape(buf, pbuf - buf); - if (key < 0) /* no escape sequence; return first character */ - return buf[0]; + if (key < 0) /* no escape sequence; return best character */ + return buf[pbuf - buf == 2 ? 
1 : 0]; if (key > 0) return key; } -- cgit From 66e4c33b51bc515ca803c0948cf1525b53ffd631 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Aug 2019 16:14:00 -0700 Subject: rcu: Force tick on for nohz_full CPUs not reaching quiescent states CPUs running for long time periods in the kernel in nohz_full mode might leave the scheduling-clock interrupt disabled for then full duration of their in-kernel execution. This can (among other things) delay grace periods. This commit therefore forces the tick back on for any nohz_full CPU that is failing to pass through a quiescent state upon return from interrupt, which the resched_cpu() will induce. Reported-by: Joel Fernandes [ paulmck: Clear ->rcu_forced_tick as reported by Joel Fernandes testing. ] [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++------- kernel/rcu/tree.h | 1 + 2 files changed, 32 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 66354ef776aa..9fda33864ede 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -651,6 +651,12 @@ static __always_inline void rcu_nmi_exit_common(bool irq) */ if (rdp->dynticks_nmi_nesting != 1) { trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + if (tick_nohz_full_cpu(rdp->cpu) && + rdp->dynticks_nmi_nesting == 2 && + rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_MASK_RCU); + } WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; @@ -886,6 +892,18 @@ void rcu_irq_enter_irqson(void) local_irq_restore(flags); } +/* + * If the scheduler-clock interrupt was enabled on a nohz_full CPU + * in order to get to a quiescent state, disable it. 
+ */ +void rcu_disable_tick_upon_qs(struct rcu_data *rdp) +{ + if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { + tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + rdp->rcu_forced_tick = false; + } +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -1980,6 +1998,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) if (!offloaded) needwake = rcu_accelerate_cbs(rnp, rdp); + rcu_disable_tick_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) @@ -2265,6 +2284,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) int cpu; unsigned long flags; unsigned long mask; + struct rcu_data *rdp; struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { @@ -2289,8 +2309,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(&rcu_data, cpu))) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { mask |= bit; + rcu_disable_tick_upon_qs(rdp); + } } } if (mask != 0) { @@ -2318,7 +2341,7 @@ void rcu_force_quiescent_state(void) rnp = __this_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || - !raw_spin_trylock(&rnp->fqslock); + !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) @@ -2851,7 +2874,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); complete(&rcu_state.barrier_completion); } else { rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); @@ -2875,7 +2898,7 @@ static void rcu_barrier_func(void *unused) } else { debug_rcu_head_unqueue(&rdp->barrier_head); rcu_barrier_trace(TPS("IRQNQ"), -1, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); } rcu_nocb_unlock(rdp); } @@ -2902,7 +2925,7 @@ void rcu_barrier(void) /* Did someone else do our work for us? */ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { rcu_barrier_trace(TPS("EarlyExit"), -1, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rcu_state.barrier_mutex); return; @@ -2934,11 +2957,11 @@ void rcu_barrier(void) continue; if (rcu_segcblist_n_cbs(&rdp->cblist)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); } else { rcu_barrier_trace(TPS("OnlineNQ"), cpu, - rcu_state.barrier_sequence); + rcu_state.barrier_sequence); } } put_online_cpus(); @@ -3160,6 +3183,7 @@ void rcu_cpu_starting(unsigned int cpu) rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + rcu_disable_tick_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c612f306fe89..055c31781d3a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -181,6 +181,7 @@ struct rcu_data { atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. 
*/ + bool rcu_forced_tick; /* Forced tick to provide QS. */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ -- cgit From b200a0489517d9e5a52e983183e890f573454ebd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Aug 2019 13:24:49 -0700 Subject: rcu: Force nohz_full tick on upon irq enter instead of exit There is interrupt-exit code that forces on the tick for nohz_full CPUs failing to respond to the current grace period in a timely fashion. However, this code must compare ->dynticks_nmi_nesting to the value 2 in the interrupt-exit fastpath. This commit therefore moves this code to the interrupt-entry fastpath, where a lighter-weight comparison to zero may be used. Reported-by: Joel Fernandes [ paulmck: Apply Joel Fernandes TICK_DEP_MASK_RCU->TICK_DEP_BIT_RCU fix. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9fda33864ede..8dc878406c71 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -651,12 +651,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq) */ if (rdp->dynticks_nmi_nesting != 1) { trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); - if (tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == 2 && - rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_MASK_RCU); - } WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; @@ -831,6 +825,11 @@ static __always_inline void rcu_nmi_enter_common(bool irq) rcu_cleanup_after_idle(); incby = 1; + } else if (tick_nohz_full_cpu(rdp->cpu) && + rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && + rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, -- cgit From 516e5ae0c94016294d3ef175454215b235d03945 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 5 Sep 2019 10:26:41 -0700 Subject: rcu: Reset CPU hints when reporting a quiescent state In some cases, tracing shows that need_heavy_qs is still set even though urgent_qs was cleared upon reporting of a quiescent state. One such case is when the softirq reports that a CPU has passed quiescent state. Commit 671a63517cf9 ("rcu: Avoid unnecessary softirq when system is idle") fixed a bug where core_needs_qs was not being cleared. In order to avoid running into similar situations with the urgent-grace-period flags, this commit causes rcu_disable_urgency_upon_qs(), previously rcu_disable_tick_upon_qs(), to clear the urgency hints, ->rcu_urgent_qs and ->rcu_need_heavy_qs. Note that it is possible for CPUs to go offline with these urgency hints still set. This is handled because rcu_disable_urgency_upon_qs() is also invoked during the online process. Because these hints can be cleared both by the corresponding CPU and by the grace-period kthread, this commit also adds a number of READ_ONCE() and WRITE_ONCE() calls. Tested overnight with rcutorture running for 60 minutes on all configurations of RCU. Signed-off-by: "Joel Fernandes (Google)" [ paulmck: Clear urgency flags in rcu_disable_urgency_upon_qs(). 
] [ paulmck: Remove ->core_needs_qs from the set cleared at quiescent state. ] [ paulmck: Make rcu_disable_urgency_upon_qs static per kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8dc878406c71..82caca305cae 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -827,7 +827,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) incby = 1; } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { rdp->rcu_forced_tick = true; tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } @@ -892,11 +892,14 @@ void rcu_irq_enter_irqson(void) } /* - * If the scheduler-clock interrupt was enabled on a nohz_full CPU - * in order to get to a quiescent state, disable it. + * If any sort of urgency was applied to the current CPU (for example, + * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order + * to get to a quiescent state, disable it. */ -void rcu_disable_tick_upon_qs(struct rcu_data *rdp) +static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + WRITE_ONCE(rdp->rcu_urgent_qs, false); + WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); rdp->rcu_forced_tick = false; @@ -1997,7 +2000,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) if (!offloaded) needwake = rcu_accelerate_cbs(rnp, rdp); - rcu_disable_tick_upon_qs(rdp); + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) @@ -2311,7 +2314,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) rdp = per_cpu_ptr(&rcu_data, cpu); if (f(rdp)) { mask |= bit; - rcu_disable_tick_upon_qs(rdp); + rcu_disable_urgency_upon_qs(rdp); } } } @@ -3182,7 +3185,7 @@ void rcu_cpu_starting(unsigned int cpu) rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ - rcu_disable_tick_upon_qs(rdp); + rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { -- cgit From ed93dfc6bc0084485ccad1ff6bd2ea81ab2c03cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Sep 2019 14:09:56 -0700 Subject: rcu: Confine ->core_needs_qs accesses to the corresponding CPU Commit 671a63517cf9 ("rcu: Avoid unnecessary softirq when system is idle") fixed a bug that could result in an indefinite number of unnecessary invocations of the RCU_SOFTIRQ handler at the trailing edge of a scheduler-clock interrupt. However, the fix introduced off-CPU stores to ->core_needs_qs. These writes did not conflict with the on-CPU stores because the CPU's leaf rcu_node structure's ->lock was held across all such stores. However, the loads from ->core_needs_qs were not promoted to READ_ONCE() and, worse yet, the code loading from ->core_needs_qs was written assuming that it was only ever updated by the corresponding CPU. So operation has been robust, but only by luck. This situation is therefore an accident waiting to happen. This commit therefore takes a different approach. 
Instead of clearing ->core_needs_qs from the grace-period kthread's force-quiescent-state processing, it modifies the rcu_pending() function to suppress the rcu_sched_clock_irq() function's call to invoke_rcu_core() if there is no grace period in progress. This avoids the infinite needless RCU_SOFTIRQ handlers while still keeping all accesses to ->core_needs_qs local to the corresponding CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 82caca305cae..0c8046bc5ec7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1989,7 +1989,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2819,6 +2818,7 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); */ static int rcu_pending(void) { + bool gp_in_progress; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2834,7 +2834,8 @@ static int rcu_pending(void) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) + gp_in_progress = rcu_gp_in_progress(); + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; /* Does this CPU have callbacks ready to invoke? */ @@ -2842,8 +2843,7 @@ static int rcu_pending(void) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && + if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || !rcu_segcblist_is_offloaded(&rdp->cblist)) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) -- cgit From dd7dafd1ad50aa9ed7958235431f243ea131ee7d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Sep 2019 03:39:22 -0700 Subject: rcu: Make kernel-mode nohz_full CPUs invoke the RCU core processing If a nohz_full CPU is idle or executing in userspace, it makes good sense to keep it out of RCU core processing. After all, the RCU grace-period kthread can see its quiescent states and all of its callbacks are offloaded, so there is nothing for RCU core processing to do. However, if a nohz_full CPU is executing in kernel space, the RCU grace-period kthread cannot do anything for it, so such a CPU must report its own quiescent states. This commit therefore makes nohz_full CPUs skip RCU core processing only if the scheduler-clock interrupt caught them in idle or in userspace. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c8046bc5ec7..4e6ae2699d2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -497,7 +497,7 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static int rcu_pending(void); +static int rcu_pending(int user); /* * Return the number of RCU GPs completed thus far for debug & stats. 
@@ -2267,7 +2267,7 @@ void rcu_sched_clock_irq(int user) __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); - if (rcu_pending()) + if (rcu_pending(user)) invoke_rcu_core(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); * CPU-local state are performed first. However, we must check for CPU * stalls first, else we might not get a chance. */ -static int rcu_pending(void) +static int rcu_pending(int user) { bool gp_in_progress; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -2829,8 +2829,8 @@ static int rcu_pending(void) if (rcu_nocb_need_deferred_wakeup(rdp)) return 1; - /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ - if (rcu_nohz_full_cpu()) + /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ + if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ -- cgit From cd1cb3350561d2bf544ddfef76fbf0b1c9c7178f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 23 Oct 2019 16:37:44 +0100 Subject: sched/topology: Don't try to build empty sched domains Turns out hotplugging CPUs that are in exclusive cpusets can lead to the cpuset code feeding empty cpumasks to the sched domain rebuild machinery. This leads to the following splat: Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 235 Comm: kworker/5:2 Not tainted 5.4.0-rc1-00005-g8d495477d62e #23 Hardware name: ARM Juno development board (r0) (DT) Workqueue: events cpuset_hotplug_workfn pstate: 60000005 (nZCv daif -PAN -UAO) pc : build_sched_domains (./include/linux/arch_topology.h:23 kernel/sched/topology.c:1898 kernel/sched/topology.c:1969) lr : build_sched_domains (kernel/sched/topology.c:1966) Call trace: build_sched_domains (./include/linux/arch_topology.h:23 kernel/sched/topology.c:1898 kernel/sched/topology.c:1969) partition_sched_domains_locked (kernel/sched/topology.c:2250) rebuild_sched_domains_locked (./include/linux/bitmap.h:370 ./include/linux/cpumask.h:538 kernel/cgroup/cpuset.c:955 kernel/cgroup/cpuset.c:978 kernel/cgroup/cpuset.c:1019) rebuild_sched_domains (kernel/cgroup/cpuset.c:1032) cpuset_hotplug_workfn (kernel/cgroup/cpuset.c:3205 (discriminator 2)) process_one_work (./arch/arm64/include/asm/jump_label.h:21 ./include/linux/jump_label.h:200 ./include/trace/events/workqueue.h:114 kernel/workqueue.c:2274) worker_thread (./include/linux/compiler.h:199 ./include/linux/list.h:268 kernel/workqueue.c:2416) kthread (kernel/kthread.c:255) ret_from_fork (arch/arm64/kernel/entry.S:1167) Code: f860dae2 912802d6 aa1603e1 12800000 (f8616853) The faulty line in question is: cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); and we're not checking the return value against nr_cpu_ids (we shouldn't have to!), which leads to the above. Prevent generate_sched_domains() from returning empty cpumasks, and add some assertion in build_sched_domains() to scream bloody murder if it happens again. The above splat was obtained on my Juno r0 with the following reproducer: $ cgcreate -g cpuset:asym $ cgset -r cpuset.cpus=0-3 asym $ cgset -r cpuset.mems=0 asym $ cgset -r cpuset.cpu_exclusive=1 asym $ cgcreate -g cpuset:smp $ cgset -r cpuset.cpus=4-5 smp $ cgset -r cpuset.mems=0 smp $ cgset -r cpuset.cpu_exclusive=1 smp $ cgset -r cpuset.sched_load_balance=0 . 
$ echo 0 > /sys/devices/system/cpu/cpu4/online $ echo 0 > /sys/devices/system/cpu/cpu5/online Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hannes@cmpxchg.org Cc: lizefan@huawei.com Cc: morten.rasmussen@arm.com Cc: qperret@google.com Cc: tj@kernel.org Cc: vincent.guittot@linaro.org Fixes: 05484e098448 ("sched/topology: Add SD_ASYM_CPUCAPACITY flag detection") Link: https://lkml.kernel.org/r/20191023153745.19515-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 3 ++- kernel/sched/topology.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c52bc91f882b..c87ee6412b36 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -798,7 +798,8 @@ static int generate_sched_domains(cpumask_var_t **domains, cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) continue; - if (is_sched_load_balance(cp)) + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; /* skip @cp's subtree if not a partition root */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b5667a273bf6..9318acf1d1fe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1948,7 +1948,7 @@ next_level: static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state; + enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -1956,6 +1956,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct sched_domain_topology_level *tl_asym; bool has_asym = false; + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; -- cgit From e284df705cf1eeedb5ec3a66ed82d17a64659150 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 23 Oct 2019 16:37:45 +0100 Subject: sched/topology: Allow sched_asym_cpucapacity to be disabled While the static key is correctly initialized as being disabled, it will remain forever enabled once turned on. This means that if we start with an asymmetric system and hotplug out enough CPUs to end up with an SMP system, the static key will remain set - which is obviously wrong. We should detect this and turn off things like misfit migration and capacity aware wakeups. As Quentin pointed out, having separate root domains makes this slightly trickier. We could have exclusive cpusets that create an SMP island - IOW, the domains within this root domain will not see any asymmetry. This means we can't just disable the key on domain destruction, we need to count how many asymmetric root domains we have. Consider the following example using Juno r0 which is 2+4 big.LITTLE, where two identical cpusets are created: they both span both big and LITTLE CPUs: asym0 asym1 [ ][ ] L L B L L B $ cgcreate -g cpuset:asym0 $ cgset -r cpuset.cpus=0,1,3 asym0 $ cgset -r cpuset.mems=0 asym0 $ cgset -r cpuset.cpu_exclusive=1 asym0 $ cgcreate -g cpuset:asym1 $ cgset -r cpuset.cpus=2,4,5 asym1 $ cgset -r cpuset.mems=0 asym1 $ cgset -r cpuset.cpu_exclusive=1 asym1 $ cgset -r cpuset.sched_load_balance=0 . (the CPU numbering may look odd because on the Juno LITTLEs are CPUs 0,3-5 and bigs are CPUs 1-2) If we make one of those SMP (IOW remove asymmetry) by e.g. 
hotplugging its big core, we would end up with an SMP cpuset and an asymmetric cpuset - the static key must remain set, because we still have one asymmetric root domain. With the above example, this could be done with: $ echo 0 > /sys/devices/system/cpu/cpu2/online Which would result in: asym0 asym1 [ ][ ] L L B L L When both SMP and asymmetric cpusets are present, all CPUs will observe sched_asym_cpucapacity being set (it is system-wide), but not all CPUs observe asymmetry in their sched domain hierarchy: per_cpu(sd_asym_cpucapacity, ) == per_cpu(sd_asym_cpucapacity, ) == NULL Change the simple key enablement to an increment, and decrement the key counter when destroying domains that cover asymmetric CPUs. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hannes@cmpxchg.org Cc: lizefan@huawei.com Cc: morten.rasmussen@arm.com Cc: qperret@google.com Cc: tj@kernel.org Cc: vincent.guittot@linaro.org Fixes: df054e8445a4 ("sched/topology: Add static_key for asymmetric CPU capacity optimizations") Link: https://lkml.kernel.org/r/20191023153745.19515-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9318acf1d1fe..49b835f1305f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2029,7 +2029,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (has_asym) - static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + static_branch_inc_cpuslocked(&sched_asym_cpucapacity); if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", @@ -2124,8 +2124,12 @@ int sched_init_domains(const struct cpumask *cpu_map) */ static void detach_destroy_domains(const struct cpumask *cpu_map) { + unsigned int cpu = cpumask_any(cpu_map); int i; + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) + static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); -- cgit From b8c96361402aa3e74ad48ceef18aed99153d8da8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 23 Oct 2019 21:56:30 +0100 Subject: sched/fair/util_est: Implement faster ramp-up EWMA on utilization increases The estimated utilization for a task: util_est = max(util_avg, est.enqueue, est.ewma) is defined based on: - util_avg: the PELT defined utilization - est.enqueued: the util_avg at the end of the last activation - est.ewma: a exponential moving average on the est.enqueued samples According to this definition, when a task suddenly changes its bandwidth requirements from small to big, the EWMA will need to collect multiple samples before converging up to track the new big utilization. This slow convergence towards bigger utilization values is not aligned to the default scheduler behavior, which is to optimize for performance. Moreover, the est.ewma component fails to compensate for temporarely utilization drops which spans just few est.enqueued samples. To let util_est do a better job in the scenario depicted above, change its definition by making util_est directly follow upward motion and only decay the est.ewma on downward. 
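The effect of the new rule is easy to see with a stand-alone simulation of the update arithmetic from the hunk further below. The 1/4 EWMA weight (a shift of 2) is an assumption made here for illustration, and the sample values are made up. With the fast-ramp behaviour the estimate follows the jump from 100 to 800 immediately and only uses the moving average to smooth the way back down, whereas the plain EWMA needs several activations in each direction.

#include <stdio.h>

#define WEIGHT_SHIFT 2	/* assumed 1/4 weight, for illustration only */

static unsigned long ewma_step(unsigned long ewma, unsigned long enqueued,
			       int fastup)
{
	long diff;

	if (fastup && ewma < enqueued)
		return enqueued;		/* follow upward motion directly */

	/* ewma = (3*ewma + enqueued) / 4, written as in the patch */
	diff = (long)enqueued - (long)ewma;
	ewma <<= WEIGHT_SHIFT;
	ewma += diff;
	return ewma >> WEIGHT_SHIFT;
}

int main(void)
{
	/* A small task suddenly becomes big, then small again. */
	unsigned long samples[] = { 100, 800, 800, 800, 100, 100, 100 };
	unsigned long plain = 100, fast = 100;
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		plain = ewma_step(plain, samples[i], 0);
		fast = ewma_step(fast, samples[i], 1);
		printf("enqueued=%3lu  ewma=%3lu  ewma_fastup=%3lu\n",
		       samples[i], plain, fast);
	}
	return 0;
}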
Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Douglas Raillard Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191023205630.14469-1-patrick.bellasi@matbug.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++++++- kernel/sched/features.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a81c36472822..a14487462b6c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3768,11 +3768,22 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (ue.enqueued & UTIL_AVG_UNCHANGED) return; + /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + /* * Skip update of task's estimated utilization when its EWMA is * already ~1% close to its last activation value. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); last_ewma_diff = ue.enqueued - ue.ewma; if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) return; @@ -3805,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: WRITE_ONCE(p->se.avg.util_est, ue); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 2410db5e9a35..7481cd96f391 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) +SCHED_FEAT(UTIL_EST_FASTUP, true) -- cgit From 802f4a827f139f2581b3c50c69d20f8bf4c24af1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Oct 2019 04:56:47 +0200 Subject: sched/vtime: Record CPU under seqcount for kcpustat needs In order to compute the kcpustat delta on a nohz CPU, we'll need to fetch the task running on that target. Checking that its vtime state snapshot actually refers to the relevant target involves recording that CPU under the seqcount locked on task switch. This is a step toward making kcpustat moving forward on full nohz CPUs. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Jacek Anaszewski Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191016025700.31277-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cef23c211f41..40f581692254 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -818,6 +818,7 @@ void vtime_task_switch_generic(struct task_struct *prev) else __vtime_account_kernel(prev, vtime); vtime->state = VTIME_INACTIVE; + vtime->cpu = -1; write_seqcount_end(&vtime->seqcount); vtime = ¤t->vtime; @@ -825,6 +826,7 @@ void vtime_task_switch_generic(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); + vtime->cpu = smp_processor_id(); write_seqcount_end(&vtime->seqcount); } @@ -837,6 +839,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); + vtime->cpu = cpu; write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } -- cgit From 14faf6fcac4ba33f8fd8d9b2d0278010a9eb1742 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Oct 2019 04:56:48 +0200 Subject: sched/cputime: Add vtime idle task state Record idle as a VTIME state instead of guessing it from VTIME_SYS and is_idle_task(). This is going to simplify the cputime read side especially as its state machine is going to further expand in order to fully support kcpustat on nohz_full. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Jacek Anaszewski Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191016025700.31277-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 40f581692254..2e885e870aa1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -813,7 +813,7 @@ void vtime_task_switch_generic(struct task_struct *prev) struct vtime *vtime = &prev->vtime; write_seqcount_begin(&vtime->seqcount); - if (is_idle_task(prev)) + if (vtime->state == VTIME_IDLE) vtime_account_idle(prev); else __vtime_account_kernel(prev, vtime); @@ -824,7 +824,10 @@ void vtime_task_switch_generic(struct task_struct *prev) vtime = ¤t->vtime; write_seqcount_begin(&vtime->seqcount); - vtime->state = VTIME_SYS; + if (is_idle_task(current)) + vtime->state = VTIME_IDLE; + else + vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); vtime->cpu = smp_processor_id(); write_seqcount_end(&vtime->seqcount); @@ -837,7 +840,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); - vtime->state = VTIME_SYS; + vtime->state = VTIME_IDLE; vtime->starttime = sched_clock(); vtime->cpu = cpu; write_seqcount_end(&vtime->seqcount); @@ -888,8 +891,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *utime = t->utime; *stime = t->stime; - /* Task is sleeping, nothing to add */ - if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) + /* Task is sleeping or idle, nothing to add */ + if (vtime->state < VTIME_SYS) continue; delta = vtime_delta(vtime); -- cgit From e6d5bf3e321ca664d12eb00ceb40bd58987ce8a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Oct 2019 04:56:49 +0200 Subject: sched/cputime: Add vtime guest task state Record guest as a VTIME state instead of guessing it from VTIME_SYS and PF_VCPU. This is going to simplify the cputime read side especially as its state machine is going to further expand in order to fully support kcpustat on nohz_full. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Jacek Anaszewski Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191016025700.31277-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2e885e870aa1..34086afc3518 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -733,7 +733,7 @@ static void __vtime_account_kernel(struct task_struct *tsk, struct vtime *vtime) { /* We might have scheduled out from guest path */ - if (tsk->flags & PF_VCPU) + if (vtime->state == VTIME_GUEST) vtime_account_guest(tsk, vtime); else vtime_account_system(tsk, vtime); @@ -788,6 +788,7 @@ void vtime_guest_enter(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_system(tsk, vtime); tsk->flags |= PF_VCPU; + vtime->state = VTIME_GUEST; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -799,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); tsk->flags &= ~PF_VCPU; + vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -826,6 +828,8 @@ void vtime_task_switch_generic(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); if (is_idle_task(current)) vtime->state = VTIME_IDLE; + else if (current->flags & PF_VCPU) + vtime->state = VTIME_GUEST; else vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); @@ -860,7 +864,7 @@ u64 task_gtime(struct task_struct *t) seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + if (vtime->state == VTIME_GUEST) gtime += vtime->gtime + vtime_delta(vtime); } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -898,13 +902,13 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) delta = vtime_delta(vtime); /* - * Task runs either in user or kernel space, add pending nohz time to - * the right place. + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. */ - if (vtime->state == VTIME_USER || t->flags & PF_VCPU) - *utime += vtime->utime + delta; - else if (vtime->state == VTIME_SYS) + if (vtime->state == VTIME_SYS) *stime += vtime->stime + delta; + else + *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit From 74c578759f15cb5a0d0107759bdad671d7b52ab9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Oct 2019 04:56:51 +0200 Subject: context_tracking: Rename context_tracking_is_enabled() => context_tracking_enabled() Remove the superfluous "is" in the middle of the name. We want to standardize the naming so that it can be expanded through suffixes: context_tracking_enabled() context_tracking_enabled_cpu() context_tracking_enabled_this_cpu() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Jacek Anaszewski Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191016025700.31277-6-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/context_tracking.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index be01a4d627c9..0296b4bda8f1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -25,8 +25,8 @@ #define CREATE_TRACE_POINTS #include -DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); -EXPORT_SYMBOL_GPL(context_tracking_enabled); +DEFINE_STATIC_KEY_FALSE(context_tracking_key); +EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); @@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_branch_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_key); } if (initialized) -- cgit From e44fcb4b7a299602fb300b82a546c0b8a50d9d90 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Oct 2019 04:56:54 +0200 Subject: sched/vtime: Rename vtime_accounting_cpu_enabled() to vtime_accounting_enabled_this_cpu() Standardize the naming on top of the vtime_accounting_enabled_*() base. Also make it clear we are checking the vtime state of the *current* CPU with this function. We'll need to add an API to check that state on remote CPUs as well, so we must disambiguate the naming. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Jacek Anaszewski Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191016025700.31277-9-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- kernel/time/tick-sched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 34086afc3518..b931a19df093 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -475,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick) u64 cputime, steal; struct rq *rq = this_rq(); - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 955851748dc3..c2748232f607 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1119,7 +1119,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. Update process times would miss the -- cgit From 64eea63c19a2c386a96638f4e54a1355510709e3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 25 Oct 2019 04:03:03 +0200 Subject: sched/kcpustat: Introduce vtime-aware kcpustat accessor for CPUTIME_SYSTEM Kcpustat is not correctly supported on nohz_full CPUs. The tick doesn't fire and the cputime therefore doesn't move forward. The issue has shown up after the vanishing of the remaining 1Hz which has made the stall visible. 
We are solving that with checking the task running on a CPU through RCU and reading its vtime delta that we add to the raw kcpustat values. We make sure that we fetch a coherent raw-kcpustat/vtime-delta couple sequence while checking that the CPU referred by the target vtime is the correct one, under the locked vtime seqcount. Only CPUTIME_SYSTEM is handled here as a start because it's the trivial case. User and guest time will require more preparation work to correctly handle niceness. Reported-by: Yauheni Kaliuta Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: https://lkml.kernel.org/r/20191025020303.19342-1-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b931a19df093..e0cd20693ef5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -911,4 +911,86 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); } + +static int kcpustat_field_vtime(u64 *cpustat, + struct vtime *vtime, + enum cpu_usage_stat usage, + int cpu, u64 *val) +{ + unsigned int seq; + int err; + + do { + seq = read_seqcount_begin(&vtime->seqcount); + + /* + * We raced against context switch, fetch the + * kcpustat task again. + */ + if (vtime->cpu != cpu && vtime->cpu != -1) + return -EAGAIN; + + /* + * Two possible things here: + * 1) We are seeing the scheduling out task (prev) or any past one. + * 2) We are seeing the scheduling in task (next) but it hasn't + * passed though vtime_task_switch() yet so the pending + * cputime of the prev task may not be flushed yet. + * + * Case 1) is ok but 2) is not. So wait for a safe VTIME state. + */ + if (vtime->state == VTIME_INACTIVE) + return -EAGAIN; + + err = 0; + + *val = cpustat[usage]; + + if (vtime->state == VTIME_SYS) + *val += vtime->stime + vtime_delta(vtime); + + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return 0; +} + +u64 kcpustat_field(struct kernel_cpustat *kcpustat, + enum cpu_usage_stat usage, int cpu) +{ + u64 *cpustat = kcpustat->cpustat; + struct rq *rq; + u64 val; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) + return cpustat[usage]; + + /* Only support sys vtime for now */ + if (usage != CPUTIME_SYSTEM) + return cpustat[usage]; + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + struct vtime *vtime; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + return cpustat[usage]; + } + + vtime = &curr->vtime; + err = kcpustat_field_vtime(cpustat, vtime, usage, cpu, &val); + rcu_read_unlock(); + + if (!err) + return val; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_field); #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit From 751459043cc87c3f0098034b15ca5252d12539ab Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 22 Oct 2019 20:34:50 -0700 Subject: futex: Drop leftover wake_q_add() comment Since the original comment, we have moved to do the task reference counting explicitly along with wake_q_add_safe(). Drop the now incorrect comment. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: https://lkml.kernel.org/r/20191023033450.6445-1-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bd18f60e4c6c..43229f8999fc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1480,7 +1480,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) /* * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. + * the hb->lock. */ wake_q_add_safe(wake_q, p); } -- cgit From a0855d24fc22d49cdc25664fb224caee16998683 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 24 Oct 2019 20:36:34 -0700 Subject: locking/mutex: Complain upon mutex API misuse in IRQ contexts Add warning checks if mutex_trylock() or mutex_unlock() are used in IRQ contexts, under CONFIG_DEBUG_MUTEXES=y. While the mutex rules and semantics are explicitly documented, this allows to expose any abusers and robustifies the whole thing. While trylock and unlock are non-blocking, calling from IRQ context is still forbidden (lock must be within the same context as unlock). Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: https://lkml.kernel.org/r/20191025033634.3330-1-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5352ce50a97e..54cc5f9286e9 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -733,6 +733,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ void __sched mutex_unlock(struct mutex *lock) { +#ifdef CONFIG_DEBUG_MUTEXES + WARN_ON(in_interrupt()); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC if (__mutex_unlock_fast(lock)) return; @@ -1413,6 +1416,7 @@ int __sched mutex_trylock(struct mutex *lock) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(lock->magic != lock); + WARN_ON(in_interrupt()); #endif locked = __mutex_trylock(lock); -- cgit From 771b53d033e8663abdf59704806aa856b236dcdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2019 10:25:58 -0600 Subject: io-wq: small threadpool implementation for io_uring This adds support for io-wq, a smaller and specialized thread pool implementation. This is meant to replace workqueues for io_uring. Among the reasons for this addition are: - We can assign memory context smarter and more persistently if we manage the life time of threads. - We can drop various work-arounds we have in io_uring, like the async_list. - We can implement hashed work insertion, to manage concurrency of buffered writes without needing a) an extra workqueue, or b) needlessly making the concurrency of said workqueue very low which hurts performance of multiple buffered file writers. - We can implement cancel through signals, for cancelling interruptible work like read/write (or send/recv) to/from sockets. - We need the above cancel for being able to assign and use file tables from a process. - We can implement a more thorough cancel operation in general. - We need it to move towards a syslet/threadlet model for even faster async execution. For that we need to take ownership of the used threads. 
This list is just off the top of my head. Performance should be the same, or better, at least that's what I've seen in my testing. io-wq supports basic NUMA functionality, setting up a pool per node. io-wq hooks up to the scheduler schedule in/out just like workqueue and uses that to drive the need for more/less workers. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- kernel/sched/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd05a378631a..a95a2f05f3b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include #include "../workqueue_internal.h" +#include "../../fs/io-wq.h" #include "../smpboot.h" #include "pelt.h" @@ -4103,9 +4104,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * we disable preemption to avoid it calling schedule() again * in the possible wakeup of a kworker. */ - if (tsk->flags & PF_WQ_WORKER) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - wq_worker_sleeping(tsk); + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); } @@ -4122,8 +4126,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) -- cgit From 15ab09bdca616538597fe4d2eb4db3c2d28716ba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 28 Oct 2019 20:24:26 -0700 Subject: bpf: Enforce 'return 0' in BTF-enabled raw_tp programs The return value of raw_tp programs is ignored by __bpf_trace_run() that calls them. The verifier also allows any value to be returned. For BTF-enabled raw_tp lets enforce 'return 0', so that return value can be used for something in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191029032426.1206762-1-ast@kernel.org --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c59778c0fc4d..6b0de04f8b91 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6279,6 +6279,11 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (!env->prog->aux->attach_btf_id) + return 0; + range = tnum_const(0); + break; default: return 0; } -- cgit From 1d24dd4e01fb6b928cf679e3e415ddff7016fa96 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 8 Aug 2019 10:32:58 +0800 Subject: rcu: Several rcu_segcblist functions can be static None of rcu_segcblist_set_len(), rcu_segcblist_add_len(), or rcu_segcblist_xchg_len() are used outside of kernel/rcu/rcu_segcblist.c. This commit therefore makes them static. Fixes: eda669a6a2c5 ("rcu/nocb: Atomic ->len field in rcu_segcblist structure") Signed-off-by: kbuild test robot [ paulmck: "Fixes:" updated per Stephen Rothwell feedback. ] Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu_segcblist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 495c58ce1640..cbc87b804db9 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -88,7 +88,7 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) } /* Set the length of an rcu_segcblist structure. */ -void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) +static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU atomic_long_set(&rsclp->len, v); @@ -104,7 +104,7 @@ void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) * This increase is fully ordered with respect to the callers accesses * both before and after. */ -void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU smp_mb__before_atomic(); /* Up to the caller! */ @@ -134,7 +134,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) * with the actual number of callbacks on the structure. This exchange is * fully ordered with respect to the callers accesses both before and after. */ -long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) +static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU return atomic_long_xchg(&rsclp->len, v); -- cgit From 5a6446626d7e062b47a5cc1cf27d5060e60bb0d9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 15 Aug 2019 10:18:42 -0400 Subject: workqueue: Convert for_each_wq to use built-in list check Because list_for_each_entry_rcu() can now check for holding a lock as well as for being in an RCU read-side critical section, this commit replaces the workqueue_sysfs_unregister() function's use of assert_rcu_or_wq_mutex() and list_for_each_entry_rcu() with list_for_each_entry_rcu() augmented with a lockdep_is_held() optional argument. Acked-by: Tejun Heo Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/workqueue.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc2e09a8ea61..e501c79e283a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -364,11 +364,6 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") -#define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ - !lockdep_is_held(&wq->mutex), \ - "RCU or wq->mutex should be held") - #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ @@ -425,9 +420,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * ignored. */ #define for_each_pwq(pwq, wq) \ - list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ - if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ - else + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ + lockdep_is_held(&(wq->mutex))) #ifdef CONFIG_DEBUG_OBJECTS_WORK -- cgit From 05ef9e9eb3dade21413680f41eb0170778e8ae2b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 15 Aug 2019 22:59:14 -0400 Subject: rcu: Ensure that ->rcu_urgent_qs is set before resched IPI The RCU-specific resched_cpu() function sends a resched IPI to the specified CPU, which can be used to force the tick on for a given nohz_full CPU. 
This is needed when this nohz_full CPU is looping in the kernel while blocking the current grace period. However, for the tick to actually be forced on in all cases, that CPU's rcu_data structure's ->rcu_urgent_qs flag must be set beforehand. This commit therefore causes rcu_implicit_dynticks_qs() to set this flag prior to invoking resched_cpu() on a holdout nohz_full CPU. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81105141b6a8..0d83b1944e19 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1073,6 +1073,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (tick_nohz_full_cpu(rdp->cpu) && time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { + WRITE_ONCE(*ruqp, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); } -- cgit From b8889c9c89a2655a231dfed93cc9bdca0930ea67 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 23 Sep 2019 17:26:34 +0300 Subject: rcu: Fix uninitialized variable in nocb_gp_wait() We never set this to false. This probably doesn't affect most people's runtime because GCC will automatically initialize it to false at certain common optimization levels. But that behavior is related to a bug in GCC and obviously should not be relied on. Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs") Signed-off-by: Dan Carpenter Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2defc7fe74c3..fa08d55f7040 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1946,7 +1946,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) int __maybe_unused cpu = my_rdp->cpu; unsigned long cur_gp_seq; unsigned long flags; - bool gotcbs; + bool gotcbs = false; unsigned long j = jiffies; bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; -- cgit From 36b5dae64513b7ce3a0e0f6cb469e0f74bacad45 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Sep 2019 10:10:31 -0700 Subject: rcu: Suppress levelspread uninitialized messages New tools bring new warnings, and with v5.3 comes: kernel/rcu/srcutree.c: warning: 'levelspread[]' may be used uninitialized in this function [-Wuninitialized]: => 121:34 This commit suppresses this warning by initializing the full array to INT_MIN, which will result in failures should any out-of-bounds references appear. Reported-by: Michael Ellerman Reported-by: Geert Uytterhoeven Signed-off-by: Paul E. McKenney Reviewed-by: Geert Uytterhoeven --- kernel/rcu/rcu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 8fd4f82c9b3d..b64c707f6065 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -299,6 +299,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; + for (i = 0; i < RCU_NUM_LVLS; i++) + levelspread[i] = INT_MIN; if (rcu_fanout_exact) { levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) -- cgit From 6092f7263f7e56dacf72e383ed7ba9cbabec30e5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 23 Sep 2019 15:37:04 -0700 Subject: bpf/cgroup: Replace rcu_swap_protected() with rcu_replace_pointer() This commit replaces the use of rcu_swap_protected() with the more intuitively appealing rcu_replace_pointer() as a step towards removing rcu_swap_protected(). Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/ Reported-by: Linus Torvalds [ paulmck: From rcu_replace() to rcu_replace_pointer() per Ingo Molnar. ] Signed-off-by: Paul E. McKenney Acked-by: Andrii Nakryiko Acked-by: Song Liu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Yonghong Song Cc: Cc: --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddd8addcdb5c..c684cf424849 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -180,8 +180,8 @@ static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array *old_array) { - rcu_swap_protected(cgrp->bpf.effective[type], old_array, - lockdep_is_held(&cgroup_mutex)); + old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ -- cgit From a445e940ea686fc60475564009821010eb213be3 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 30 Oct 2019 10:13:13 +0000 Subject: dma-mapping: fix handling of dma-ranges for reserved memory (again) Daniele reported that issue previously fixed in c41f9ea998f3 ("drivers: dma-coherent: Account dma_pfn_offset when used with device tree") reappear shortly after 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") where fix was accidentally dropped. Lets put fix back in place and respect dma-ranges for reserved memory. Fixes: 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") Reported-by: Daniele Alessandrelli Tested-by: Daniele Alessandrelli Tested-by: Alexandre Torgue Signed-off-by: Vladimir Murzin Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 545e3869b0e3..551b0eb7028a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -123,8 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, return ret; } -static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) +static void *__dma_alloc_from_coherent(struct device *dev, + struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) { int order = get_order(size); unsigned long flags; @@ -143,7 +144,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, /* * Memory was found in the coherent area. 
*/ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); memset(ret, 0, size); @@ -175,17 +176,18 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, if (!mem) return 0; - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle); return 1; } -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle) { if (!dma_coherent_default_memory) return NULL; - return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); + return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, + dma_handle); } static int __dma_release_from_coherent(struct dma_coherent_mem *mem, -- cgit From 9ff6aa027dbb98755f0265695354f2dd07c0d1ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Oct 2019 14:56:46 -0700 Subject: dma-debug: add a schedule point in debug_dma_dump_mappings() debug_dma_dump_mappings() can take a lot of cpu cycles : lpk43:/# time wc -l /sys/kernel/debug/dma-api/dump 163435 /sys/kernel/debug/dma-api/dump real 0m0.463s user 0m0.003s sys 0m0.459s Let's add a cond_resched() to avoid holding cpu for too long. Signed-off-by: Eric Dumazet Cc: Corentin Labbe Cc: Christoph Hellwig Cc: Marek Szyprowski Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..4ad74f5987ea 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -420,6 +420,7 @@ void debug_dma_dump_mappings(struct device *dev) } spin_unlock_irqrestore(&bucket->lock, flags); + cond_resched(); } } -- cgit From ca66536845cd55c6a5fccd82694dcc87ed970780 Mon Sep 17 00:00:00 2001 From: Shyam Saini Date: Sun, 20 Oct 2019 10:33:22 +0530 Subject: kernel: dma-contiguous: mark CMA parameters __initdata/__initconst These parameters are only referenced by __init routine calls during early boot so they should be marked as __initdata and __initconst accordingly. Signed-off-by: Shyam Saini Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 69cfb4345388..daa4e6eefdde 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,10 +42,11 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. 
*/ -static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; -static phys_addr_t size_cmdline = -1; -static phys_addr_t base_cmdline; -static phys_addr_t limit_cmdline; +static const phys_addr_t size_bytes __initconst = + (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline __initdata = -1; +static phys_addr_t base_cmdline __initdata; +static phys_addr_t limit_cmdline __initdata; static int __init early_cma(char *p) { -- cgit From 7541c87c9b7a7e07c84481f37f2c19063b44469b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 28 Oct 2019 13:29:02 +0100 Subject: bpf: Allow narrow loads of bpf_sysctl fields with offset > 0 "ctx:file_pos sysctl:read read ok narrow" works on s390 by accident: it reads the wrong byte, which happens to have the expected value of 0. Improve the test by seeking to the 4th byte and expecting 4 instead of 0. This makes the latent problem apparent: the test attempts to read the first byte of bpf_sysctl.file_pos, assuming this is the least-significant byte, which is not the case on big-endian machines: a non-zero offset is needed. The point of the test is to verify narrow loads, so we cannot cheat our way out by simply using BPF_W. The existence of the test means that such loads have to be supported, most likely because llvm can generate them. Fix the test by adding a big-endian variant, which uses an offset to access the least-significant byte of bpf_sysctl.file_pos. This reveals the final problem: verifier rejects accesses to bpf_sysctl fields with offset > 0. Such accesses are already allowed for a wide range of structs: __sk_buff, bpf_sock_addr and sk_msg_md to name a few. Extend this support to bpf_sysctl by using bpf_ctx_range instead of offsetof when matching field offsets. Fixes: 7b146cebe30c ("bpf: Sysctl hook") Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx") Fixes: 9a1027e52535 ("selftests/bpf: Test file_pos field in bpf_sysctl ctx") Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrey Ignatov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191028122902.9763-1-iii@linux.ibm.com --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddd8addcdb5c..a3eaf08e7dd3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1311,12 +1311,12 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, return false; switch (off) { - case offsetof(struct bpf_sysctl, write): + case bpf_ctx_range(struct bpf_sysctl, write): if (type != BPF_READ) return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); - case offsetof(struct bpf_sysctl, file_pos): + case bpf_ctx_range(struct bpf_sysctl, file_pos): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); -- cgit From af91acbc62999d62e2ca1e80f660d20561ca55d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 16:30:19 -0700 Subject: bpf: Fix bpf jit kallsym access Jiri reported crash when JIT is on, but net.core.bpf_jit_kallsyms is off. bpf_prog_kallsyms_find() was skipping addr->bpf_prog resolution logic in oops and stack traces. That's incorrect. It should only skip addr->name resolution for 'cat /proc/kallsyms'. That's what bpf_jit_kallsyms and bpf_jit_harden protect. 
Fixes: 3dec541b2e63 ("bpf: Add support for BTF pointers to x86 JIT") Reported-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191030233019.1187404-1-ast@kernel.org --- kernel/bpf/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 673f5d40a93e..8d3fbc86ca5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -668,9 +668,6 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) { struct latch_tree_node *n; - if (!bpf_jit_kallsyms_enabled()) - return NULL; - n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : -- cgit From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 15:32:11 -0700 Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing The bpf program type raw_tp together with 'expected_attach_type' was the most appropriate api to indicate BTF-enabled raw_tp programs. But during development it became apparent that 'expected_attach_type' cannot be used and new 'attach_btf_id' field had to be introduced. Which means that the information is duplicated in two fields where one of them is ignored. Clean it up by introducing new program type where both 'expected_attach_type' and 'attach_btf_id' fields have specific meaning. In the future 'expected_attach_type' will be extended with other attach points that have similar semantics to raw_tp. This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with prog_type = BPF_RPOG_TYPE_TRACING expected_attach_type = BPF_TRACE_RAW_TP attach_btf_id = btf_id of raw tracepoint inside the kernel Future patches will add expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT where programs have the same input context and the same helpers, but different attach points. 
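A minimal sketch, assuming only the UAPI fields named above (prog_type, expected_attach_type, attach_btf_id), of how user space would load a BTF-enabled raw tracepoint under the new scheme. Resolving the btf_id from the vmlinux BTF "btf_trace_<name>" typedef is elided, and the trivial "return 0" body matches what the verifier enforces for these programs. This is an illustration, not code from this patch:

  #include <linux/bpf.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int load_tracing_raw_tp(__u32 btf_id)
  {
          struct bpf_insn insns[] = {
                  { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 }, /* r0 = 0 */
                  { .code = BPF_JMP | BPF_EXIT },                                /* exit   */
          };
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.prog_type            = BPF_PROG_TYPE_TRACING;
          attr.expected_attach_type = BPF_TRACE_RAW_TP;
          attr.attach_btf_id        = btf_id;
          attr.insns                = (unsigned long)insns;
          attr.insn_cnt             = 2;
          attr.license              = (unsigned long)"GPL";

          return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }

The returned fd would then be passed to BPF_RAW_TRACEPOINT_OPEN without a name, exactly as the bpf_raw_tracepoint_open() change below expects for BPF_PROG_TYPE_TRACING programs.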
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org --- kernel/bpf/syscall.c | 6 +++--- kernel/bpf/verifier.c | 34 ++++++++++++++++++++++++---------- kernel/trace/bpf_trace.c | 44 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ff5225759553..985d01ced196 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1571,7 +1571,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, u32 btf_id) { switch (prog_type) { - case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_TRACING: if (btf_id > BTF_MAX_TYPE) return -EINVAL; break; @@ -1833,13 +1833,13 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return PTR_ERR(prog); if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->aux->attach_btf_id) { + if (prog->type == BPF_PROG_TYPE_TRACING) { if (attr->raw_tracepoint.name) { /* raw_tp name should not be specified in raw_tp * programs that were verified via in-kernel BTF info diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b0de04f8b91..2f2374967b36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9381,24 +9381,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; u32 btf_id = prog->aux->attach_btf_id; + const char prefix[] = "btf_trace_"; const struct btf_type *t; const char *tname; - if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) { - const char prefix[] = "btf_trace_"; + if (prog->type != BPF_PROG_TYPE_TRACING) + return 0; - t = btf_type_by_id(btf_vmlinux, btf_id); - if (!t) { - verbose(env, "attach_btf_id %u is invalid\n", btf_id); - return -EINVAL; - } + if (!btf_id) { + verbose(env, "Tracing programs must provide btf_id\n"); + return -EINVAL; + } + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t) { + verbose(env, "attach_btf_id %u is invalid\n", btf_id); + return -EINVAL; + } + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + if (!tname) { + verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); + return -EINVAL; + } + + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: if (!btf_type_is_typedef(t)) { verbose(env, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) { + if (strncmp(prefix, tname, sizeof(prefix) - 1)) { verbose(env, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; @@ -9419,8 +9431,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_name = tname; prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; + return 0; + default: + return -EINVAL; } - return 0; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 571c25d60710..f50bf19f7a05 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1055,10 +1055,6 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return 
&bpf_perf_event_output_proto_raw_tp; -#ifdef CONFIG_NET - case BPF_FUNC_skb_output: - return &bpf_skb_output_proto; -#endif case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: @@ -1068,20 +1064,44 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#ifdef CONFIG_NET + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; +#endif + default: + return raw_tp_prog_func_proto(func_id, prog); + } +} + static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - /* largest tracepoint in the kernel has 12 args */ - if (off < 0 || off >= sizeof(__u64) * 12) + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static bool tracing_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (!prog->aux->attach_btf_id) - return true; return btf_ctx_access(off, size, type, prog, info); } @@ -1093,6 +1113,14 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +const struct bpf_verifier_ops tracing_verifier_ops = { + .get_func_proto = tracing_prog_func_proto, + .is_valid_access = tracing_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracing_prog_ops = { +}; + static bool raw_tp_writable_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit From f94df9890e98f2090c6a8d70c795134863b70201 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2019 16:07:45 +0100 Subject: Add wake_up_interruptible_sync_poll_locked() Add a wakeup call for a case whereby the caller already has the waitqueue spinlock held. This can be used by pipes to alter the ring buffer indices and issue a wakeup under the same spinlock. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- kernel/sched/wait.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b4b52361dab7..ba059fbfc53a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -191,6 +191,29 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL_GPL(__wake_up_sync_key); +/** + * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue. + * @wq_head: the waitqueue + * @mode: which threads + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. 
+ */ +void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, + unsigned int mode, void *key) +{ + __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); + /* * __wake_up_sync - see __wake_up_sync_key() */ -- cgit From ff1c08e1f74b6864854c39be48aa799a6a2e4d2b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 29 Oct 2019 16:43:07 +0100 Subject: bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions bpf_map_area_alloc() and bpf_map_charge_init() prior this commit passed the size parameter as size_t. In this commit this is changed to u64. All users of these functions avoid size_t overflows on 32-bit systems, by explicitly using u64 when calculating the allocation size and memory charge cost. However, since the result was narrowed by the size_t when passing size and cost to the functions, the overflow handling was in vain. Instead of changing all call sites to size_t and handle overflow at the call site, the parameter is changed to u64 and checked in the functions above. Fixes: d407bd25a204 ("bpf: don't trigger OOM killer under pressure with map alloc") Fixes: c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191029154307.23053-1-bjorn.topel@gmail.com --- kernel/bpf/syscall.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0937719b87e2..ace1cfaa24b6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -126,7 +126,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size, int numa_node) +void *bpf_map_area_alloc(u64 size, int numa_node) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -141,6 +141,9 @@ void *bpf_map_area_alloc(size_t size, int numa_node) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; + if (size >= SIZE_MAX) + return NULL; + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); @@ -197,7 +200,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) { u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; struct user_struct *user; -- cgit From 8b5369ea580964dbc982781bfb9fb93459fc5e8d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 14 Oct 2019 20:31:03 +0200 Subject: dma/direct: turn ARCH_ZONE_DMA_BITS into a variable Some architectures, notably ARM, are interested in tweaking this depending on their runtime DMA addressing limitations. 
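As a hedged illustration of the intended use (not part of this patch, which only converts the dma-direct core), an architecture with a tighter DMA limit could now override the default from its early init code. The extern declaration is assumed to live in <linux/dma-direct.h>, and the 30-bit limit below is made up for the example:

  #include <linux/dma-direct.h>   /* assumed to declare: extern unsigned int zone_dma_bits; */
  #include <linux/init.h>

  /* hypothetical arch hook: this platform can only DMA to the first 1 GiB */
  static void __init example_arch_setup_zone_dma(void)
  {
          zone_dma_bits = 30;
  }

With that in place, __dma_direct_optimal_gfp_mask() and dma_direct_supported() below pick GFP_DMA and the minimum mask from the runtime value instead of the old compile-time ARCH_ZONE_DMA_BITS.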
Acked-by: Christoph Hellwig Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Catalin Marinas --- kernel/dma/direct.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..0b67c04e531b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,12 +16,11 @@ #include /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but - * some use it for entirely different regions: + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * it for entirely different regions. In that case the arch code needs to + * override the variable below for dma-direct to work properly. */ -#ifndef ARCH_ZONE_DMA_BITS -#define ARCH_ZONE_DMA_BITS 24 -#endif +unsigned int zone_dma_bits __ro_after_init = 24; static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { @@ -69,7 +68,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; if (*phys_mask <= DMA_BIT_MASK(32)) return GFP_DMA32; @@ -395,7 +394,7 @@ int dma_direct_supported(struct device *dev, u64 mask) u64 min_mask; if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + min_mask = DMA_BIT_MASK(zone_dma_bits); else min_mask = DMA_BIT_MASK(32); -- cgit From 7e35e4eb7e56233dcf445992d7b835a9ba93408e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:09 +0100 Subject: livepatch: Keep replaced patches until post_patch callback is called Pre/post (un)patch callbacks might manipulate the system state. Cumulative livepatches might need to take over the changes made by the replaced ones. For this they might need to access some data stored or referenced by the old livepatches. Therefore the replaced livepatches have to stay around until post_patch() callback is called. It is achieved by calling the free functions later. It is the same location where disabled livepatches have already been freed. Link: http://lkml.kernel.org/r/20191030154313.13263-2-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 36 ++++++++++++++++++++++++++---------- kernel/livepatch/core.h | 5 +++-- kernel/livepatch/transition.c | 12 ++++++------ 3 files changed, 35 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ab4a4606d19b..1e1d87ead55c 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -632,7 +632,7 @@ static void klp_free_objects_dynamic(struct klp_patch *patch) * The operation must be completed by calling klp_free_patch_finish() * outside klp_mutex. 
*/ -void klp_free_patch_start(struct klp_patch *patch) +static void klp_free_patch_start(struct klp_patch *patch) { if (!list_empty(&patch->list)) list_del(&patch->list); @@ -677,6 +677,23 @@ static void klp_free_patch_work_fn(struct work_struct *work) klp_free_patch_finish(patch); } +void klp_free_patch_async(struct klp_patch *patch) +{ + klp_free_patch_start(patch); + schedule_work(&patch->free_work); +} + +void klp_free_replaced_patches_async(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + klp_free_patch_async(old_patch); + } +} + static int klp_init_func(struct klp_object *obj, struct klp_func *func) { if (!func->old_name) @@ -1022,12 +1039,13 @@ err: EXPORT_SYMBOL_GPL(klp_enable_patch); /* - * This function removes replaced patches. + * This function unpatches objects from the replaced livepatches. * * We could be pretty aggressive here. It is called in the situation where - * these structures are no longer accessible. All functions are redirected - * by the klp_transition_patch. They use either a new code or they are in - * the original code because of the special nop function patches. + * these structures are no longer accessed from the ftrace handler. + * All functions are redirected by the klp_transition_patch. They + * use either a new code or they are in the original code because + * of the special nop function patches. * * The only exception is when the transition was forced. In this case, * klp_ftrace_handler() might still see the replaced patch on the stack. @@ -1035,18 +1053,16 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * thanks to RCU. We only have to keep the patches on the system. Also * this is handled transparently by patch->module_put. */ -void klp_discard_replaced_patches(struct klp_patch *new_patch) +void klp_unpatch_replaced_patches(struct klp_patch *new_patch) { - struct klp_patch *old_patch, *tmp_patch; + struct klp_patch *old_patch; - klp_for_each_patch_safe(old_patch, tmp_patch) { + klp_for_each_patch(old_patch) { if (old_patch == new_patch) return; old_patch->enabled = false; klp_unpatch_objects(old_patch); - klp_free_patch_start(old_patch); - schedule_work(&old_patch->free_work); } } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index ec43a40b853f..38209c7361b6 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -13,8 +13,9 @@ extern struct list_head klp_patches; #define klp_for_each_patch(patch) \ list_for_each_entry(patch, &klp_patches, list) -void klp_free_patch_start(struct klp_patch *patch); -void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_free_patch_async(struct klp_patch *patch); +void klp_free_replaced_patches_async(struct klp_patch *new_patch); +void klp_unpatch_replaced_patches(struct klp_patch *new_patch); void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index cdf318d86dd6..f6310f848f34 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -78,7 +78,7 @@ static void klp_complete_transition(void) klp_target_state == KLP_PATCHED ? 
"patching" : "unpatching"); if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { - klp_discard_replaced_patches(klp_transition_patch); + klp_unpatch_replaced_patches(klp_transition_patch); klp_discard_nops(klp_transition_patch); } @@ -446,14 +446,14 @@ void klp_try_complete_transition(void) klp_complete_transition(); /* - * It would make more sense to free the patch in + * It would make more sense to free the unused patches in * klp_complete_transition() but it is called also * from klp_cancel_transition(). */ - if (!patch->enabled) { - klp_free_patch_start(patch); - schedule_work(&patch->free_work); - } + if (!patch->enabled) + klp_free_patch_async(patch); + else if (patch->replace) + klp_free_replaced_patches_async(patch); } /* -- cgit From 73727f4dafa2df107e85753c5ab703a1f344e1f1 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:10 +0100 Subject: livepatch: Basic API to track system state changes This is another step how to help maintaining more livepatches. One big help was the atomic replace and cumulative livepatches. These livepatches replace the already installed ones. Therefore it should be enough when each cumulative livepatch is consistent. The problems might come with shadow variables and callbacks. They might change the system behavior or state so that it is no longer safe to go back and use an older livepatch or the original kernel code. Also, a new livepatch must be able to detect changes which were made by the already installed livepatches. This is where the livepatch system state tracking gets useful. It allows to: - find whether a system state has already been modified by previous livepatches - store data needed to manipulate and restore the system state The information about the manipulated system states is stored in an array of struct klp_state. It can be searched by two new functions klp_get_state() and klp_get_prev_state(). The dependencies are going to be solved by a version field added later. The only important information is that it will be allowed to modify the same state by more non-cumulative livepatches. It is similar to allowing to modify the same function several times. The livepatch author is responsible for preventing incompatible changes. 
Link: http://lkml.kernel.org/r/20191030154313.13263-3-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/Makefile | 2 +- kernel/livepatch/state.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 kernel/livepatch/state.c (limited to 'kernel') diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index cf9b5bcdb952..cf03d4bdfc66 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o shadow.o transition.o +livepatch-objs := core.o patch.o shadow.o state.o transition.o diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c new file mode 100644 index 000000000000..6ab15b642c0a --- /dev/null +++ b/kernel/livepatch/state.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * system_state.c - State of the system modified by livepatches + * + * Copyright (C) 2019 SUSE + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include "core.h" +#include "transition.h" + +#define klp_for_each_state(patch, state) \ + for (state = patch->states; state && state->id; state++) + +/** + * klp_get_state() - get information about system state modified by + * the given patch + * @patch: livepatch that modifies the given system state + * @id: custom identifier of the modified system state + * + * Checks whether the given patch modifies the given system state. + * + * The function can be called either from pre/post (un)patch + * callbacks or from the kernel code added by the livepatch. + * + * Return: pointer to struct klp_state when found, otherwise NULL. + */ +struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id) +{ + struct klp_state *state; + + klp_for_each_state(patch, state) { + if (state->id == id) + return state; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_get_state); + +/** + * klp_get_prev_state() - get information about system state modified by + * the already installed livepatches + * @id: custom identifier of the modified system state + * + * Checks whether already installed livepatches modify the given + * system state. + * + * The same system state can be modified by more non-cumulative + * livepatches. It is expected that the latest livepatch has + * the most up-to-date information. + * + * The function can be called only during transition when a new + * livepatch is being enabled or when such a transition is reverted. + * It is typically called only from from pre/post (un)patch + * callbacks. + * + * Return: pointer to the latest struct klp_state from already + * installed livepatches, NULL when not found. 
+ */ +struct klp_state *klp_get_prev_state(unsigned long id) +{ + struct klp_patch *patch; + struct klp_state *state, *last_state = NULL; + + if (WARN_ON_ONCE(!klp_transition_patch)) + return NULL; + + klp_for_each_patch(patch) { + if (patch == klp_transition_patch) + goto out; + + state = klp_get_state(patch, id); + if (state) + last_state = state; + } + +out: + return last_state; +} +EXPORT_SYMBOL_GPL(klp_get_prev_state); -- cgit From 92c9abf5e57500ea7dc59a55273aa7850b631bda Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:11 +0100 Subject: livepatch: Allow to distinguish different version of system state changes The atomic replace runs pre/post (un)install callbacks only from the new livepatch. There are several reasons for this: + Simplicity: clear ordering of operations, no interactions between old and new callbacks. + Reliability: only new livepatch knows what changes can already be made by older livepatches and how to take over the state. + Testing: the atomic replace can be properly tested only when a newer livepatch is available. It might be too late to fix unwanted effect of callbacks from older livepatches. It might happen that an older change is not enough and the same system state has to be modified another way. Different changes need to get distinguished by a version number added to struct klp_state. The version can also be used to prevent loading incompatible livepatches. The check is done when the livepatch is enabled. The rules are: + Any completely new system state modification is allowed. + System state modifications with the same or higher version are allowed for already modified system states. + Cumulative livepatches must handle all system state modifications from already installed livepatches. + Non-cumulative livepatches are allowed to touch already modified system states. Link: http://lkml.kernel.org/r/20191030154313.13263-4-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 8 ++++++++ kernel/livepatch/state.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/livepatch/state.h | 9 +++++++++ 3 files changed, 53 insertions(+) create mode 100644 kernel/livepatch/state.h (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 1e1d87ead55c..c3512e7e0801 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -22,6 +22,7 @@ #include #include "core.h" #include "patch.h" +#include "state.h" #include "transition.h" /* @@ -1009,6 +1010,13 @@ int klp_enable_patch(struct klp_patch *patch) mutex_lock(&klp_mutex); + if (!klp_is_patch_compatible(patch)) { + pr_err("Livepatch patch (%s) is not compatible with the already installed livepatches.\n", + patch->mod->name); + mutex_unlock(&klp_mutex); + return -EINVAL; + } + ret = klp_init_patch_early(patch); if (ret) { mutex_unlock(&klp_mutex); diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 6ab15b642c0a..7ee19476de9d 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -9,6 +9,7 @@ #include #include "core.h" +#include "state.h" #include "transition.h" #define klp_for_each_state(patch, state) \ @@ -81,3 +82,38 @@ out: return last_state; } EXPORT_SYMBOL_GPL(klp_get_prev_state); + +/* Check if the patch is able to deal with the existing system state. 
*/ +static bool klp_is_state_compatible(struct klp_patch *patch, + struct klp_state *old_state) +{ + struct klp_state *state; + + state = klp_get_state(patch, old_state->id); + + /* A cumulative livepatch must handle all already modified states. */ + if (!state) + return !patch->replace; + + return state->version >= old_state->version; +} + +/* + * Check that the new livepatch will not break the existing system states. + * Cumulative patches must handle all already modified states. + * Non-cumulative patches can touch already modified states. + */ +bool klp_is_patch_compatible(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_state *old_state; + + klp_for_each_patch(old_patch) { + klp_for_each_state(old_patch, old_state) { + if (!klp_is_state_compatible(patch, old_state)) + return false; + } + } + + return true; +} diff --git a/kernel/livepatch/state.h b/kernel/livepatch/state.h new file mode 100644 index 000000000000..49d9c16e8762 --- /dev/null +++ b/kernel/livepatch/state.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIVEPATCH_STATE_H +#define _LIVEPATCH_STATE_H + +#include + +bool klp_is_patch_compatible(struct klp_patch *patch); + +#endif /* _LIVEPATCH_STATE_H */ -- cgit From f973cce0e4022fa8c969ca5b0c71559125382eb2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 21 Oct 2019 22:38:29 +0200 Subject: kexec: Fix pointer-to-int-cast warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two pointer-to-int-cast warnings when compiling for the 32-bit parisc platform: kernel/kexec_file.c: In function ‘crash_prepare_elf64_headers’: kernel/kexec_file.c:1307:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (Elf64_Addr)_text; ^ kernel/kexec_file.c:1324:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (unsigned long long) __va(mstart); ^ Signed-off-by: Helge Deller --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 79f252af7dee..a2df93948665 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1304,7 +1304,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, if (kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_vaddr = (unsigned long) _text; phdr->p_filesz = phdr->p_memsz = _end - _text; phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); ehdr->e_phnum++; @@ -1321,7 +1321,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_offset = mstart; phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_vaddr = (unsigned long) __va(mstart); phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; -- cgit From 64fe8c061de7d7ffdfdc104a57d48d645ae0ac4f Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 1 Nov 2019 12:03:44 +0100 Subject: xsk: Store struct xdp_sock as a flexible array member of the XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior this commit, the array storing XDP socket instances were stored in a separate allocated array of the XSKMAP. Now, we store the sockets as a flexible array member in a similar fashion as the arraymap. Doing so, we do less pointer chasing in the lookup. 
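The layout change boils down to the following sketch (field lists trimmed and the _old/_new suffixes added only for the comparison; the real structure also carries the flush list and lock shown in the diff below):

  /* before: two allocations, every lookup first chases m->xsk_map */
  struct xsk_map_old {
          struct bpf_map map;
          struct xdp_sock **xsk_map;      /* separately bpf_map_area_alloc()ed */
  };

  /* after: one allocation, the entries follow the map header directly */
  struct xsk_map_new {
          struct bpf_map map;
          struct xdp_sock *xsk_map[];     /* flexible array member */
  };

  /* header plus max_entries pointers, sized with overflow checking */
  size = struct_size(m, xsk_map, attr->max_entries);
  m = bpf_map_area_alloc(size, numa_node);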
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/20191101110346.15004-2-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 55 ++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 82a1ffe15dfa..edcbd863650e 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -11,9 +11,9 @@ struct xsk_map { struct bpf_map map; - struct xdp_sock **xsk_map; struct list_head __percpu *flush_list; spinlock_t lock; /* Synchronize map updates */ + struct xdp_sock *xsk_map[]; }; int xsk_map_inc(struct xsk_map *map) @@ -80,9 +80,10 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { + struct bpf_map_memory mem; + int cpu, err, numa_node; struct xsk_map *m; - int cpu, err; - u64 cost; + u64 cost, size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) return ERR_PTR(-EINVAL); - m = kzalloc(sizeof(*m), GFP_USER); - if (!m) + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); + + err = bpf_map_charge_init(&mem, cost); + if (err < 0) + return ERR_PTR(err); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } bpf_map_init_from_attr(&m->map, attr); + bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); - cost += sizeof(struct list_head) * num_possible_cpus(); - - /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_charge_init(&m->map.memory, cost); - if (err) - goto free_m; - - err = -ENOMEM; - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) - goto free_charge; + if (!m->flush_list) { + bpf_map_charge_finish(&m->map.memory); + bpf_map_area_free(m); + return ERR_PTR(-ENOMEM); + } for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - m->xsk_map = bpf_map_area_alloc(m->map.max_entries * - sizeof(struct xdp_sock *), - m->map.numa_node); - if (!m->xsk_map) - goto free_percpu; return &m->map; - -free_percpu: - free_percpu(m->flush_list); -free_charge: - bpf_map_charge_finish(&m->map.memory); -free_m: - kfree(m); - return ERR_PTR(err); } static void xsk_map_free(struct bpf_map *map) @@ -139,8 +131,7 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); free_percpu(m->flush_list); - bpf_map_area_free(m->xsk_map); - kfree(m); + bpf_map_area_free(m); } static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -- cgit From e65650f291ee72fc578d6346080fc4699204ef2c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 1 Nov 2019 12:03:45 +0100 Subject: bpf: Implement map_gen_lookup() callback for XSKMAP Inline the xsk_map_lookup_elem() via implementing the map_gen_lookup() callback. This results in emitting the bpf instructions in place of bpf_map_lookup_elem() helper call and better performance of bpf programs. 
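In C terms, the instruction sequence emitted by the new map_gen_lookup() callback (shown in the hunk below) is roughly equivalent to the following, i.e. the XSKMAP-specialized lookup body pasted straight into the calling program instead of a helper call; simplified sketch only:

  static struct xdp_sock *xskmap_lookup_inlined(struct xsk_map *m, u32 *key)
  {
          u32 index = *key;                       /* BPF_LDX_MEM           */

          if (index >= m->map.max_entries)        /* BPF_JMP_IMM (JGE)     */
                  return NULL;                    /* BPF_MOV64_IMM(ret, 0) */

          /* scale the index and add the offset of the embedded array */
          return m->xsk_map[index];               /* LSH + ADD + LDX       */
  }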
Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/20191101110346.15004-3-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index edcbd863650e..554939f78b83 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -163,6 +163,22 @@ struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) return xs; } +static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, struct xdp_sock *xs) { @@ -303,6 +319,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, -- cgit From d817991cc7486ab83f6c7188b0bc80eebee872f6 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 1 Nov 2019 12:03:46 +0100 Subject: xsk: Restructure/inline XSKMAP lookup/redirect/flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the XSKMAP entry lookup function used by the XDP redirect code is moved from the xskmap.c file to the xdp_sock.h header, so the lookup can be inlined from, e.g., the bpf_xdp_redirect_map() function. Further the __xsk_map_redirect() and __xsk_map_flush() is moved to the xsk.c, which lets the compiler inline the xsk_rcv() and xsk_flush() functions. Finally, all the XDP socket functions were moved from linux/bpf.h to net/xdp_sock.h, where most of the XDP sockets functions are anyway. This yields a ~2% performance boost for the xdpsock "rx_drop" scenario. 
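After the move, the lookup used on the XDP redirect fast path is a static inline in net/xdp_sock.h of roughly this shape (it is the function removed from xskmap.c in the hunk below), which lets bpf_xdp_redirect_map() and the flush/receive path inline it instead of calling across translation units:

  /* net/xdp_sock.h (sketch) */
  static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
  {
          struct xsk_map *m = container_of(map, struct xsk_map, map);

          if (key >= map->max_entries)
                  return NULL;

          return READ_ONCE(m->xsk_map[key]);
  }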
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191101110346.15004-4-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 554939f78b83..da16c30868f3 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -9,13 +9,6 @@ #include #include -struct xsk_map { - struct bpf_map map; - struct list_head __percpu *flush_list; - spinlock_t lock; /* Synchronize map updates */ - struct xdp_sock *xsk_map[]; -}; - int xsk_map_inc(struct xsk_map *map) { struct bpf_map *m = &map->map; @@ -151,18 +144,6 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; - - if (key >= map->max_entries) - return NULL; - - xs = READ_ONCE(m->xsk_map[key]); - return xs; -} - static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; @@ -179,35 +160,6 @@ static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); - int err; - - err = xsk_rcv(xs, xdp); - if (err) - return err; - - if (!xs->flush_node.prev) - list_add(&xs->flush_node, flush_list); - - return 0; -} - -void __xsk_map_flush(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); - struct xdp_sock *xs, *tmp; - - list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { - xsk_flush(xs); - __list_del_clearprev(&xs->flush_node); - } -} - static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); -- cgit From eb1b66887472eaa7342305b7890ae510dd9d1a79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:58 +0100 Subject: bpf: Make use of probe_user_write in probe write helper Convert the bpf_probe_write_user() helper to probe_user_write() such that writes are not attempted under KERNEL_DS anymore which is buggy as kernel and user space pointers can have overlapping addresses. Also, given we have the access_ok() check inside probe_user_write(), the helper doesn't need to do it twice. 
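For reference, the primitive the helper now relies on takes an explicit __user pointer and performs the access with the user address space selected, so a user pointer can no longer be misinterpreted as a kernel address on architectures with overlapping ranges. A minimal hedged usage sketch, assuming the probe_user_write() prototype from the uaccess series this builds on:

  #include <linux/uaccess.h>

  /* write one int to a user address from a context that must not fault */
  static long example_poke_user(int __user *uptr, int val)
  {
          /* 0 on success, -EFAULT if the user mapping is not accessible */
          return probe_user_write(uptr, &val, sizeof(val));
  }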
Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/841c461781874c07a0ee404a454c3bc0459eed30.1572649915.git.daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f50bf19f7a05..2d87fcdcb19b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -163,7 +163,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, +BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) { /* @@ -186,10 +186,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (!access_ok(unsafe_ptr, size)) - return -EPERM; - return probe_kernel_write(unsafe_ptr, src, size); + return probe_user_write(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { -- cgit From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). 
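A hedged sketch of how a tracing program would pick the right variant from BPF C once these helpers are available; the kprobe target, buffer size and libbpf-style macros (SEC(), PT_REGS_PARM2()) are assumptions for the illustration only:

  #include <linux/bpf.h>
  #include <linux/ptrace.h>
  #include "bpf_helpers.h"        /* libbpf helper/macro definitions (assumed) */

  SEC("kprobe/do_sys_open")
  int trace_open(struct pt_regs *ctx)
  {
          /* the second argument of do_sys_open() is a user-space filename */
          const char *ufilename = (const char *)PT_REGS_PARM2(ctx);
          char buf[64] = {};

          /* user pointer: use the _user variant, never plain bpf_probe_read() */
          bpf_probe_read_user_str(buf, sizeof(buf), ufilename);

          /* kernel pointers would use bpf_probe_read_kernel{,_str}() instead */
          return 0;
  }

  char _license[] SEC("license") = "GPL";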
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 181 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 135 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2d87fcdcb19b..ffc91d4935ac 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -138,24 +138,140 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) { - int ret; + int ret = probe_user_read(dst, unsafe_ptr, size); - ret = security_locked_down(LOCKDOWN_BPF_READ); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_user_proto = { + .func = bpf_probe_read_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_user_str_proto = { + .func = bpf_probe_read_user_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; - ret = probe_kernel_read(dst, unsafe_ptr, size); +static __always_inline int +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, + const bool compat) +{ + int ret = security_locked_down(LOCKDOWN_BPF_READ); + + if (unlikely(ret < 0)) + goto out; + ret = compat ? 
probe_kernel_read(dst, unsafe_ptr, size) : + probe_kernel_read_strict(dst, unsafe_ptr, size); if (unlikely(ret < 0)) out: memset(dst, 0, size); + return ret; +} + +BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); +} + +static const struct bpf_func_proto bpf_probe_read_kernel_proto = { + .func = bpf_probe_read_kernel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); +} +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static __always_inline int +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, + const bool compat) +{ + int ret = security_locked_down(LOCKDOWN_BPF_READ); + + if (unlikely(ret < 0)) + goto out; + /* + * The strncpy_from_unsafe_*() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read_*() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : + strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) +out: + memset(dst, 0, size); return ret; } -static const struct bpf_func_proto bpf_probe_read_proto = { - .func = bpf_probe_read, +BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); +} + +static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { + .func = bpf_probe_read_kernel_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); +} + +static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { + .func = bpf_probe_read_compat_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, @@ -583,41 +699,6 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - int ret; - - ret = security_locked_down(LOCKDOWN_BPF_READ); - if (ret < 0) - goto out; - - /* - * The strncpy_from_unsafe() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing - * arbitrary memory anyway similar to bpf_probe_read() and might - * as well probe the stack. Thus, memory is explicitly cleared - * only in error case, so that improper users ignoring return - * code altogether don't copy garbage; otherwise length of string - * is returned that can be used for bpf_perf_event_output() et al. 
- */ - ret = strncpy_from_unsafe(dst, unsafe_ptr, size); - if (unlikely(ret < 0)) -out: - memset(dst, 0, size); - - return ret; -} - -static const struct bpf_func_proto bpf_probe_read_str_proto = { - .func = bpf_probe_read_str, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@ -697,8 +778,6 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; - case BPF_FUNC_probe_read: - return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_tail_call: @@ -725,8 +804,18 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_compat_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_probe_read_str: - return &bpf_probe_read_str_proto; + return &bpf_probe_read_compat_str_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; -- cgit From 6e07a6341277a1dbdf5ed0c41921033c234c1635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:18:00 +0100 Subject: bpf: Switch BPF probe insns to bpf_probe_read_kernel Commit 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") explicitly states that the pointer to BTF object is a pointer to a kernel object or NULL. Therefore we should also switch to using the strict kernel probe helper which is restricted to kernel addresses only when architectures have non-overlapping address spaces. 
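Concretely, the loads in question are the BPF_PROBE_MEM instructions the verifier emits for direct dereferences of BTF pointers; a hedged fragment of the pattern (names and context illustrative, as in a BTF-enabled raw tracepoint program):

	/* 'next' is a PTR_TO_BTF_ID pointer; the plain C dereference below is
	 * rewritten by the verifier into a BPF_PROBE_MEM load, which the
	 * interpreter now services with bpf_probe_read_kernel() instead of
	 * the legacy compat probe. */
	struct task_struct *next = (struct task_struct *)ctx[2];
	int next_pid = next->pid;

Since such pointers are by definition kernel objects (or NULL), the strict kernel probe is the right primitive.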
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/d2b90827837685424a4b8008dfe0460558abfada.1572649915.git.daniel@iogearbox.net --- kernel/bpf/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8d3fbc86ca5e..df82d5a42b23 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1306,11 +1306,12 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON -u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr) +u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) { memset(dst, 0, size); return -EFAULT; } + /** * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1566,9 +1567,9 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST -#define LDX_PROBE(SIZEOP, SIZE) \ - LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read(&DST, SIZE, (const void *)(long) SRC); \ +#define LDX_PROBE(SIZEOP, SIZE) \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) SRC); \ CONT; LDX_PROBE(B, 1) LDX_PROBE(H, 2) -- cgit From 731dc9df975a5da21237a18c3384f811a7a41cc6 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 4 Nov 2019 12:22:02 +0100 Subject: cpu/speculation: Uninline and export CPU mitigations helpers A kernel module may need to check the value of the "mitigations=" kernel command line parameter as part of its setup when the module needs to perform software mitigations for a CPU flaw. Uninline and export the helper functions surrounding the cpu_mitigations enum to allow for their usage from a module. Lastly, privatize the enum and cpu_mitigations variable since the value of cpu_mitigations can be checked with the exported helper functions. Signed-off-by: Tyler Hicks Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index fc28e17940e0..e2cad3ee2ead 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2373,7 +2373,18 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. + */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +static enum cpu_mitigations cpu_mitigations __ro_after_init = + CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -2390,3 +2401,17 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } early_param("mitigations", mitigations_parse_cmdline); + +/* mitigations=off */ +bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_off); + +/* mitigations=auto,nosmt */ +bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); -- cgit From 7162431dcf72032835d369c8d7b51311df407938 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Oct 2019 13:33:13 +0200 Subject: ftrace: Introduce PERMANENT ftrace_ops flag Livepatch uses ftrace for redirection to new patched functions. 
It means that if ftrace is disabled, all live patched functions are disabled as well. Toggling global 'ftrace_enabled' sysctl thus affect it directly. It is not a problem per se, because only administrator can set sysctl values, but it still may be surprising. Introduce PERMANENT ftrace_ops flag to amend this. If the FTRACE_OPS_FL_PERMANENT is set on any ftrace ops, the tracing cannot be disabled by disabling ftrace_enabled. Equally, a callback with the flag set cannot be registered if ftrace_enabled is disabled. Link: http://lkml.kernel.org/r/20191016113316.13415-2-mbenes@suse.cz Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index bd43537702bd..b552cf2d85f8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func) ops->fops.func = klp_ftrace_handler; ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; + FTRACE_OPS_FL_IPMODIFY | + FTRACE_OPS_FL_PERMANENT; list_add(&ops->node, &klp_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..89e9128652ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) ops->flags |= FTRACE_OPS_FL_SAVE_REGS; #endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -6754,6 +6756,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -6771,8 +6785,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = !!ftrace_enabled; - if (ftrace_enabled) { /* we are starting ftrace again */ @@ -6783,12 +6795,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; ftrace_shutdown_sysctl(); } + last_ftrace_enabled = !!ftrace_enabled; out: mutex_unlock(&ftrace_lock); return ret; -- cgit From b0c51f158455e31d5024100cf3580fcd88214b0e Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 30 Oct 2019 08:25:45 +0100 Subject: stacktrace: Don't skip first entry on noncurrent tasks When doing cat /proc//stack, the output is missing the first entry. When the current code walks the stack starting in stack_trace_save_tsk, it skips all scheduler functions (that's OK) plus one more function. But this one function should be skipped only for the 'current' task as it is stack_trace_save_tsk proper. 
The original code (before the common infrastructure) skipped one function only for the 'current' task -- see save_stack_trace_tsk before 3599fe12a125. So do so also in the new infrastructure now. Fixes: 214d8ca6ee85 ("stacktrace: Provide common infrastructure") Signed-off-by: Jiri Slaby Signed-off-by: Thomas Gleixner Tested-by: Michal Suchanek Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20191030072545.19462-1-jslaby@suse.cz --- kernel/stacktrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 6d1f68b7e528..c9ea7eb2cb1a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -141,7 +141,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, struct stacktrace_cookie c = { .store = store, .size = size, - .skip = skipnr + 1, + /* skip this function if they are tracing us */ + .skip = skipnr + !!(current == tsk), }; if (!try_get_task_stack(tsk)) @@ -298,7 +299,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, struct stack_trace trace = { .entries = store, .max_entries = size, - .skip = skipnr + 1, + /* skip this function if they are tracing us */ + .skip = skipnr + !!(current == task), }; save_stack_trace_tsk(task, &trace); -- cgit From 52338415cf4d4064ae6b8dd972dadbda841da4fa Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 24 Oct 2019 11:28:29 +0800 Subject: timekeeping/vsyscall: Update VDSO data unconditionally The update of the VDSO data is depending on __arch_use_vsyscall() returning True. This is a leftover from the attempt to map the features of various architectures 1:1 into generic code. The usage of __arch_use_vsyscall() in the actual vsyscall implementations got dropped and replaced by the requirement for the architecture code to return U64_MAX if the global clocksource is not usable in the VDSO. But the __arch_use_vsyscall() check in the update code stayed which causes the VDSO data to be stale or invalid when an architecture actually implements that function and returns False when the current clocksource is not usable in the VDSO. As a consequence the VDSO implementations of clock_getres(), time(), clock_gettime(CLOCK_.*_COARSE) operate on invalid data and return bogus information. Remove the __arch_use_vsyscall() check from the VDSO update function and update the VDSO data unconditionally. 
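The contract mentioned above is implemented on the architecture side of the generic vDSO library; a hedged sketch (the function name comes from the generic vDSO headers, the body is illustrative only):

	static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
	{
		/* if the current clocksource cannot be read from the vDSO,
		 * report U64_MAX so the generic code falls back to the syscall */
		if (clock_mode == ARCH_CLOCK_NONE)	/* hypothetical arch constant */
			return U64_MAX;

		return arch_read_cycle_counter();	/* hypothetical counter read */
	}

With that contract in place, updating the VDSO data unconditionally is safe even when the current clocksource is not vDSO-capable.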
[ tglx: Massaged changelog and removed the now useless implementations in asm-generic/ARM64/MIPS ] Fixes: 44f57d788e7deecb50 ("timekeeping: Provide a generic update_vsyscall() implementation") Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Vincenzo Frascino Cc: Arnd Bergmann Cc: Paul Burton Cc: linux-mips@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1571887709-11447-1-git-send-email-chenhc@lemote.com --- kernel/time/vsyscall.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 4bc37ac3bb05..5ee0f7709410 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -110,8 +110,7 @@ void update_vsyscall(struct timekeeper *tk) nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); - if (__arch_use_vsyscall(vdata)) - update_vdso_data(vdata, tk); + update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); @@ -124,10 +123,8 @@ void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - if (__arch_use_vsyscall(vdata)) { - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; - } + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } -- cgit From 0ed9ca25894ef673d0259e4bd312d5fa1b9a6591 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 19 Oct 2019 17:07:27 +0800 Subject: irq/irqdomain: Update __irq_domain_alloc_fwnode() function documentation A recent commit changed a parameter of __irq_domain_alloc_fwnode(), but did not update the documentation comment. Fix it up. Fixes: b977fcf477c1 ("irqdomain/debugfs: Use PAs to generate fwnode names") Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571476047-29463-1-git-send-email-wang.yi59@zte.com.cn --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 132672b74e4b..dd822fd8a7d5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * @type: Type of irqchip_fwnode. See linux/irqdomain.h * @name: Optional user provided domain name * @id: Optional user provided id if name != NULL - * @data: Optional user-provided data + * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a poiner to the embedded * fwnode_handle (or NULL on failure). -- cgit From fa729c4df558936b4a1a7b3e2234011f44ede28b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 31 Oct 2019 12:36:08 +0100 Subject: clone3: validate stack arguments Validate the stack arguments and setup the stack depening on whether or not it is growing down or up. Legacy clone() required userspace to know in which direction the stack is growing and pass down the stack pointer appropriately. To make things more confusing microblaze uses a variant of the clone() syscall selected by CONFIG_CLONE_BACKWARDS3 that takes an additional stack_size argument. IA64 has a separate clone2() syscall which also takes an additional stack_size argument. Finally, parisc has a stack that is growing upwards. 
Userspace therefore has a lot nasty code like the following: #define __STACK_SIZE (8 * 1024 * 1024) pid_t sys_clone(int (*fn)(void *), void *arg, int flags, int *pidfd) { pid_t ret; void *stack; stack = malloc(__STACK_SIZE); if (!stack) return -ENOMEM; #ifdef __ia64__ ret = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, pidfd); #elif defined(__parisc__) /* stack grows up */ ret = clone(fn, stack, flags | SIGCHLD, arg, pidfd); #else ret = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, pidfd); #endif return ret; } or even crazier variants such as [3]. With clone3() we have the ability to validate the stack. We can check that when stack_size is passed, the stack pointer is valid and the other way around. We can also check that the memory area userspace gave us is fine to use via access_ok(). Furthermore, we probably should not require userspace to know in which direction the stack is growing. It is easy for us to do this in the kernel and I couldn't find the original reasoning behind exposing this detail to userspace. /* Intentional user visible API change */ clone3() was released with 5.3. Currently, it is not documented and very unclear to userspace how the stack and stack_size argument have to be passed. After talking to glibc folks we concluded that trying to change clone3() to setup the stack instead of requiring userspace to do this is the right course of action. Note, that this is an explicit change in user visible behavior we introduce with this patch. If it breaks someone's use-case we will revert! (And then e.g. place the new behavior under an appropriate flag.) Breaking someone's use-case is very unlikely though. First, neither glibc nor musl currently expose a wrapper for clone3(). Second, there is no real motivation for anyone to use clone3() directly since it does not provide features that legacy clone doesn't. New features for clone3() will first happen in v5.5 which is why v5.4 is still a good time to try and make that change now and backport it to v5.3. Searches on [4] did not reveal any packages calling clone3(). [1]: https://lore.kernel.org/r/CAG48ez3q=BeNcuVTKBN79kJui4vC6nw0Bfq6xc-i0neheT17TA@mail.gmail.com [2]: https://lore.kernel.org/r/20191028172143.4vnnjpdljfnexaq5@wittgenstein [3]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/raw-clone.h#L31 [4]: https://codesearch.debian.net Fixes: 7f192e3cd316 ("fork: add clone3") Cc: Kees Cook Cc: Jann Horn Cc: David Howells Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Florian Weimer Cc: Peter Zijlstra Cc: linux-api@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 5.3 Cc: GNU C Library Signed-off-by: Christian Brauner Acked-by: Arnd Bergmann Acked-by: Aleksa Sarai Link: https://lore.kernel.org/r/20191031113608.20713-1-christian.brauner@ubuntu.com --- kernel/fork.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..55af6931c6ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2561,7 +2561,35 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, return 0; } -static bool clone3_args_valid(const struct kernel_clone_args *kargs) +/** + * clone3_stack_valid - check and prepare stack + * @kargs: kernel clone args + * + * Verify that the stack arguments userspace gave us are sane. + * In addition, set the stack direction for userspace since it's easy for us to + * determine. 
+ */ +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) +{ + if (kargs->stack == 0) { + if (kargs->stack_size > 0) + return false; + } else { + if (kargs->stack_size == 0) + return false; + + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) + return false; + +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) + kargs->stack += kargs->stack_size; +#endif + } + + return true; +} + +static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* * All lower bits of the flag word are taken. @@ -2581,6 +2609,9 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) kargs->exit_signal) return false; + if (!clone3_stack_valid(kargs)) + return false; + return true; } -- cgit From fbf6c73c5b264c25484fa9f449b5546569fe11f0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 17:51:10 +0100 Subject: ftrace: add ftrace_init_nop() Architectures may need to perform special initialization of ftrace callsites, and today they do so by special-casing ftrace_make_nop() when the expected branch address is MCOUNT_ADDR. In some cases (e.g. for patchable-function-entry), we don't have an mcount-like symbol and don't want a synthetic MCOUNT_ADDR, but we may need to perform some initialization of callsites. To make it possible to separate initialization from runtime modification, and to handle cases without an mcount-like symbol, this patch adds an optional ftrace_init_nop() function that architectures can implement, which does not pass a branch address. Where an architecture does not provide ftrace_init_nop(), we will fall back to the existing behaviour of calling ftrace_make_nop() with MCOUNT_ADDR. At the same time, ftrace_code_disable() is renamed to ftrace_nop_initialize() to make it clearer that it is intended to intialize a callsite into a disabled state, and is not for disabling a callsite that has been runtime enabled. The kerneldoc description of rec arguments is updated to cover non-mcount callsites. Signed-off-by: Mark Rutland Reviewed-by: Amit Daniel Kachhap Reviewed-by: Ard Biesheuvel Reviewed-by: Miroslav Benes Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..5259d4dea675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2494,14 +2494,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) } static int -ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) +ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) { int ret; if (unlikely(ftrace_disabled)) return 0; - ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + ret = ftrace_init_nop(mod, rec); if (ret) { ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); @@ -2943,7 +2943,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) * to the NOP instructions. 
*/ if (!__is_defined(CC_USING_NOP_MCOUNT) && - !ftrace_code_disable(mod, p)) + !ftrace_nop_initialize(mod, p)) break; update_cnt++; -- cgit From a1326b17ac03a9012cb3d01e434aacb4d67a416c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 18:17:11 +0100 Subject: module/ftrace: handle patchable-function-entry When using patchable-function-entry, the compiler will record the callsites into a section named "__patchable_function_entries" rather than "__mcount_loc". Let's abstract this difference behind a new FTRACE_CALLSITE_SECTION, so that architectures don't have to handle this explicitly (e.g. with custom module linker scripts). As parisc currently handles this explicitly, it is fixed up accordingly, with its custom linker script removed. Since FTRACE_CALLSITE_SECTION is only defined when DYNAMIC_FTRACE is selected, the parisc module loading code is updated to only use the definition in that case. When DYNAMIC_FTRACE is not selected, modules shouldn't have this section, so this removes some redundant work in that case. To make sure that this is keep up-to-date for modules and the main kernel, a comment is added to vmlinux.lds.h, with the existing ifdeffery simplified for legibility. I built parisc generic-{32,64}bit_defconfig with DYNAMIC_FTRACE enabled, and verified that the section made it into the .ko files for modules. Signed-off-by: Mark Rutland Acked-by: Helge Deller Acked-by: Steven Rostedt (VMware) Reviewed-by: Ard Biesheuvel Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar Cc: James E.J. Bottomley Cc: Jessica Yu Cc: linux-parisc@vger.kernel.org --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..acf7962936c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3222,7 +3222,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(info, "__mcount_loc", + mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif -- cgit From 1bb5ec2eec48dcab1d8ae3707e4a388da6a9c9dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2019 12:49:57 -0800 Subject: cgroup: use cgroup->last_bstat instead of cgroup->bstat_pending for consistency cgroup->bstat_pending is used to determine the base stat delta to propagate to the parent. While correct, this is different from how percpu delta is determined for no good reason and the inconsistency makes the code more difficult to understand. This patch makes parent propagation delta calculation use the same method as percpu to global propagation. * cgroup_base_stat_accumulate() is renamed to cgroup_base_stat_add() and cgroup_base_stat_sub() is added. * percpu propagation calculation is updated to use the above helpers. * cgroup->bstat_pending is replaced with cgroup->last_bstat and updated to use the same calculation as percpu propagation. 
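Both propagation steps therefore reduce to the same three-line pattern; a minimal sketch (one step shown, using the helpers introduced by this patch):

	/* delta = cur - last; dst += delta; last += delta */
	delta = cur;
	cgroup_base_stat_sub(&delta, &last);
	cgroup_base_stat_add(&dst, &delta);
	cgroup_base_stat_add(&last, &delta);

For example (made-up numbers), if a CPU's utime currently reads 70 and last holds 50, the per-cpu pass adds 20 to the cgroup's global stat and advances last to 70; the parent pass then does the identical computation with the cgroup's own bstat/last_bstat pair.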
Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ca19b4c8acf5..b48b22d4deb6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void) * Functions for cgroup basic resource statistics implemented on top of * rstat. */ -static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, - struct cgroup_base_stat *src_bstat) +static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } +static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime -= src_bstat->cputime.utime; + dst_bstat->cputime.stime -= src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +} + static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; - struct task_cputime cputime; - struct cgroup_base_stat delta; + struct cgroup_base_stat cur, delta; unsigned seq; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cputime = rstatc->bstat.cputime; + cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* calculate the delta to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); - memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_base_stat_accumulate(&cgrp->bstat, &delta); - if (parent) - cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); + /* propagate percpu delta to global */ + delta = cur; + cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(&rstatc->last_bstat, &delta); + + /* propagate global delta to parent */ + if (parent) { + delta = cgrp->bstat; + cgroup_base_stat_sub(&delta, &cgrp->last_bstat); + cgroup_base_stat_add(&parent->bstat, &delta); + cgroup_base_stat_add(&cgrp->last_bstat, &delta); + } } static struct cgroup_rstat_cpu * -- cgit From 56144737e67329c9aaed15f942d46a6302e2e3d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 09:48:04 -0800 Subject: hrtimer: Annotate lockless access to timer->state syzbot reported various data-race caused by hrtimer_is_queued() reading timer->state. A READ_ONCE() is required there to silence the warning. Also add the corresponding WRITE_ONCE() when timer->state is set. In remove_hrtimer() the hrtimer_is_queued() helper is open coded to avoid loading timer->state twice. 
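The reader side these stores pair with lives in include/linux/hrtimer.h, which is outside the 'kernel' directory filter of this log; roughly, hrtimer_is_queued() becomes:

	static inline bool hrtimer_is_queued(struct hrtimer *timer)
	{
		/* pairs with the WRITE_ONCE()s in enqueue_hrtimer() and
		 * __remove_hrtimer() */
		return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED);
	}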
KCSAN reported these cases: BUG: KCSAN: data-race in __remove_hrtimer / tcp_pacing_check write to 0xffff8880b2a7d388 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 run_ksoftirqd+0x46/0x60 kernel/softirq.c:603 smpboot_thread_fn+0x37d/0x4a0 kernel/smpboot.c:165 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b2a7d388 of 1 bytes by task 24652 on cpu 1: tcp_pacing_check net/ipv4/tcp_output.c:2235 [inline] tcp_pacing_check+0xba/0x130 net/ipv4/tcp_output.c:2225 tcp_xmit_retransmit_queue+0x32c/0x5a0 net/ipv4/tcp_output.c:3044 tcp_xmit_recovery+0x7c/0x120 net/ipv4/tcp_input.c:3558 tcp_ack+0x17b6/0x3170 net/ipv4/tcp_input.c:3717 tcp_rcv_established+0x37e/0xf50 net/ipv4/tcp_input.c:5696 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 BUG: KCSAN: data-race in __remove_hrtimer / __tcp_ack_snd_check write to 0xffff8880a3a65588 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 read to 0xffff8880a3a65588 of 1 bytes by task 22891 on cpu 1: __tcp_ack_snd_check+0x415/0x4f0 net/ipv4/tcp_input.c:5265 tcp_ack_snd_check net/ipv4/tcp_input.c:5287 [inline] tcp_rcv_established+0x750/0xf50 net/ipv4/tcp_input.c:5708 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 __sys_sendto+0x21f/0x320 net/socket.c:1952 __do_sys_sendto net/socket.c:1964 [inline] __se_sys_sendto net/socket.c:1960 [inline] __x64_sys_sendto+0x89/0xb0 net/socket.c:1960 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24652 Comm: syz-executor.3 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ tglx: Added comments ] Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191106174804.74723-1-edumazet@google.com --- kernel/time/hrtimer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) 
(limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..7f31932216a1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -966,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -988,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1013,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* -- cgit From d0fbb51dfaa612f960519b798387be436e8f83c5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Nov 2019 12:15:36 +0300 Subject: bpf, offload: Unlock on error in bpf_offload_dev_create() We need to drop the bpf_devs_lock on error before returning. Fixes: 9fd7c5559165 ("bpf: offload: aggregate offloads per-device") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191104091536.GB31509@mwanda --- kernel/bpf/offload.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ba635209ae9a..5b9da0954a27 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -678,8 +678,10 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) down_write(&bpf_devs_lock); if (!offdevs_inited) { err = rhashtable_init(&offdevs, &offdevs_params); - if (err) + if (err) { + up_write(&bpf_devs_lock); return ERR_PTR(err); + } offdevs_inited = true; } up_write(&bpf_devs_lock); -- cgit From 85d31dd07002315c0d1816543fdc392c8cc6b73a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 6 Nov 2019 17:46:40 -0800 Subject: bpf: Account for insn->off when doing bpf_probe_read_kernel In the bpf interpreter mode, bpf_probe_read_kernel is used to read from PTR_TO_BTF_ID's kernel object. It currently missed considering the insn->off. This patch fixes it. 
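For illustration, the case being fixed is a BTF pointer load with a non-zero offset, e.g. the (pseudo) BPF instruction

	r0 = *(u32 *)(r1 + 16)		/* BPF_LDX, insn->off == 16 */

where r1 is PTR_TO_BTF_ID: the interpreter has to probe SRC + insn->off (here SRC + 16) rather than SRC itself, which is exactly what the one-line change below does.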
Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191107014640.384083-1-kafai@fb.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 97e37d82a1cc..c1fde0303280 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1569,7 +1569,7 @@ out: #undef LDST #define LDX_PROBE(SIZEOP, SIZE) \ LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) SRC); \ + bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \ CONT; LDX_PROBE(B, 1) LDX_PROBE(H, 2) -- cgit From 3f6ec871e1c2b360aaf022e90bb99dcc016b3874 Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Mon, 21 Oct 2019 17:45:12 +0530 Subject: cpufreq: Initialize the governors in core_initcall Initialize the cpufreq governors earlier to allow for earlier performance control during the boot process. Signed-off-by: Amit Kucheria Acked-by: Viresh Kumar Reviewed-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/b98eae9b44eb2f034d7f5d12a161f5f831be1eb7.1571656015.git.amit.kucheria@linaro.org --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 86800b4d5453..322ca8860f54 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -915,7 +915,7 @@ static int __init sugov_register(void) { return cpufreq_register_governor(&schedutil_gov); } -fs_initcall(sugov_register); +core_initcall(sugov_register); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; -- cgit From 742e8cd3e1ba6f19cad6d912f8d469df5557d0fd Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Wed, 30 Oct 2019 16:18:10 +0800 Subject: cgroup: freezer: don't change task and cgroups status unnecessarily It's not necessary to adjust the task state and revisit the state of source and destination cgroups if the cgroups are not in freeze state and the task itself is not frozen. And in this scenario, it wakes up the task who's not supposed to be ready to run. Don't do the unnecessary task state adjustment can help stop waking up the task without a reason. Signed-off-by: Honglei Wang Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/freezer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 8cf010680678..3984dd6b8ddb 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -230,6 +230,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task, if (task->flags & PF_KTHREAD) return; + /* + * It's not necessary to do changes if both of the src and dst cgroups + * are not freezing and task is not frozen. + */ + if (!test_bit(CGRP_FREEZE, &src->flags) && + !test_bit(CGRP_FREEZE, &dst->flags) && + !task->frozen) + return; + /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not -- cgit From acaade1af3587132e7ea585f470a95261e14f60c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:09 +0100 Subject: dma-direct: remove __dma_direct_free_pages We can just call dma_free_contiguous directly instead of wrapping it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 11 +++-------- kernel/dma/remap.c | 4 ++-- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..a7a2739fb747 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -153,7 +153,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. */ dev_info(dev, "Rejecting highmem page from CMA.\n"); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return NULL; } @@ -175,11 +175,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return ret; } -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) -{ - dma_free_contiguous(dev, page, size); -} - void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { @@ -188,7 +183,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, cpu_addr); + dma_free_contiguous(dev, cpu_addr, size); return; } @@ -198,7 +193,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && dma_alloc_need_uncached(dev, attrs)) cpu_addr = cached_kernel_address(cpu_addr); - __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); + dma_free_contiguous(dev, virt_to_page(cpu_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index c00b9258fa6a..fb1e50c2d48a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -238,7 +238,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return ret; } @@ -256,7 +256,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, struct page *page = pfn_to_page(__phys_to_pfn(phys)); vunmap(vaddr); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); } } -- cgit From 4e1003aa56a7d60ddb048e43a7a51368fcfe36af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:32 +0100 Subject: dma-direct: remove the dma_handle argument to __dma_direct_alloc_pages The argument isn't used anywhere, so stop passing it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 4 ++-- kernel/dma/remap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a7a2739fb747..724c282dd943 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -83,7 +83,7 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) + gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); @@ -131,7 +131,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index fb1e50c2d48a..90d5ce77c189 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -226,7 +226,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, goto done; } - page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + page = __dma_direct_alloc_pages(dev, size, flags, attrs); if (!page) return NULL; -- cgit From 7e3617a72df32341fea6d226cd6bb21de40c558d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 7 Nov 2019 10:09:03 -0800 Subject: bpf: Add array support to btf_struct_access This patch adds array support to btf_struct_access(). It supports array of int, array of struct and multidimensional array. It also allows using u8[] as a scratch space. For example, it allows access the "char cb[48]" with size larger than the array's element "char". Another potential use case is "u64 icsk_ca_priv[]" in the tcp congestion control. btf_resolve_size() is added to resolve the size of any type. It will follow the modifier if there is any. Please see the function comment for details. This patch also adds the "off < moff" check at the beginning of the for loop. It is to reject cases when "off" is pointing to a "hole" in a struct. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191107180903.4097702-1-kafai@fb.com --- kernel/bpf/btf.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 128d89601d73..4639c4ba9a9b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1036,6 +1036,82 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; } +/* Resolve the size of a passed-in "type" + * + * type: is an array (e.g. u32 array[x][y]) + * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY, + * *type_size: (x * y * sizeof(u32)). Hence, *type_size always + * corresponds to the return type. + * *elem_type: u32 + * *total_nelems: (x * y). Hence, individual elem size is + * (*type_size / *total_nelems) + * + * type: is not an array (e.g. 
const struct X) + * return type: type "struct X" + * *type_size: sizeof(struct X) + * *elem_type: same as return type ("struct X") + * *total_nelems: 1 + */ +static const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *total_nelems) +{ + const struct btf_type *array_type = NULL; + const struct btf_array *array; + u32 i, size, nelems = 1; + + for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { + switch (BTF_INFO_KIND(type->info)) { + /* type->size can be used */ + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + size = type->size; + goto resolved; + + case BTF_KIND_PTR: + size = sizeof(void *); + goto resolved; + + /* Modifiers */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + type = btf_type_by_id(btf, type->type); + break; + + case BTF_KIND_ARRAY: + if (!array_type) + array_type = type; + array = btf_type_array(type); + if (nelems && array->nelems > U32_MAX / nelems) + return ERR_PTR(-EINVAL); + nelems *= array->nelems; + type = btf_type_by_id(btf, array->type); + break; + + /* type without size */ + default: + return ERR_PTR(-EINVAL); + } + } + + return ERR_PTR(-EINVAL); + +resolved: + if (nelems && size > U32_MAX / nelems) + return ERR_PTR(-EINVAL); + + *type_size = nelems * size; + *total_nelems = nelems; + *elem_type = type; + + return array_type ? : type; +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) @@ -3494,10 +3570,10 @@ int btf_struct_access(struct bpf_verifier_log *log, enum bpf_access_type atype, u32 *next_btf_id) { + u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; + const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const struct btf_type *mtype; const char *tname, *mname; - int i, moff = 0, msize; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3507,40 +3583,88 @@ again: } for_each_member(i, t, member) { - /* offset of the field in bits */ - moff = btf_member_bit_offset(t, member); - if (btf_member_bitfield_size(t, member)) /* bitfields are not supported yet */ continue; - if (off + size <= moff / 8) + /* offset of the field in bytes */ + moff = btf_member_bit_offset(t, member) / 8; + if (off + size <= moff) /* won't find anything, field is already too far */ break; + /* In case of "off" is pointing to holes of a struct */ + if (off < moff) + continue; /* type of the field */ mtype = btf_type_by_id(btf_vmlinux, member->type); mname = __btf_name_by_offset(btf_vmlinux, member->name_off); - /* skip modifiers */ - while (btf_type_is_modifier(mtype)) - mtype = btf_type_by_id(btf_vmlinux, mtype->type); - - if (btf_type_is_array(mtype)) - /* array deref is not supported yet */ - continue; - - if (!btf_type_has_size(mtype) && !btf_type_is_ptr(mtype)) { + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + &elem_type, &total_nelems); + if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; } - if (btf_type_is_ptr(mtype)) - msize = 8; - else - msize = mtype->size; - if (off >= moff / 8 + msize) + + mtrue_end = moff + msize; + if (off >= mtrue_end) /* no overlap with member, keep iterating */ continue; + + if (btf_type_is_array(mtype)) { + u32 elem_idx; + + /* btf_resolve_size() above helps to + * linearize a multi-dimensional array. 
+ * + * The logic here is treating an array + * in a struct as the following way: + * + * struct outer { + * struct inner array[2][2]; + * }; + * + * looks like: + * + * struct outer { + * struct inner array_elem0; + * struct inner array_elem1; + * struct inner array_elem2; + * struct inner array_elem3; + * }; + * + * When accessing outer->array[1][0], it moves + * moff to "array_elem2", set mtype to + * "struct inner", and msize also becomes + * sizeof(struct inner). Then most of the + * remaining logic will fall through without + * caring the current member is an array or + * not. + * + * Unlike mtype/msize/moff, mtrue_end does not + * change. The naming difference ("_true") tells + * that it is not always corresponding to + * the current mtype/msize/moff. + * It is the true end of the current + * member (i.e. array in this case). That + * will allow an int array to be accessed like + * a scratch space, + * i.e. allow access beyond the size of + * the array's element as long as it is + * within the mtrue_end boundary. + */ + + /* skip empty array */ + if (moff == mtrue_end) + continue; + + msize /= total_nelems; + elem_idx = (off - moff) / msize; + moff += elem_idx * msize; + mtype = elem_type; + } + /* the 'off' we're looking for is either equal to start * of this field or inside of this struct */ @@ -3549,20 +3673,20 @@ again: t = mtype; /* adjust offset we're looking for */ - off -= moff / 8; + off -= moff; goto again; } - if (msize != size) { - /* field access size doesn't match */ - bpf_log(log, - "cannot access %d bytes in struct %s field %s that has size %d\n", - size, tname, mname, msize); - return -EACCES; - } if (btf_type_is_ptr(mtype)) { const struct btf_type *stype; + if (msize != size || off != moff) { + bpf_log(log, + "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n", + mname, moff, tname, off, size); + return -EACCES; + } + stype = btf_type_by_id(btf_vmlinux, mtype->type); /* skip modifiers */ while (btf_type_is_modifier(stype)) @@ -3572,7 +3696,20 @@ again: return PTR_TO_BTF_ID; } } - /* all other fields are treated as scalars */ + + /* Allow more flexible access within an int as long as + * it is within mtrue_end. + * Since mtrue_end could be the end of an array, + * that also allows using an array of int as a scratch + * space. e.g. skb->cb[]. + */ + if (off + size > mtrue_end) { + bpf_log(log, + "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", + mname, mtrue_end, tname, off, size); + return -EACCES; + } + return SCALAR_VALUE; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); -- cgit From 714641c3670cdc75371a7ff5bdfd5e9a170c7ffd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:25:46 -0500 Subject: ftrace: Separate out the copying of a ftrace_hash from __ftrace_hash_move() Most of the functionality of __ftrace_hash_move() can be reused, but not all of it. That is, __ftrace_hash_move() is used to simply make a new hash from an existing one, using the same size as the original. Creating a dup_hash(), where we can specify a new size will be useful when we want to create a hash with a default size, or simply copy the old one. 
Signed-off-by: Steven Rostedt (VMWare) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89e9128652ef..76e5de8c7822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1372,23 +1372,15 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash * -__ftrace_hash_move(struct ftrace_hash *src) +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; - struct hlist_node *tn; - struct hlist_head *hhd; struct ftrace_hash *new_hash; - int size = src->count; + struct hlist_head *hhd; + struct hlist_node *tn; int bits = 0; int i; - /* - * If the new source is empty, just return the empty_hash. - */ - if (ftrace_hash_empty(src)) - return EMPTY_HASH; - /* * Make the hash size about 1/2 the # found */ @@ -1413,10 +1405,23 @@ __ftrace_hash_move(struct ftrace_hash *src) __add_hash_entry(new_hash, entry); } } - return new_hash; } +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. + */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) -- cgit From 7e16f581a81759bafea04d049134b32d1a881226 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:26:46 -0500 Subject: ftrace: Separate out functionality from ftrace_location_range() Create a new function called lookup_rec() from the functionality of ftrace_location_range(). The difference between lookup_rec() is that it returns the record that it finds, where as ftrace_location_range() returns only if it found a match or not. The lookup_rec() is static, and can be used for new functionality where ftrace needs to find a record of a specific address. 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76e5de8c7822..b0e7f03919de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,26 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + } + return rec; +} + /** * ftrace_location_range - return the first address of a traced location * if it touches the given ip range @@ -1555,23 +1575,11 @@ static int ftrace_cmp_recs(const void *a, const void *b) */ unsigned long ftrace_location_range(unsigned long start, unsigned long end) { - struct ftrace_page *pg; struct dyn_ftrace *rec; - struct dyn_ftrace key; - key.ip = start; - key.flags = end; /* overload flags, as it is unsigned long */ - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || - start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) - continue; - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return rec->ip; - } + rec = lookup_rec(start, end); + if (rec) + return rec->ip; return 0; } -- cgit From e3b8b6a0d12cccf772113d6b5c1875192186fbd4 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 5 Nov 2019 11:22:12 +0000 Subject: sched/core: Fix compilation error when cgroup not selected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cgroup is disabled the following compilation error was hit kernel/sched/core.c: In function ‘uclamp_update_active_tasks’: kernel/sched/core.c:1081:23: error: storage size of ‘it’ isn’t known struct css_task_iter it; ^~ kernel/sched/core.c:1084:2: error: implicit declaration of function ‘css_task_iter_start’; did you mean ‘__sg_page_iter_start’? [-Werror=implicit-function-declaration] css_task_iter_start(css, 0, &it); ^~~~~~~~~~~~~~~~~~~ __sg_page_iter_start kernel/sched/core.c:1085:14: error: implicit declaration of function ‘css_task_iter_next’; did you mean ‘__sg_page_iter_next’? [-Werror=implicit-function-declaration] while ((p = css_task_iter_next(&it))) { ^~~~~~~~~~~~~~~~~~ __sg_page_iter_next kernel/sched/core.c:1091:2: error: implicit declaration of function ‘css_task_iter_end’; did you mean ‘get_task_cred’? 
[-Werror=implicit-function-declaration] css_task_iter_end(&it); ^~~~~~~~~~~~~~~~~ get_task_cred kernel/sched/core.c:1081:23: warning: unused variable ‘it’ [-Wunused-variable] struct css_task_iter it; ^~ cc1: some warnings being treated as errors make[2]: *** [kernel/sched/core.o] Error 1 Fix by protecting uclamp_update_active_tasks() with CONFIG_UCLAMP_TASK_GROUP Fixes: babbe170e053 ("sched/uclamp: Update CPU's refcount on TG's clamp changes") Reported-by: Randy Dunlap Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Randy Dunlap Cc: Steven Rostedt Cc: Ingo Molnar Cc: Vincent Guittot Cc: Patrick Bellasi Cc: Mel Gorman Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Ben Segall Link: https://lkml.kernel.org/r/20191105112212.596-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd05a378631a..afd4d8028771 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1073,6 +1073,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) task_rq_unlock(rq, p, &rf); } +#ifdef CONFIG_UCLAMP_TASK_GROUP static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css, unsigned int clamps) @@ -1091,7 +1092,6 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css, css_task_iter_end(&it); } -#ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css); static void uclamp_update_root_tg(void) { -- cgit From 6e2df0581f569038719cf2bc2b3baa3fcc83cab4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 11:11:52 +0100 Subject: sched: Fix pick_next_task() vs 'change' pattern race Commit 67692435c411 ("sched: Rework pick_next_task() slow-path") inadvertently introduced a race because it changed a previously unexplored dependency between dropping the rq->lock and sched_class::put_prev_task(). The comments about dropping rq->lock, in for example newidle_balance(), only mention the task being current and ->on_cpu being set. But when we look at the 'change' pattern (in for example sched_setnuma()): queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */ running = task_current(rq, p); /* rq->curr == p */ if (queued) dequeue_task(...); if (running) put_prev_task(...); /* change task properties */ if (queued) enqueue_task(...); if (running) set_next_task(...); It becomes obvious that if we do this after put_prev_task() has already been called on @p, things go sideways. This is exactly what the commit in question allows to happen when it does: prev->sched_class->put_prev_task(rq, prev, rf); if (!rq->nr_running) newidle_balance(rq, rf); The newidle_balance() call will drop rq->lock after we've called put_prev_task() and that allows the above 'change' pattern to interleave and mess up the state. Furthermore, it turns out we lost the RT-pull when we put the last DL task. Fix both problems by extracting the balancing from put_prev_task() and doing a multi-class balance() pass before put_prev_task().
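As a stand-alone illustration of the resulting ordering (a toy user-space model, not the scheduler; the names mirror the new balance_dl()/balance_rt()/balance_fair() hooks but the signatures are simplified): the balance pass walks the classes downward and terminates as soon as one of them reports runnable work at that priority or above, and only then is put_prev_task() invoked, so any rq->lock drop inside balance() happens before prev has been put.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rq;	/* opaque in this sketch */

struct sched_class {
	const char *name;
	/*
	 * Returns true once a task of this priority or higher is runnable,
	 * possibly after pulling work from elsewhere.
	 */
	bool (*balance)(struct rq *rq);
};

static bool balance_dl(struct rq *rq)   { (void)rq; return false; }
static bool balance_rt(struct rq *rq)   { (void)rq; return true; }	/* e.g. pulled an RT task */
static bool balance_fair(struct rq *rq) { (void)rq; return false; }

static const struct sched_class classes[] = {
	{ "dl",   balance_dl },
	{ "rt",   balance_rt },
	{ "fair", balance_fair },
};

int main(void)
{
	/* Balance pass first, starting from prev's class (here: the top). */
	for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		printf("balance %s\n", classes[i].name);
		if (classes[i].balance(NULL))
			break;	/* runnable work found at this priority or above */
	}
	/*
	 * Only now would put_prev_task(rq, prev) run, after all lock-dropping
	 * balancing is done, so the 'change' pattern cannot interleave with a
	 * half-put prev.
	 */
	return 0;
}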
Fixes: 67692435c411 ("sched: Rework pick_next_task() slow-path") Reported-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Tested-by: Quentin Perret Tested-by: Valentin Schneider --- kernel/sched/core.c | 21 +++++++++++++++------ kernel/sched/deadline.c | 40 ++++++++++++++++++++-------------------- kernel/sched/fair.c | 15 ++++++++++++--- kernel/sched/idle.c | 9 ++++++++- kernel/sched/rt.c | 37 +++++++++++++++++++------------------ kernel/sched/sched.h | 30 +++++++++++++++++++++++++++--- kernel/sched/stop_task.c | 18 +++++++++++------- 7 files changed, 112 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index afd4d8028771..0f2eb3629070 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3929,13 +3929,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: +#ifdef CONFIG_SMP /* - * Ensure that we put DL/RT tasks before the pick loop, such that they - * can PULL higher prio tasks when we lower the RQ 'priority'. + * We must do the balancing pass before put_next_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. */ - prev->sched_class->put_prev_task(rq, prev, rf); - if (!rq->nr_running) - newidle_balance(rq, rf); + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); for_each_class(class) { p = class->pick_next_task(rq, NULL, NULL); @@ -6201,7 +6210,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next, NULL); + next->sched_class->put_prev_task(rq, next); return next; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc48720f189..a8a08030a8f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1691,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
+ */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1758,45 +1774,28 @@ static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - struct dl_rq *dl_rq; WARN_ON_ONCE(prev || rf); - dl_rq = &rq->dl; - - if (unlikely(!dl_rq->dl_nr_running)) + if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); - p = dl_task_of(dl_se); - set_next_task_dl(rq, p); - return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - } } /* @@ -2442,6 +2441,7 @@ const struct sched_class dl_sched_class = { .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP + .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682a754ea3e1..22a2fed29054 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6570,6 +6570,15 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} #endif /* CONFIG_SMP */ static unsigned long wakeup_gran(struct sched_entity *se) @@ -6746,7 +6755,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6884,7 +6893,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -10414,11 +10423,11 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..f65ef1e2f204 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -365,6 +365,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static int +balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return WARN_ON_ONCE(1); +} #endif /* @@ -375,7 +381,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); 
} -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } @@ -460,6 +466,7 @@ const struct sched_class idle_sched_class = { .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP + .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ebaa4e619684..9b8adc01be3d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1469,6 +1469,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1552,21 +1568,18 @@ static struct task_struct * pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; - struct rt_rq *rt_rq = &rq->rt; WARN_ON_ONCE(prev || rf); - if (!rt_rq->rt_queued) + if (!sched_rt_runnable(rq)) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p); - return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); @@ -1578,18 +1591,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_fla */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. 
- */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - } } #ifdef CONFIG_SMP @@ -2366,8 +2367,8 @@ const struct sched_class rt_sched_class = { .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP + .balance = balance_rt, .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0db2c1b3361e..c8870c5bd7df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1727,10 +1727,11 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -1773,7 +1774,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev, NULL); + prev->sched_class->put_prev_task(rq, prev); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -1787,8 +1788,12 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) #else #define sched_class_highest (&dl_sched_class) #endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + #define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) + for_class_range(class, sched_class_highest, NULL) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1796,6 +1801,25 @@ extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7e1cee4e65b2..c0640739e05e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -15,6 +15,12 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } + +static int +balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return sched_stop_runnable(rq); +} #endif /* CONFIG_SMP */ static void @@ -31,16 +37,13 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop) static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *stop = rq->stop; - WARN_ON_ONCE(prev || rf); - if (!stop || !task_on_rq_queued(stop)) + if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, stop); - - return stop; + set_next_task_stop(rq, rq->stop); + return rq->stop; } static void @@ -60,7 +63,7 @@ static void 
yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -129,6 +132,7 @@ const struct sched_class stop_sched_class = { .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP + .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif -- cgit From 630faf81b3e61bcc90dc6d8b497800657d2752a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Nov 2019 11:53:27 -0500 Subject: cgroup: don't put ERR_PTR() into fc->root the caller of ->get_tree() expects NULL left there on error... Reported-by: Thibaut Sautereau Signed-off-by: Al Viro --- kernel/cgroup/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 080561bb8a4b..ef4242e5d4bc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2119,11 +2119,12 @@ int cgroup_do_get_tree(struct fs_context *fc) nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); - fc->root = nsdentry; if (IS_ERR(nsdentry)) { - ret = PTR_ERR(nsdentry); deactivate_locked_super(sb); + ret = PTR_ERR(nsdentry); + nsdentry = NULL; } + fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) -- cgit From 69924b89687a2923e88cc42144aea27868913d0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Nov 2019 13:11:41 -0400 Subject: audit_get_nd(): don't unlock parent too early if the child has been negative and just went positive under us, we want coherent d_is_positive() and ->d_inode. Don't unlock the parent until we'd done that work... Signed-off-by: Al Viro --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1f31c2f1e6fc..4508d5e0cf69 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -351,12 +351,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } -- cgit From 7277a34c6be0b2972bdd1fea88c7cef409bed5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 14:15:55 +0100 Subject: sched/fair: Better document newidle_balance() Whilst chasing the pick_next_task() race, there was some confusion about the newidle_balance() return values. Document them. [ mingo: Minor edits. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: ktkhai@virtuozzo.com Cc: mgorman@suse.de Cc: qais.yousef@arm.com Cc: qperret@google.com Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20191108131909.488364308@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e622f48c8de..c48a695a140c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9941,6 +9941,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. + * + * Returns: + * < 0 - we released the lock and there are !fair tasks present + * 0 - failed, no new tasks + * > 0 - success, new (fair) tasks present */ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { -- cgit From f488e1057bb97b881ed317557dc5e62ff8747393 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 14:15:56 +0100 Subject: sched/core: Make pick_next_task_idle() more consistent Only pick_next_task_fair() needs the @prev and @rf argument; these are required to implement the cpu-cgroup optimization. None of the other pick_next_task() methods need this. Make pick_next_task_idle() more consistent. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: ktkhai@virtuozzo.com Cc: mgorman@suse.de Cc: qais.yousef@arm.com Cc: qperret@google.com Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20191108131909.545730862@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- kernel/sched/idle.c | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0f2eb3629070..59c4f2963417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ - if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, rf); + if (unlikely(!p)) { + put_prev_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, NULL, NULL); + } return p; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f65ef1e2f204..179d1d4ac5a6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -396,8 +396,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *next = rq->idle; - if (prev) - put_prev_task(rq, prev); + WARN_ON_ONCE(prev || rf); set_next_task_idle(rq, next); -- cgit From 5d7d605642b28a5911198a405a6072f091bfbee6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 14:15:57 +0100 Subject: sched/core: Optimize pick_next_task() Ever since we moved the sched_class definitions into their own files, the constant expression {fair,idle}_sched_class.pick_next_task() is not in fact a compile time constant anymore and results in an indirect call (barring LTO). 
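To illustrate the indirection in question (a generic, self-contained C sketch, not the scheduler code; the "_like" names are invented for the example): calling through a function pointer held in an ops-style structure is an indirect call, while naming the function gives the compiler a direct call target.

#include <stdio.h>

struct class_like {
	int (*pick_next_task)(void);
};

static int pick_next_task_fair_like(void) { return 1; }

/*
 * In the kernel the class object is defined in another translation unit, so
 * the call through it below cannot be resolved at compile time (barring LTO).
 */
static const struct class_like fair_class_like = {
	.pick_next_task = pick_next_task_fair_like,
};

int main(void)
{
	/* Indirect call through the ops table (may need a retpoline). */
	int a = fair_class_like.pick_next_task();

	/* Direct call: the target is known to the compiler. */
	int b = pick_next_task_fair_like();

	printf("%d %d\n", a, b);
	return 0;
}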
Fix that by exposing pick_next_task_{fair,idle}() directly, this gets rid of the indirect call (and RETPOLINE) on the fast path. Also remove the unlikely() from the idle case, it is in fact /the/ way we select idle -- and that is a very common thing to do. Performance for will-it-scale/sched_yield improves by 2% (as reported by 0-day). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: ktkhai@virtuozzo.com Cc: mgorman@suse.de Cc: qais.yousef@arm.com Cc: qperret@google.com Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20191108131909.603037345@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/sched.h | 3 +++ 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 59c4f2963417..7cf65474fa26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3917,14 +3917,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) prev->sched_class == &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, rf); + p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ - if (unlikely(!p)) { + if (!p) { put_prev_task(rq, prev); - p = idle_sched_class.pick_next_task(rq, NULL, NULL); + p = pick_next_task_idle(rq, NULL, NULL); } return p; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c48a695a140c..da81451d5575 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6611,7 +6611,7 @@ preempt: set_last_buddy(se); } -static struct task_struct * +struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 179d1d4ac5a6..0fdceac00b70 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -391,7 +391,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next) schedstat_inc(rq->sched_goidle); } -static struct task_struct * +struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *next = rq->idle; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c8870c5bd7df..66172a337675 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1821,6 +1821,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_running > 0; } +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +extern struct task_struct *pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + #ifdef CONFIG_SMP extern void update_group_capacity(struct sched_domain *sd, int cpu); -- cgit From 98c2f700edb413e4baa4a0368c5861d96211a775 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 14:15:58 +0100 Subject: sched/core: Simplify sched_class::pick_next_task() Now that the indirect class call never uses the last two arguments of pick_next_task(), remove them. 
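Schematically, the resulting split looks like this (an illustrative stand-alone sketch with invented "_like" types; the real hunks follow below): the class callback loses the unused parameters, while the fair class keeps its richer entry point behind a thin wrapper.

#include <stdio.h>

struct rq_like { int nr_running; };
struct task_like { const char *name; };

/* Rich internal entry point: still takes prev/rf for the fair fast path. */
static struct task_like *pick_next_task_fair_like(struct rq_like *rq,
						  struct task_like *prev, void *rf)
{
	static struct task_like t = { "fair" };

	(void)prev; (void)rf;
	return rq->nr_running ? &t : NULL;
}

/* Thin wrapper with the narrowed signature installed in the ops table. */
static struct task_like *__pick_next_task_fair_like(struct rq_like *rq)
{
	return pick_next_task_fair_like(rq, NULL, NULL);
}

struct class_ops_like {
	struct task_like *(*pick_next_task)(struct rq_like *rq);	/* prev/rf gone */
};

static const struct class_ops_like fair_ops_like = {
	.pick_next_task = __pick_next_task_fair_like,
};

int main(void)
{
	struct rq_like rq = { .nr_running = 1 };

	printf("%s\n", fair_ops_like.pick_next_task(&rq)->name);
	return 0;
}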
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: ktkhai@virtuozzo.com Cc: mgorman@suse.de Cc: qais.yousef@arm.com Cc: qperret@google.com Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20191108131909.660595546@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/deadline.c | 5 +---- kernel/sched/fair.c | 7 ++++++- kernel/sched/idle.c | 5 +---- kernel/sched/rt.c | 5 +---- kernel/sched/sched.h | 18 +++--------------- kernel/sched/stop_task.c | 5 +---- 7 files changed, 16 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7cf65474fa26..513a4794ff36 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3924,7 +3924,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assumes fair_sched_class->next == idle_sched_class */ if (!p) { put_prev_task(rq, prev); - p = pick_next_task_idle(rq, NULL, NULL); + p = pick_next_task_idle(rq); } return p; @@ -3949,7 +3949,7 @@ restart: put_prev_task(rq, prev); for_each_class(class) { - p = class->pick_next_task(rq, NULL, NULL); + p = class->pick_next_task(rq); if (p) return p; } @@ -6210,7 +6210,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) struct task_struct *next; for_each_class(class) { - next = class->pick_next_task(rq, NULL, NULL); + next = class->pick_next_task(rq); if (next) { next->sched_class->put_prev_task(rq, next); return next; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a8a08030a8f7..f7fbb4427959 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1770,15 +1770,12 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -static struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static struct task_struct *pick_next_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - WARN_ON_ONCE(prev || rf); - if (!sched_dl_runnable(rq)) return NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da81451d5575..1789193d9917 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6755,6 +6755,11 @@ idle: return NULL; } +static struct task_struct *__pick_next_task_fair(struct rq *rq) +{ + return pick_next_task_fair(rq, NULL, NULL); +} + /* * Account for a descheduled task: */ @@ -10622,7 +10627,7 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, - .pick_next_task = pick_next_task_fair, + .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 0fdceac00b70..f88b79eeee1e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -391,13 +391,10 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next) schedstat_inc(rq->sched_goidle); } -struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +struct task_struct *pick_next_task_idle(struct rq *rq) { struct task_struct *next = rq->idle; - WARN_ON_ONCE(prev || rf); - set_next_task_idle(rq, next); return next; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9b8adc01be3d..38027c04b04c 100644 
--- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1564,13 +1564,10 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p; - WARN_ON_ONCE(prev || rf); - if (!sched_rt_runnable(rq)) return NULL; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 66172a337675..75d96cce1492 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1713,20 +1713,8 @@ struct sched_class { void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); - /* - * Both @prev and @rf are optional and may be NULL, in which case the - * caller must already have invoked put_prev_task(rq, prev, rf). - * - * Otherwise it is the responsibility of the pick_next_task() to call - * put_prev_task() on the @prev task or something equivalent, IFF it - * returns a next task. - * - * In that case (@rf != NULL) it may return RETRY_TASK when it finds a - * higher prio class has runnable tasks. - */ - struct task_struct * (*pick_next_task)(struct rq *rq, - struct task_struct *prev, - struct rq_flags *rf); + struct task_struct *(*pick_next_task)(struct rq *rq); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p); @@ -1822,7 +1810,7 @@ static inline bool sched_fair_runnable(struct rq *rq) } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +extern struct task_struct *pick_next_task_idle(struct rq *rq); #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c0640739e05e..0aefdfb79b36 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -34,11 +34,8 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop) stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static struct task_struct *pick_next_task_stop(struct rq *rq) { - WARN_ON_ONCE(prev || rf); - if (!sched_stop_runnable(rq)) return NULL; -- cgit From 2eeb01a28c9233333bf229a5b4b0559f4bd22b52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 14:15:59 +0100 Subject: sched/fair: Use mul_u32_u32() While reading the code I encountered another site where we should be using mul_u32_u32() because GCC just won't take a hint. 
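For reference, a user-space model of the helper being picked up here (in the kernel, mul_u32_u32() is essentially this, with an inline-asm override on some architectures so the compiler cannot miss the 32x32->64 form):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * 32x32->64 multiply: only one operand is widened, so the compiler can use a
 * single widening multiply instead of a full 64x64 one.
 */
static inline uint64_t mul_u32_u32(uint32_t a, uint32_t b)
{
	return (uint64_t)a * b;
}

int main(void)
{
	uint32_t fact = 0xffffffffu, inv_weight = 48388;

	/* Same value as the open-coded "(u64)(u32)fact * lw->inv_weight". */
	printf("%" PRIu64 "\n", mul_u32_u32(fact, inv_weight));
	return 0;
}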
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: ktkhai@virtuozzo.com Cc: mgorman@suse.de Cc: qais.yousef@arm.com Cc: qperret@google.com Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20191108131909.717931380@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1789193d9917..ba97d1a1b976 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight } } - /* hint to use a 32x32->64 mul */ - fact = (u64)(u32)fact * lw->inv_weight; + fact = mul_u32_u32(fact, lw->inv_weight); while (fact >> 32) { fact >>= 1; -- cgit From a0e813f26ebcb25c0b5e504498fbd796cca1a4ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Nov 2019 14:16:00 +0100 Subject: sched/core: Further clarify sched_class::set_next_task() It turns out there really is something special to the first set_next_task() invocation. In specific the 'change' pattern really should not cause balance callbacks. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: ktkhai@virtuozzo.com Cc: mgorman@suse.de Cc: qais.yousef@arm.com Cc: qperret@google.com Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Fixes: f95d4eaee6d0 ("sched/{rt,deadline}: Fix set_next_task vs pick_next_task") Link: https://lkml.kernel.org/r/20191108131909.775434698@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 7 +++++-- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 4 ++-- kernel/sched/rt.c | 7 +++++-- kernel/sched/sched.h | 4 ++-- kernel/sched/stop_task.c | 4 ++-- 6 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f7fbb4427959..43323f875cb9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static void set_next_task_dl(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + if (!first) + return; + if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); @@ -1782,7 +1785,7 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); - set_next_task_dl(rq, p); + set_next_task_dl(rq, p, true); return p; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ba97d1a1b976..81eba554db8d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10344,7 +10344,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. 
*/ -static void set_next_task_fair(struct rq *rq, struct task_struct *p) +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f88b79eeee1e..428cd05c0b5d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -385,7 +385,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } -static void set_next_task_idle(struct rq *rq, struct task_struct *next) +static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); schedstat_inc(rq->sched_goidle); @@ -395,7 +395,7 @@ struct task_struct *pick_next_task_idle(struct rq *rq) { struct task_struct *next = rq->idle; - set_next_task_idle(rq, next); + set_next_task_idle(rq, next, true); return next; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 38027c04b04c..e591d40fd645 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1515,13 +1515,16 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -static inline void set_next_task_rt(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); + if (!first) + return; + /* * If prev task was rt, put_prev_task() has already updated the * utilization. We only care of the case where we start to schedule a @@ -1572,7 +1575,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p); + set_next_task_rt(rq, p, true); return p; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 75d96cce1492..05c282775f21 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1716,7 +1716,7 @@ struct sched_class { struct task_struct *(*pick_next_task)(struct rq *rq); void (*put_prev_task)(struct rq *rq, struct task_struct *p); - void (*set_next_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); @@ -1768,7 +1768,7 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) static inline void set_next_task(struct rq *rq, struct task_struct *next) { WARN_ON_ONCE(rq->curr != next); - next->sched_class->set_next_task(rq, next); + next->sched_class->set_next_task(rq, next, false); } #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 0aefdfb79b36..4c9e9975684f 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -29,7 +29,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static void set_next_task_stop(struct rq *rq, struct task_struct *stop) +static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first) { stop->se.exec_start = rq_clock_task(rq); } @@ -39,7 +39,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, rq->stop); + set_next_task_stop(rq, rq->stop, true); return rq->stop; } -- cgit From 153bedbac2ebd475e1c7c2d2fa0c042f5525927d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:55 +0100 Subject: irq_work: Convert flags to atomic_t We need to convert flags to atomic_t in order to 
later fix an ordering issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or(). Also clarify the nature of those flags. [ mingo: Converted two more usage site the original patch missed. ] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..4d31284095e2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (in_nmi()) { work = this_cpu_ptr(&up_read_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index d42acaf81886..df0dbf4d859b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, oflags, nflags; + int flags, oflags, nflags; /* * Start with our best wish as a premise but only trust any * flag value after cmpxchg() result. */ - flags = work->flags & ~IRQ_WORK_PENDING; + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; for (;;) { nflags = flags | IRQ_WORK_CLAIMED; - oflags = cmpxchg(&work->flags, flags, nflags); + oflags = atomic_cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; if (oflags & IRQ_WORK_PENDING) @@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { + if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); @@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - unsigned long flags; + int flags; BUG_ON(!irqs_disabled()); @@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = work->flags & ~IRQ_WORK_PENDING; - xchg(&work->flags, flags); + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; + atomic_xchg(&work->flags, flags); work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ - (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } @@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (work->flags & IRQ_WORK_BUSY) + while (atomic_read(&work->flags) & IRQ_WORK_BUSY) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..865727373a3b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, + .flags = ATOMIC_INIT(IRQ_WORK_LAZY), }; void wake_up_klogd(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..ff467a4e2639 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) return -EBUSY; /* Add the current task, which is the target of sending signal, -- cgit From 25269871db1ad0cbbaafd5098cbdb40c8db4ccb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:56 +0100 Subject: irq_work: Fix irq_work_claim() memory ordering When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just return and don't raise a new IPI. We expect the destination to see and handle our latest updates thanks to the pairing atomic_xchg() in irq_work_run_list(). But cmpxchg() doesn't guarantee a full memory barrier upon failure. So it's possible that the destination misses our latest updates. So use atomic_fetch_or() instead, which is unconditionally fully ordered, does exactly what we want here, and simplifies the code. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index df0dbf4d859b..255454a48346 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - int flags, oflags, nflags; + int oflags; + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* - * Start with our best wish as a premise but only trust any - * flag value after cmpxchg() result. + * If the work is already pending, no need to raise the IPI. + * The pairing atomic_xchg() in irq_work_run() makes sure + * everything we did before is visible.
*/ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - for (;;) { - nflags = flags | IRQ_WORK_CLAIMED; - oflags = atomic_cmpxchg(&work->flags, flags, nflags); - if (oflags == flags) - break; - if (oflags & IRQ_WORK_PENDING) - return false; - flags = oflags; - cpu_relax(); - } - + if (oflags & IRQ_WORK_PENDING) + return false; return true; } -- cgit From feb4a51323babe13315c3b783ea7f1cf25368918 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:57 +0100 Subject: irq_work: Slightly simplify IRQ_WORK_PENDING clearing Instead of fetching the value of flags and perform an xchg() to clear a bit, just use atomic_fetch_andnot() that is more suitable to do that job in one operation while keeping the full ordering. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 255454a48346..49c53f80a13a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work) oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_xchg() in irq_work_run() makes sure + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - int flags; BUG_ON(!irqs_disabled()); @@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list) llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, llnode) { + int flags; /* * Clear the PENDING bit, after this point the @work * can be re-used. @@ -151,8 +151,7 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - atomic_xchg(&work->flags, flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); work->func(work); /* -- cgit From 4b48512c2e9c63b62d7da23563cdb224b4d61d72 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 11 Nov 2019 10:26:47 +0100 Subject: stacktrace: Get rid of unneeded '!!' pattern My commit b0c51f158455 ("stacktrace: Don't skip first entry on noncurrent tasks") adds one or zero to skipnr by "!!(current == tsk)". But the C99 standard says: The == (equal to) and != (not equal to) operators are ... Each of the operators yields 1 if the specified relation is true and 0 if it is false. So there is no need to prepend the above expression by "!!" -- remove it. 
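A trivial stand-alone check of the property the patch relies on:

#include <stdio.h>

int main(void)
{
	int a = 5, b = 5, c = 7;

	/*
	 * Equality operators already yield exactly 0 or 1 (C99 6.5.9),
	 * so the extra !! changes nothing.
	 */
	printf("%d %d\n", (a == b), !!(a == b));	/* 1 1 */
	printf("%d %d\n", (a == c), !!(a == c));	/* 0 0 */
	return 0;
}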
Reported-by: Joe Perches Signed-off-by: Jiri Slaby Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191111092647.27419-1-jslaby@suse.cz Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index c9ea7eb2cb1a..2af66e449aa6 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -142,7 +142,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, .store = store, .size = size, /* skip this function if they are tracing us */ - .skip = skipnr + !!(current == tsk), + .skip = skipnr + (current == tsk), }; if (!try_get_task_stack(tsk)) @@ -300,7 +300,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, .entries = store, .max_entries = size, /* skip this function if they are tracing us */ - .skip = skipnr + !!(current == task), + .skip = skipnr + (current == task), }; save_stack_trace_tsk(task, &trace); -- cgit From 34dc0ea6bc960f1f57b2148f01a3f4da23f87013 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:01:37 +0100 Subject: dma-direct: provide mmap and get_sgtable method overrides For dma-direct we know that the DMA address is an encoding of the physical address that we can trivially decode. Use that fact to provide implementations that do not need the arch_dma_coherent_to_pfn architecture hook. Note that we can still support mmap of non-coherent memory only if the architecture provides a way to set an uncached bit in the page tables. This must be true for architectures that use the generic remap helpers, but other architectures can also manually select it.
+# The only thing that is really required is a way to set an uncached bit +# in the pagetables +# +config DMA_NONCOHERENT_MMAP + bool + config DMA_REMAP depends on MMU select GENERIC_ALLOCATOR + select DMA_NONCOHERENT_MMAP bool config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 724c282dd943..58beaa9ddd27 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, return phys_to_dma(dev, phys); } +static inline struct page *dma_direct_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); +} + u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); @@ -379,6 +385,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, } EXPORT_SYMBOL(dma_direct_map_resource); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = dma_direct_to_page(dev, dma_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +#ifdef CONFIG_MMU +bool dma_direct_can_mmap(struct device *dev) +{ + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +} +#else /* CONFIG_MMU */ +bool dma_direct_can_mmap(struct device *dev) +{ + return false; +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -ENXIO; +} +#endif /* CONFIG_MMU */ + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d9334f31a5af..12ff766ec1fa 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -112,24 +112,9 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page; + struct page *page = virt_to_page(cpu_addr); int ret; - if (!dev_is_dma_coherent(dev)) { - unsigned long pfn; - - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - page = pfn_to_page(pfn); - } else { - page = virt_to_page(cpu_addr); - } - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); @@ -154,7 +139,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, const struct dma_map_ops *ops = 
get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; @@ -192,7 +177,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - unsigned long pfn; int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -203,19 +187,8 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (off >= count || user_count > count - off) return -ENXIO; - if (!dev_is_dma_coherent(dev)) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - } else { - pfn = page_to_pfn(virt_to_page(cpu_addr)); - } - - return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; @@ -233,12 +206,8 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) { - return IS_ENABLED(CONFIG_MMU) && - (dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); - } - + if (dma_is_direct(ops)) + return dma_direct_can_mmap(dev); return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -263,7 +232,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 90d5ce77c189..3c49499ee6b0 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -259,10 +259,4 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_free_contiguous(dev, page, size); } } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit From 3acac065508f6cc60ac9d3e4b7c6cc37fd91d531 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:06:32 +0100 Subject: dma-mapping: merge the generic remapping helpers into dma-direct Integrate the generic dma remapping implementation into the main flow. This prepares for architectures like xtensa that use an uncached segment for pages in the kernel mapping, but can also remap highmem from CMA. To simplify that implementation we now always deduct the page from the physical address via the DMA address instead of the virtual address. 
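Schematically (a simplified user-space model with a made-up bus offset and page size, not the kernel helpers): the page is now recovered from the DMA handle via the physical address, with no need for the kernel virtual address at all.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define BUS_OFFSET	0x80000000ull	/* made-up device-to-CPU address offset */

/* Model of dma_to_phys(): undo the device-visible offset. */
static uint64_t dma_to_phys(uint64_t dma_addr)
{
	return dma_addr - BUS_OFFSET;
}

/*
 * Model of dma_direct_to_page(): DMA address -> physical address -> pfn.
 * The real helper then turns the pfn into a struct page with pfn_to_page().
 */
static uint64_t dma_direct_to_pfn(uint64_t dma_addr)
{
	return dma_to_phys(dma_addr) >> PAGE_SHIFT;
}

int main(void)
{
	uint64_t dma_handle = 0x80042000ull;

	printf("pfn 0x%llx\n", (unsigned long long)dma_direct_to_pfn(dma_handle));
	return 0;
}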
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 60 ++++++++++++++++++++++++++++++++++++++++++----------- kernel/dma/remap.c | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 58beaa9ddd27..22a2e0833862 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,15 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs) && + !gfpflags_allow_blocking(gfp)) { + ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (!ret) + return NULL; + goto done; + } + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; @@ -146,9 +156,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - return page; + ret = page; + goto done; + } + + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) || + (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* remove any dirty cache lines on the kernel alias */ + arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + dma_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) { + dma_free_contiguous(dev, page, size); + return ret; + } + + memset(ret, 0, size); + goto done; } if (PageHighMem(page)) { @@ -164,12 +193,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { + if (force_dma_unencrypted(dev)) set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } + memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && @@ -177,7 +203,11 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = uncached_kernel_address(ret); } - +done: + if (force_dma_unencrypted(dev)) + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + else + *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; } @@ -193,19 +223,24 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - cpu_addr = cached_kernel_address(cpu_addr); - dma_free_contiguous(dev, virt_to_page(cpu_addr), size); + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + vunmap(cpu_addr); + + dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, 
dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); @@ -215,6 +250,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 3c49499ee6b0..d47bd40fc0f5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -210,53 +210,4 @@ bool dma_free_from_pool(void *start, size_t size) gen_pool_free(atomic_pool, (unsigned long)start, size); return true; } - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page = NULL; - void *ret; - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - ret = dma_alloc_from_pool(size, &page, flags); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, flags, attrs); - if (!page) - return NULL; - - /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, size); - - /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), - __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - - memset(ret, 0, size); -done: - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - phys_addr_t phys = dma_to_phys(dev, dma_handle); - struct page *page = pfn_to_page(__phys_to_pfn(phys)); - - vunmap(vaddr); - dma_free_contiguous(dev, page, size); - } -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit From d3694f30732fd2a334b93f087033c5a5836f7aba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 12:32:04 -0700 Subject: dma-debug: reorder struct dma_debug_entry fields Move all fields used during exact match lookups to the first cache line. This makes debug_dma_mapping_error() and friends about 50% faster. Since it removes two 32bit holes, force a cacheline alignment on struct dma_debug_entry. 
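As an illustration of the intent (not part of the patch), a compile-time check like the one below could assert that every field compared by the exact-match lookup now sits in the first cache line after the reorder; the 64-byte cache-line size and the check itself are assumptions for common x86-64 builds.

#include <linux/build_bug.h>
#include <linux/stddef.h>

/* illustrative only: hot lookup fields should stay within the first 64 bytes */
static inline void dma_debug_entry_layout_check(void)
{
	BUILD_BUG_ON(offsetofend(struct dma_debug_entry, sg_mapped_ents) > 64);
}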
Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 4ad74f5987ea..a5b85dabfb8c 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -54,33 +54,33 @@ enum map_err_types { * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping * @list: node on pre-allocated free_entries list * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent - * @type: single, page, sg, coherent - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn * @size: length of the mapping + * @type: single, page, sg, coherent * @direction: enum dma_data_direction * @sg_call_ents: 'nents' from dma_map_sg * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked * @stacktrace: support backtraces when a violation is detected */ struct dma_debug_entry { struct list_head list; struct device *dev; - int type; - unsigned long pfn; - size_t offset; u64 dev_addr; u64 size; + int type; int direction; int sg_call_ents; int sg_mapped_ents; + unsigned long pfn; + size_t offset; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif -}; +} ____cacheline_aligned_in_smp; typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); -- cgit From 5e76f564572b85735de4b75a5e73b514be2562be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 11:48:44 -0700 Subject: dma-debug: increase HASH_SIZE With modern NIC, it is not unusual having about ~256,000 active dma mappings and a hash size of 1024 buckets is too small. Forcing full cache line per bucket does not seem useful, especially now that we have contention on free_entries_lock for allocations and freeing of entries. Better use the space to fit more buckets. Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a5b85dabfb8c..004496654aaa 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -27,7 +27,7 @@ #include -#define HASH_SIZE 1024ULL +#define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 #define HASH_FN_MASK (HASH_SIZE - 1) @@ -87,7 +87,7 @@ typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); struct hash_bucket { struct list_head list; spinlock_t lock; -} ____cacheline_aligned_in_smp; +}; /* Hash list to save the allocated dma addresses */ static struct hash_bucket dma_entry_hash[HASH_SIZE]; -- cgit From 9a066357184485784f782719093ff804d05b85db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:52 +0900 Subject: kheaders: remove unneeded 'cat' command piped to 'head' / 'tail' The 'head' and 'tail' commands can take a file path directly. So, you do not need to run 'cat'. cat kernel/kheaders.md5 | head -1 ... is equivalent to: head -1 kernel/kheaders.md5 and the latter saves forking one process. While I was here, I replaced 'head -1' with 'head -n 1'. I also replaced '==' with '=' since we do not have a good reason to use the bashism. 
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 5a0fc0b0403a..b8054b0d5010 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -41,10 +41,10 @@ obj_files_md5="$(find $dir_list -name "*.h" | this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && + [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi -- cgit From 0e11773e76098729552b750ccff79374d1e62002 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:53 +0900 Subject: kheaders: optimize md5sum calculation for in-tree builds This script computes md5sum of headers in srctree and in objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, we end up with the same computation twice. In fact, the first two lines of kernel/kheaders.md5 are always the same for in-tree builds. Unify the two md5sum calculations. For in-tree builds ($building_out_of_srctree is empty), we check only two directories, "include", and "arch/$SRCARCH/include". For out-of-tree builds ($building_out_of_srctree is 1), we check 4 directories, "$srctree/include", "$srctree/arch/$SRCARCH/include", "include", and "arch/$SRCARCH/include" since we know they are all different. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index b8054b0d5010..6ff86e62787f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -21,29 +21,30 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter -# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter +# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter + +all_dirs= +if [ "$building_out_of_srctree" ]; then + for d in $dir_list; do + all_dirs="$all_dirs $srctree/$d" + done +fi +all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us # ignore them for md5 calculation. 
-pushd $srctree > /dev/null -src_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | - xargs ls -l | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | +headers_md5="$(find $all_dirs -name "*.h" | + grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | xargs ls -l | md5sum | cut -d ' ' -f1)" + # Any changes to this script will also cause a rebuild of the archive. this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && - [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi @@ -79,8 +80,7 @@ find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ --owner=0 --group=0 --numeric-owner --no-recursion \ -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 +echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit From ea79e5168be644fdaf7d4e6a73eceaf07b3da76a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:54 +0900 Subject: kheaders: optimize header copy for in-tree builds This script copies headers by the cpio command twice; first from srctree, and then from objtree. However, when we building in-tree, we know the srctree and the objtree are the same. That is, all the headers copied by the first cpio are overwritten by the second one. Skip the first cpio when we are building in-tree. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 6ff86e62787f..0f7752dd93a6 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -56,14 +56,16 @@ fi rm -rf $cpio_dir mkdir $cpio_dir -pushd $srctree > /dev/null -for f in $dir_list; - do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null +if [ "$building_out_of_srctree" ]; then + pushd $srctree > /dev/null + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + popd > /dev/null +fi -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. +# The second CPIO can complain if files already exist which can happen with out +# of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 -- cgit From 1463f74f492eea7191f0178e01f3d38371a48210 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:55 +0900 Subject: kheaders: remove the last bashism to allow sh to run it 'pushd' ... 'popd' is the last bash-specific code in this script. One way to avoid it is to run the code in a sub-shell. With that addressed, you can run this script with sh. 
I replaced $(BASH) with $(CONFIG_SHELL), and I changed the hashbang to #!/bin/sh. Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- kernel/gen_kheaders.sh | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..42557f251fea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -128,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@ + cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0f7752dd93a6..dc5744b93f8c 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # SPDX-License-Identifier: GPL-2.0 # This script generates an archive consisting of kernel headers @@ -57,11 +57,12 @@ rm -rf $cpio_dir mkdir $cpio_dir if [ "$building_out_of_srctree" ]; then - pushd $srctree > /dev/null - for f in $dir_list - do find "$f" -name "*.h"; - done | cpio --quiet -pd $cpio_dir - popd > /dev/null + ( + cd $srctree + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + ) fi # The second CPIO can complain if files already exist which can happen with out -- cgit From f276031b4e2f4c961ed6d8a42f0f0124ccac2e09 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:56 +0900 Subject: kheaders: explain why include/config/autoconf.h is excluded from md5sum This comment block explains why include/generated/compile.h is omitted, but nothing about include/generated/autoconf.h, which might be more difficult to understand. Add more comments. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index dc5744b93f8c..e13ca842eb7e 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -32,8 +32,15 @@ fi all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. +# of the source files changed. +# +# When Kconfig regenerates include/generated/autoconf.h, its timestamp is +# updated, but the contents might be still the same. When any CONFIG option is +# changed, Kconfig touches the corresponding timestamp file include/config/*.h. +# Hence, the md5sum detects the configuration change anyway. We do not need to +# check include/generated/autoconf.h explicitly. +# +# Ignore them for md5 calculation to avoid pointless regeneration. headers_md5="$(find $all_dirs -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | -- cgit From c1d51f684c72b5eb2aecbbd47be3a2977a2dc903 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Nov 2019 15:25:12 +0100 Subject: cpuidle: Use nanoseconds as the unit of time Currently, the cpuidle subsystem uses microseconds as the unit of time which (among other things) causes the idle loop to incur some integer division overhead for no clear benefit. 
In order to allow cpuidle to measure time in nanoseconds, add two new fields, exit_latency_ns and target_residency_ns, to represent the exit latency and target residency of an idle state in nanoseconds, respectively, to struct cpuidle_state and initialize them with the help of the corresponding values in microseconds provided by drivers. Additionally, change cpuidle_governor_latency_req() to return the idle state exit latency constraint in nanoseconds. Also meeasure idle state residency (last_residency_ns in struct cpuidle_device and time_ns in struct cpuidle_driver) in nanoseconds and update the cpuidle core and governors accordingly. However, the menu governor still computes typical intervals in microseconds to avoid integer overflows. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Smythies Tested-by: Doug Smythies --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..1aa260702b38 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } -- cgit From 2f5841349df281ecf8f81cc82d869b8476f0db0b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Nov 2019 21:34:24 +0100 Subject: ntp/y2038: Remove incorrect time_t truncation A cast to 'time_t' was accidentally left in place during the conversion of __do_adjtimex() to 64-bit timestamps, so the resulting value is incorrectly truncated. Remove the cast so the 64-bit time gets propagated correctly. Fixes: ead25417f82e ("timex: use __kernel_timex internally") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20191108203435.112759-2-arnd@arndb.de --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 65eb796610dc..069ca78fb0bf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = (time_t)ts->tv_sec; + txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; -- cgit From 20d087368d38c7350a4519a3b316ef7eb2504692 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Nov 2019 21:34:25 +0100 Subject: time: Optimize ns_to_timespec64() ns_to_timespec64() calls div_s64_rem(), which is a rather slow function on 32-bit architectures, as it cannot take advantage of the do_div() optimizations for constant arguments. Open-code the div_s64_rem() function in ns_to_timespec64(), so a constant divider can be passed into the optimized div_u64_rem() function. 
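A userspace-style worked example (not from the patch, with the struct and helper names made up for illustration) of the sign convention the rewritten function preserves: for negative inputs tv_sec points at the earlier second and tv_nsec stays non-negative, e.g. -1 ns becomes { -1 s, 999999999 ns } and -1000000001 ns becomes { -2 s, 999999999 ns }.

#include <stdint.h>

#define NSEC_PER_SEC 1000000000L

struct ts64 { int64_t tv_sec; long tv_nsec; };

static struct ts64 ns_to_ts64(int64_t nsec)
{
	struct ts64 ts = { 0, 0 };

	if (nsec > 0) {
		ts.tv_sec = nsec / NSEC_PER_SEC;
		ts.tv_nsec = nsec % NSEC_PER_SEC;
	} else if (nsec < 0) {
		/* mirrors the kernel's -div_u64_rem(-nsec - 1, ...) - 1 form */
		ts.tv_sec = -((-nsec - 1) / NSEC_PER_SEC) - 1;
		ts.tv_nsec = NSEC_PER_SEC - ((-nsec - 1) % NSEC_PER_SEC) - 1;
	}
	return ts;
}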
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108203435.112759-3-arnd@arndb.de --- kernel/time/time.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..45a358953f09 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -550,18 +550,21 @@ EXPORT_SYMBOL(set_normalized_timespec64); */ struct timespec64 ns_to_timespec64(const s64 nsec) { - struct timespec64 ts; + struct timespec64 ts = { 0, 0 }; s32 rem; - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number. + */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; } - ts.tv_nsec = rem; return ts; } -- cgit From 1d6acc18fee71a0db6e4fbbfbdb247e0bd5b0655 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 15 Oct 2019 13:03:39 +0530 Subject: time: Fix spelling mistake in comment witin => within Signed-off-by: Mukesh Ojha Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571124819-9639-1-git-send-email-mojha@codeaurora.org --- kernel/time/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 45a358953f09..ea6e7e47cc37 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return error; if (tz) { - /* Verify we're witin the +-15 hrs range */ + /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; -- cgit From d61ca3c25e0330c44d9a18b2a767197f10c0cd16 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Fri, 25 Oct 2019 16:02:07 -0700 Subject: sched/Kconfig: Fix spelling mistake in user-visible help text Fix a spelling mistake in the help text for PREEMPT_RT. Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/157204450499.10518.4542293884417101528.stgit@srivatsa-ubuntu --- kernel/Kconfig.preempt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index deff97217496..bf82259cff96 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -65,7 +65,7 @@ config PREEMPT_RT preemptible priority-inheritance aware variants, enforcing interrupt threading and introducing mechanisms to break up long non-preemptible sections. This makes the kernel, except for very - low level and critical code pathes (entry code, scheduler, low + low level and critical code paths (entry code, scheduler, low level interrupt handling) fully preemptible and brings most execution contexts under scheduler control. -- cgit From 67c0496e87d193b8356d2af49ab95e8a1b954b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: convert kernfs_node->id from union kernfs_node_id to u64 kernfs_node->id is currently a union kernfs_node_id which represents either a 32bit (ino, gen) pair or u64 value. 
I can't see much value in the usage of the union - all that's needed is a 64bit ID which the current code is already limited to. Using a union makes the code unnecessarily complicated and prevents using 64bit ino without adding practical benefits. This patch drops union kernfs_node_id and makes kernfs_node->id a u64. ino is stored in the lower 32bits and gen upper. Accessors - kernfs[_id]_ino() and kernfs[_id]_gen() - are added to retrieve the ino and gen. This simplifies ID handling less cumbersome and will allow using 64bit inos on supported archs. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim Cc: Jens Axboe Cc: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 3 +-- kernel/trace/blktrace.c | 67 +++++++++++++++++++++------------------------- 4 files changed, 34 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..912e761cd17a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgrp->kn->id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index addd6fdceec8..5d867f6d7204 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup->kn->id; map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cf32c0c7a45d..c6bd1a5a1977 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5786,8 +5786,7 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..a986d2e74ca2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; + return 0; return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + - (has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? 
sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1269,9 +1263,10 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, blkcg_name_buf, act, rwbs); } else trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %lx,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + kernfs_id_ino(id), kernfs_id_gen(id), + act, rwbs); } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); -- cgit From fe0f726c9fb626b1092a9ea3bf75f57f2eed676e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: combine ino/id lookup functions into kernfs_find_and_get_node_by_id() kernfs_find_and_get_node_by_ino() looks the kernfs_node matching the specified ino. On top of that, kernfs_get_node_by_id() and kernfs_fh_get_inode() implement full ID matching by testing the rest of ID. On surface, confusingly, the two are slightly different in that the latter uses 0 gen as wildcard while the former doesn't - does it mean that the latter can't uniquely identify inodes w/ 0 gen? In practice, this is a distinction without a difference because generation number starts at 1. There are no actual IDs with 0 gen, so it can always safely used as wildcard. Let's simplify the code by renaming kernfs_find_and_get_node_by_ino() to kernfs_find_and_get_node_by_id(), moving all lookup logics into it, and removing now unnecessary kernfs_get_node_by_id(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c6bd1a5a1977..b5dcbee5aa6c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5790,7 +5790,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); -- cgit From 40430452fd5da1509177ac597b394614cd3a121f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: use 64bit inos if ino_t is 64bit Each kernfs_node is identified with a 64bit ID. The low 32bit is exposed as ino and the high gen. While this already allows using inos as keys by looking up with wildcard generation number of 0, it's adding unnecessary complications for 64bit ino archs which can directly use kernfs_node IDs as inos to uniquely identify each cgroup instance. This patch exposes IDs directly as inos on 64bit ino archs. The conversion is mostly straight-forward. * 32bit ino archs behave the same as before. 64bit ino archs now use the whole 64bit ID as ino and the generation number is fixed at 1. * 64bit inos still use the same idr allocator which gurantees that the lower 32bits identify the current live instance uniquely and the high 32bits are incremented whenever the low bits wrap. 
As the upper 32bits are no longer used as gen and we don't wanna start ino allocation with 33rd bit set, the initial value for highbits allocation is changed to 0 on 64bit ino archs. * blktrace exposes two 32bit numbers - (INO,GEN) pair - to identify the issuing cgroup. Userland builds FILEID_INO32_GEN fids from these numbers to look up the cgroups. To remain compatible with the behavior, always output (LOW32,HIGH32) which will be constructed back to the original 64bit ID by __kernfs_fh_to_dentry(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- kernel/trace/blktrace.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a986d2e74ca2..a7dac5b63f3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1261,12 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); - } else + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regarldess of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry(). + */ trace_seq_printf(&iter->seq, - "%3d,%-3d %lx,%-x %2s %3s ", + "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), - kernfs_id_ino(id), kernfs_id_gen(id), - act, rwbs); + id & U32_MAX, id >> 32, act, rwbs); + } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); -- cgit From 743210386c0354a2f8ef3d697353c7d8477fa81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: cgroup: use cgrp->kn->id as the cgroup ID cgroup ID is currently allocated using a dedicated per-hierarchy idr and used internally and exposed through tracepoints and bpf. This is confusing because there are tracepoints and other interfaces which use the cgroupfs ino as IDs. The preceding changes made kn->id exposed as ino as 64bit ino on supported archs or ino+gen (low 32bits as ino, high gen). There's no reason for cgroup to use different IDs. The kernfs IDs are unique and userland can easily discover them and map them back to paths using standard file operations. This patch replaces cgroup IDs with kernfs IDs. * cgroup_id() is added and all cgroup ID users are converted to use it. * kernfs_node creation is moved to earlier during cgroup init so that cgroup_id() is available during init. * While at it, s/cgroup/cgrp/ in psi helpers for consistency. * Fallback ID value is changed to 1 to be consistent with root cgroup ID. 
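A small userspace sketch (not part of the patch) of the "standard file operations" route mentioned above: on 64-bit ino architectures the cgroup ID is simply the cgroupfs inode number, so a plain stat() recovers it; the cgroup2 mount point and the example cgroup path are assumptions.

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	/* example path; assumes cgroup2 mounted at /sys/fs/cgroup */
	if (stat("/sys/fs/cgroup/user.slice", &st))
		return 1;

	/* on 64-bit ino archs this equals the kernel-side cgroup ID */
	printf("cgroup id: %llu\n", (unsigned long long)st.st_ino);
	return 0;
}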
Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 76 +++++++++++++++------------------------------- kernel/trace/blktrace.c | 4 +-- 4 files changed, 29 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 912e761cd17a..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5d867f6d7204..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b5dcbee5aa6c..c12dcf7dc432 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1308,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1917,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1938,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -1976,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -3552,22 +3544,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? 
&psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4987,9 +4979,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5154,10 +5143,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5177,15 +5168,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5195,7 +5184,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5219,7 +5208,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5248,12 +5237,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); - /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. @@ -5267,8 +5250,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5305,7 +5288,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc//cgroup unparsable */ @@ -5321,27 +5303,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. 
*/ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5356,7 +5330,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7dac5b63f3f..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -171,7 +171,7 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); + blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif @@ -759,7 +759,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bio->bi_blkg) return 0; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) -- cgit From ff51ff84d82aea5a889b85f2b9fb3aa2b8691668 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Oct 2019 11:18:37 +0200 Subject: sched/core: Avoid spurious lock dependencies While seemingly harmless, __sched_fork() does hrtimer_init(), which, when DEBUG_OBJETS, can end up doing allocations. This then results in the following lock order: rq->lock zone->lock.rlock batched_entropy_u64.lock Which in turn causes deadlocks when we do wakeups while holding that batched_entropy lock -- as the random code does. Solve this by moving __sched_fork() out from under rq->lock. This is safe because nothing there relies on rq->lock, as also evident from the other __sched_fork() callsite. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Qian Cai Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: bigeasy@linutronix.de Cc: cl@linux.com Cc: keescook@chromium.org Cc: penberg@kernel.org Cc: rientjes@google.com Cc: thgarnie@google.com Cc: tytso@mit.edu Cc: will@kernel.org Fixes: b7d5dc21072c ("random: add a spinlock_t to struct batched_entropy") Link: https://lkml.kernel.org/r/20191001091837.GK4536@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0f2eb3629070..33cd25051f3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6019,10 +6019,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; -- cgit From b90f7c9d2198d789709390280a43e0a46345682b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 30 Oct 2019 12:18:29 +0100 Subject: sched/pelt: Fix update of blocked PELT ordering update_cfs_rq_load_avg() can call cpufreq_update_util() to trigger an update of the frequency. Make sure that RT, DL and IRQ PELT signals have been updated before calling cpufreq. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: dsmythies@telus.net Cc: juri.lelli@redhat.com Cc: mgorman@suse.de Cc: rostedt@goodmis.org Fixes: 371bf4273269 ("sched/rt: Add rt_rq utilization tracking") Fixes: 3727e0e16340 ("sched/dl: Add dl_rq utilization tracking") Fixes: 91c27493e78d ("sched/irq: Add IRQ utilization tracking") Link: https://lkml.kernel.org/r/1572434309-32512-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 22a2fed29054..69a81a5709ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7547,6 +7547,19 @@ static void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); + /* + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure + * that RT, DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); + update_irq_load_avg(rq, 0); + + /* Don't need periodic decay once load/util_avg are null */ + if (others_have_blocked(rq)) + done = false; + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. @@ -7574,14 +7587,6 @@ static void update_blocked_averages(int cpu) done = false; } - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; - update_blocked_load_status(rq, !done); rq_unlock_irqrestore(rq, &rf); } @@ -7642,12 +7647,18 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + /* + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure + * that RT, DL and IRQ signals have been updated before updating CFS. + */ curr_class = rq->curr->sched_class; update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); + + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); rq_unlock_irqrestore(rq, &rf); } -- cgit From 09f4e8f05d85bfc98fe9227e988a7c1b3ec416ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2019 12:51:04 +0100 Subject: perf/core: Disallow uncore-cgroup events While discussing uncore event scheduling, I noticed we do not in fact seem to dis-allow making uncore-cgroup events. Such events make no sense what so ever because the cgroup is a CPU local state where uncore counts across a number of CPUs. Disallow them. 
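For illustration (not from the patch), the userspace call that this change now rejects with -EINVAL looks roughly like the sketch below; the uncore PMU type value, event encoding and cgroup path are assumptions and would normally be read from sysfs (e.g. /sys/bus/event_source/devices/uncore_*/type).

#include <linux/perf_event.h>
#include <fcntl.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = 18;		/* assumed uncore PMU type from sysfs */
	attr.config = 0x01;	/* assumed event encoding */

	cgrp_fd = open("/sys/fs/cgroup/user.slice", O_RDONLY);
	fd = syscall(SYS_perf_event_open, &attr, cgrp_fd, /*cpu=*/0,
		     /*group_fd=*/-1, PERF_FLAG_PID_CGROUP);
	/* with this change, fd is -1 and errno is EINVAL */
	return fd < 0 ? 1 : 0;
}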
Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..022a34b66e60 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10535,6 +10535,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + /* + * Disallow uncore-cgroup events, they don't make sense as the cgroup will + * be different on other CPUs in the uncore mask. + */ + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + err = -EINVAL; + goto err_pmu; + } + if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; -- cgit From 00496fe5e09e8c8bb115540e7e3470553cd07a5c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 1 Nov 2019 17:12:48 +0200 Subject: perf/aux: Fix the aux_output group inheritance fix Commit f733c6b508bc ("perf/core: Fix inheritance of aux_output groups") adds a NULL pointer dereference in case inherit_group() races with perf_release(), which causes the below crash: > BUG: kernel NULL pointer dereference, address: 000000000000010b > #PF: supervisor read access in kernel mode > #PF: error_code(0x0000) - not-present page > PGD 3b203b067 P4D 3b203b067 PUD 3b2040067 PMD 0 > Oops: 0000 [#1] SMP KASAN > CPU: 0 PID: 315 Comm: exclusive-group Tainted: G B 5.4.0-rc3-00181-g72e1839403cb-dirty #878 > RIP: 0010:perf_get_aux_event+0x86/0x270 > Call Trace: > ? __perf_read_group_add+0x3b0/0x3b0 > ? __kasan_check_write+0x14/0x20 > ? __perf_event_init_context+0x154/0x170 > inherit_task_group.isra.0.part.0+0x14b/0x170 > perf_event_init_task+0x296/0x4b0 Fix this by skipping over events that are getting closed, in the inheritance path. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: f733c6b508bc ("perf/core: Fix inheritance of aux_output groups") Link: https://lkml.kernel.org/r/20191101151248.47327-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 022a34b66e60..b752bd3aa03b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11899,7 +11899,7 @@ static int inherit_group(struct perf_event *parent_event, if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); - if (sub->aux_event == parent_event && + if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } -- cgit From f25d8ba9e1b204b90fbf55970ea6e68955006068 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 30 Oct 2019 15:47:30 +0200 Subject: perf/core: Reattach a misplaced comment A comment is in a wrong place in perf_event_create_kernel_counter(). Fix that. 
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191030134731.5437-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b752bd3aa03b..e18d8d34ca77 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11331,10 +11331,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct perf_event *event; int err; - /* - * Get the target context (task or percpu): - */ - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -11345,6 +11341,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); -- cgit From dce5affb94eb54edfff17727a6240a6a5d998666 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 30 Oct 2019 15:47:31 +0200 Subject: perf/aux: Disallow aux_output for kernel events Commit ab43762ef0109 ("perf: Allow normal events to output AUX data") added 'aux_output' bit to the attribute structure, which relies on AUX events and grouping, neither of which is supported for the kernel events. This notwithstanding, attempts have been made to use it in the kernel code, suggesting the necessity of an explicit hard -EINVAL. Fix this by rejecting attributes with aux_output set for kernel events. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191030134731.5437-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e18d8d34ca77..7655441065a9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11331,6 +11331,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct perf_event *event; int err; + /* + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. + */ + if (attr->aux_output) + return ERR_PTR(-EINVAL); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { -- cgit From 697d877849d4b34ab58d7078d6930bad0ef6fc66 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 5 Nov 2019 09:57:02 +0200 Subject: perf/core: Consistently fail fork on allocation failures Commit: 313ccb9615948 ("perf: Allocate context task_ctx_data for child event") makes the inherit path skip over the current event in case of task_ctx_data allocation failure. This, however, is inconsistent with allocation failures in perf_event_alloc(), which would abort the fork. Correct this by returning an error code on task_ctx_data allocation failure and failing the fork in that case. 
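For context, inherit_event() already uses two different conventions that its callers tell apart; roughly (a simplified sketch of the call-site logic, not a literal quote):

	child_event = inherit_event(parent_event, parent, parent_ctx,
				    child, group_leader, child_ctx);
	if (IS_ERR(child_event))
		return PTR_ERR(child_event);	/* hard failure: abort the fork */
	if (!child_event)
		return 0;			/* parent going away: skip, fork continues */

Returning ERR_PTR(-ENOMEM) therefore makes the task_ctx_data allocation failure take the first path, consistent with allocation failures in perf_event_alloc().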
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191105075702.60319-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7655441065a9..466e333db1f3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11802,7 +11802,7 @@ inherit_event(struct perf_event *parent_event, GFP_KERNEL); if (!child_ctx->task_ctx_data) { free_event(child_event); - return NULL; + return ERR_PTR(-ENOMEM); } } -- cgit From d00dbd29814236ad128ff9517e8f7af6b6ef4ba0 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Wed, 6 Nov 2019 13:25:27 +0000 Subject: perf/core: Fix missing static inline on perf_cgroup_switch() It looks like a "static inline" has been missed in front of the empty definition of perf_cgroup_switch() under certain configurations. Fixes the following sparse warning: kernel/events/core.c:1035:1: warning: symbol 'perf_cgroup_switch' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191106132527.19977-1-ben.dooks@codethink.co.uk Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 466e333db1f3..00a014670ed0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1031,7 +1031,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { } -void +static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -- cgit From deb0c3c29d552ab81ecd5481bb83bf2f4e41927d Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 6 Nov 2019 00:29:35 -0500 Subject: perf/core: Fix unlock balance in perf_init_event() Commit: 66d258c5b048 ("perf/core: Optimize perf_init_event()") introduced an unlock imbalance in perf_init_event() where it calls "goto again" and then only repeat rcu_read_unlock(). Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 66d258c5b048 ("perf/core: Optimize perf_init_event()") Link: https://lkml.kernel.org/r/20191106052935.8352-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6cb6d685191d..8d65e03b98f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10307,7 +10307,6 @@ static struct pmu *perf_init_event(struct perf_event *event) goto unlock; } - rcu_read_lock(); /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. 
@@ -10317,6 +10316,7 @@ static struct pmu *perf_init_event(struct perf_event *event) type = PERF_TYPE_RAW; again: + rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { -- cgit From a4faf00d994c40e64f656805ac375c65e324eefb Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 25 Oct 2019 17:08:33 +0300 Subject: perf/aux: Allow using AUX data in perf samples AUX data can be used to annotate perf events such as performance counters or tracepoints/breakpoints by including it in sample records when PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging and profiling by providing, for example, a history of instruction flow leading up to the event's overflow. The implementation makes use of grouping an AUX event with all the events that wish to take samples of the AUX data, such that the former is the group leader. The samplees should also specify the desired size of the AUX sample via attr.aux_sample_size. AUX capable PMUs need to explicitly add support for sampling, because it relies on a new callback to take a snapshot of the buffer without touching the event states. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: mathieu.poirier@linaro.org Link: https://lkml.kernel.org/r/20191025140835.53665-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 173 +++++++++++++++++++++++++++++++++++++++++++- kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 36 +++++++++ 3 files changed, 207 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8d65e03b98f2..16d80ad8d6d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1941,6 +1941,11 @@ static void perf_put_aux_event(struct perf_event *event) } } +static bool perf_need_aux_event(struct perf_event *event) +{ + return !!event->attr.aux_output || !!event->attr.aux_sample_size; +} + static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { @@ -1953,7 +1958,17 @@ static int perf_get_aux_event(struct perf_event *event, if (!group_leader) return 0; - if (!perf_aux_output_match(event, group_leader)) + /* + * aux_output and aux_sample_size are mutually exclusive. + */ + if (event->attr.aux_output && event->attr.aux_sample_size) + return 0; + + if (event->attr.aux_output && + !perf_aux_output_match(event, group_leader)) + return 0; + + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) @@ -6222,6 +6237,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } +static unsigned long perf_prepare_sample_aux(struct perf_event *event, + struct perf_sample_data *data, + size_t size) +{ + struct perf_event *sampler = event->aux_event; + struct ring_buffer *rb; + + data->aux_size = 0; + + if (!sampler) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) + goto out; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + goto out; + + /* + * If this is an NMI hit inside sampling code, don't take + * the sample. See also perf_aux_sample_output(). 
+ */ + if (READ_ONCE(rb->aux_in_sampling)) { + data->aux_size = 0; + } else { + size = min_t(size_t, size, perf_aux_size(rb)); + data->aux_size = ALIGN(size, sizeof(u64)); + } + ring_buffer_put(rb); + +out: + return data->aux_size; +} + +long perf_pmu_snapshot_aux(struct ring_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + unsigned long flags; + long ret; + + /* + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler + * paths. If we start calling them in NMI context, they may race with + * the IRQ ones, that is, for example, re-starting an event that's just + * been stopped, which is why we're using a separate callback that + * doesn't change the event state. + * + * IRQs need to be disabled to prevent IPIs from racing with us. + */ + local_irq_save(flags); + /* + * Guard against NMI hits inside the critical section; + * see also perf_prepare_sample_aux(). + */ + WRITE_ONCE(rb->aux_in_sampling, 1); + barrier(); + + ret = event->pmu->snapshot_aux(event, handle, size); + + barrier(); + WRITE_ONCE(rb->aux_in_sampling, 0); + local_irq_restore(flags); + + return ret; +} + +static void perf_aux_sample_output(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + struct perf_event *sampler = event->aux_event; + unsigned long pad; + struct ring_buffer *rb; + long size; + + if (WARN_ON_ONCE(!sampler || !data->aux_size)) + return; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + return; + + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); + + /* + * An error here means that perf_output_copy() failed (returned a + * non-zero surplus that it didn't copy), which in its current + * enlightened implementation is not possible. If that changes, we'd + * like to know. + */ + if (WARN_ON_ONCE(size < 0)) + goto out_put; + + /* + * The pad comes from ALIGN()ing data->aux_size up to u64 in + * perf_prepare_sample_aux(), so should not be more than that. + */ + pad = data->aux_size - size; + if (WARN_ON_ONCE(pad >= sizeof(u64))) + pad = 8; + + if (pad) { + u64 zero = 0; + perf_output_copy(handle, &zero, pad); + } + +out_put: + ring_buffer_put(rb); +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -6541,6 +6672,13 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_AUX) { + perf_output_put(handle, data->aux_size); + + if (data->aux_size) + perf_aux_sample_output(event, handle, data); + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6729,6 +6867,35 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); + + if (sample_type & PERF_SAMPLE_AUX) { + u64 size; + + header->size += sizeof(u64); /* size */ + + /* + * Given the 16bit nature of header::size, an AUX sample can + * easily overflow it, what with all the preceding sample bits. + * Make sure this doesn't happen by using up to U16_MAX bytes + * per sample in total (rounded down to 8 byte boundary). 
+ */ + size = min_t(size_t, U16_MAX - header->size, + event->attr.aux_sample_size); + size = rounddown(size, 8); + size = perf_prepare_sample_aux(event, data, size); + + WARN_ON_ONCE(size + header->size > U16_MAX); + header->size += size; + } + /* + * If you're adding more sample types here, you likely need to do + * something about the overflowing header::size, like repurpose the + * lowest 3 bits of size, which should be always zero at the moment. + * This raises a more important question, do we really need 512k sized + * samples and why, so good argumentation is in order for whatever you + * do here next. + */ + WARN_ON_ONCE(header->size & 7); } static __always_inline int @@ -10727,7 +10894,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -11277,7 +11444,7 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) goto err_locked; /* diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 3aef4191798c..747d67f130cb 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -50,6 +50,7 @@ struct ring_buffer { unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; + int aux_in_sampling; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 246c83ac5643..7ffd5c763f93 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle) } EXPORT_SYMBOL_GPL(perf_get_aux); +/* + * Copy out AUX data from an AUX handle. + */ +long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to) +{ + unsigned long tocopy, remainder, len = 0; + struct ring_buffer *rb = aux_handle->rb; + void *addr; + + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + + do { + tocopy = PAGE_SIZE - offset_in_page(from); + if (to > from) + tocopy = min(tocopy, to - from); + if (!tocopy) + break; + + addr = rb->aux_pages[from >> PAGE_SHIFT]; + addr += offset_in_page(from); + + remainder = perf_output_copy(handle, addr, tocopy); + if (remainder) + return -EFAULT; + + len += tocopy; + from += tocopy; + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + } while (to != from); + + return len; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) -- cgit From c759bc47db0fb8c02ecf2b2acc4b7fc6e4099039 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Nov 2019 12:12:52 +0300 Subject: locking/lockdep: Update the comment for __lock_release() This changes "to the list" to "from the list" and also deletes the obsolete comment about the "@nested" argument. The "nested" argument was removed in this commit, earlier this year: 5facae4f3549 ("locking/lockdep: Remove unused @nested argument from lock_release()"). Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Will Deacon Link: https://lkml.kernel.org/r/20191104091252.GA31509@mwanda Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8123518f9045..32282e7112d3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4208,11 +4208,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) } /* - * Remove the lock to the list of currently held locks - this gets + * Remove the lock from the list of currently held locks - this gets * called on mutex_unlock()/spin_unlock*() (or on a failed * mutex_lock_interruptible()). - * - * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int __lock_release(struct lockdep_map *lock, unsigned long ip) -- cgit From cf25e24db61cc9df42c47485a2ec2bff4e9a3692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2019 11:07:58 +0100 Subject: time: Rename tsk->real_start_time to ->start_boottime Since it stores CLOCK_BOOTTIME, not, as the name suggests, CLOCK_REALTIME, let's rename ->real_start_time to ->start_bootime. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..1392ee8f4848 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2130,7 +2130,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boottime_ns(); + p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. -- cgit From 763e34e74bb7d5c316015e2e39fcc8520bfd071c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:07:06 -0500 Subject: ftrace: Add register_ftrace_direct() Add the start of the functionality to allow other trampolines to use the ftrace mcount/fentry/nop location. This adds two new functions: register_ftrace_direct() and unregister_ftrace_direct() Both take two parameters: the first is the instruction address of where the mcount/fentry/nop exists, and the second is the trampoline to have that location called. This will handle cases where ftrace is already used on that same location, and will make it still work, where the registered direct called trampoline will get called after all the registered ftrace callers are handled. Currently, it will not allow for IP_MODIFY functions to be called at the same locations, which include some kprobes and live kernel patching. At this point, no architecture supports this. This is only the start of implementing the framework. 
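To make the intended usage concrete, a caller would look roughly like the sketch below. This is illustrative only: "my_tramp" stands for an architecture-specific assembly trampoline that saves and restores the traced function's arguments, and attaching to wake_up_process() is just an example of a traced location.

#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/sched.h>

extern void my_tramp(void);	/* arch-specific asm stub, not shown here */

static int __init direct_example_init(void)
{
	/* have the fentry/mcount nop of wake_up_process() call my_tramp */
	return register_ftrace_direct((unsigned long)wake_up_process,
				      (unsigned long)my_tramp);
}

static void __exit direct_example_exit(void)
{
	unregister_ftrace_direct((unsigned long)wake_up_process,
				 (unsigned long)my_tramp);
}

module_init(direct_example_init);
module_exit(direct_example_exit);
MODULE_LICENSE("GPL");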
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++ kernel/trace/ftrace.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 272 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..624a05e99b0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_DYNAMIC_FTRACE_WITH_REGS bool +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -557,6 +560,11 @@ config DYNAMIC_FTRACE_WITH_REGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0e7f03919de..329a3f3789a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1023,6 +1023,7 @@ static bool update_all_ops; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; + unsigned long direct; /* for direct lookup only */ }; struct ftrace_func_probe { @@ -1730,6 +1731,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return false; + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + /* * If there's only a single callback registered to a * function, and the ops has a trampoline registered @@ -1757,6 +1761,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return false; rec->flags--; + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -2092,15 +2105,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * If enabling and the REGS flag does not match the REGS_EN, or * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. */ if (flag) { - if (!(rec->flags & FTRACE_FL_REGS) != + if (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)) flag |= FTRACE_FL_REGS; - if (!(rec->flags & FTRACE_FL_TRAMP) != + if (!(rec->flags & FTRACE_FL_TRAMP) != !(rec->flags & FTRACE_FL_TRAMP_EN)) flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. 
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } } /* If the state of this record hasn't changed, then do nothing */ @@ -2125,6 +2157,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } } /* @@ -2154,7 +2205,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * and REGS states. The _EN flags must be disabled though. */ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | - FTRACE_FL_REGS_EN); + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); } ftrace_bug_type = FTRACE_BUG_NOP; @@ -2309,6 +2360,51 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. + */ +static unsigned long find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + unsigned long addr; + + addr = find_rec_direct(ip); + if (!addr) + return; + + arch_ftrace_set_direct_caller(regs, addr); +} + +struct ftrace_ops direct_ops = { + .func = call_direct_funcs, + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + | FTRACE_OPS_FL_PERMANENT, +}; +#else +static inline unsigned long find_rec_direct(unsigned long ip) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -2322,6 +2418,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP) { @@ -2354,6 +2459,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP_EN) { @@ -3465,10 +3579,11 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & 
FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s", + seq_printf(m, " (%ld)%s%s%s", ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", - rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3484,6 +3599,13 @@ static int t_show(struct seq_file *m, void *v) } else { add_trampoline_func(m, NULL, rec); } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } } seq_putc(m, '\n'); @@ -4815,6 +4937,143 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/** + * register_ftrace_direct - Call a custom trampoline directly + * @ip: The address of the nop at the beginning of a function + * @addr: The address of the trampoline to call at @ip + * + * This is used to connect a direct call from the nop location (@ip) + * at the start of ftrace traced functions. The location that it calls + * (@addr) must be able to handle a direct call, and save the parameters + * of the function being traced, and restore them (or inject new ones + * if needed), before returning. + * + * Returns: + * 0 on success + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *free_hash = NULL; + struct dyn_ftrace *rec; + int ret = -EBUSY; + + mutex_lock(&direct_mutex); + + /* See if there's a direct function at @ip already */ + if (find_rec_direct(ip)) + goto out_unlock; + + ret = -ENODEV; + rec = lookup_rec(ip, ip); + if (!rec) + goto out_unlock; + + /* + * Check if the rec says it has a direct call but we didn't + * find one earlier? + */ + if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) + goto out_unlock; + + /* Make sure the ip points to the exact record */ + ip = rec->ip; + + ret = -ENOMEM; + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 
0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + goto out_unlock; + + free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_unlock; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); + if (ret) + remove_hash_entry(direct_functions, entry); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(&direct_ops); + if (ret) + ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + } + + if (ret) + kfree(entry); + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct); + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) { + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + goto out_unlock; + } + + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + if (direct_functions->count == 1) + unregister_ftrace_function(&direct_ops); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + + WARN_ON(ret); + + remove_hash_entry(direct_functions, entry); + + out_unlock: + mutex_unlock(&direct_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address * @ops - the ops to set the filter with -- cgit From 013bf0da0474816f57739daa006c8564ad7396a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:11:27 -0500 Subject: ftrace: Add ftrace_find_direct_func() As function_graph tracer modifies the return address to insert a trampoline to trace the return of a function, it must be aware of a direct caller, as when it gets called, the function's return address may not be at on the stack where it expects. It may have to see if that return address points to the a direct caller and adjust if it is. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 329a3f3789a1..c4446eabacbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4938,6 +4938,46 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +/** + * ftrace_find_direct_func - test an address if it is a registered direct caller + * @addr: The address of a registered direct caller + * + * This searches to see if a ftrace direct caller has been registered + * at a specific address, and if so, it returns a descriptor for it. + * + * This can be used by architecture code to see if an address is + * a direct caller (trampoline) attached to a fentry/mcount location. 
+ * This is useful for the function_graph tracer, as it may need to + * do adjustments if it traced a location that also has a direct + * trampoline attached to it. + */ +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *entry; + bool found = false; + + /* May be called by fgraph trampoline (protected by rcu tasks) */ + list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { + if (entry->addr == addr) { + found = true; + break; + } + } + if (found) + return entry; + + return NULL; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -4957,6 +4997,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { + struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5005,6 +5046,18 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; + direct = ftrace_find_direct_func(addr); + if (!direct) { + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) { + kfree(entry); + goto out_unlock; + } + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + } + entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5019,8 +5072,20 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } - if (ret) + if (ret) { kfree(entry); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + if (free_hash) + free_ftrace_hash(free_hash); + free_hash = NULL; + } + } else { + if (!direct->count) + direct->count++; + } out_unlock: mutex_unlock(&direct_mutex); @@ -5036,6 +5101,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_func_entry *entry; + struct ftrace_direct_func *direct; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5066,6 +5132,17 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); + direct = ftrace_find_direct_func(addr); + if (!WARN_ON(!direct)) { + /* This is the good path (see the ! before WARN) */ + direct->count--; + WARN_ON(direct->count < 0); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + } + } out_unlock: mutex_unlock(&direct_mutex); -- cgit From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:57 -0500 Subject: ftrace/x86: Add a counter to test function_graph with direct As testing for direct calls from the function graph tracer adds a little overhead (which is a lot when tracing every function), add a counter that can be used to test if function_graph tracer needs to test for a direct caller or not. It would have been nicer if we could use a static branch, but the static branch logic fails when used within the function graph tracer trampoline. 
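The intended fast path looks roughly like this (a simplified sketch, assuming the counter is visible to the consumer; the real user sits in the function_graph/arch code):

static bool addr_has_direct_caller(unsigned long addr)
{
	/* cheap check: no direct trampolines registered at all */
	if (!ftrace_direct_func_count)
		return false;

	/* only now pay for the lookup of registered direct trampolines */
	return ftrace_find_direct_func(addr) != NULL;
}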
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c4446eabacbe..f9456346ec66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct->addr = addr; direct->count = 0; list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; } entry->ip = ip; @@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (free_hash) free_ftrace_hash(free_hash); free_hash = NULL; + ftrace_direct_func_count--; } } else { if (!direct->count) @@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) list_del_rcu(&direct->next); synchronize_rcu_tasks(); kfree(direct); + ftrace_direct_func_count--; } } out_unlock: -- cgit From da537f0aef1372c5204356a7df06be8769467b7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Oct 2019 14:38:07 -0400 Subject: ftrace: Add information on number of page groups allocated Looking for ways to shrink the size of the dyn_ftrace structure, knowing the information about how many pages and the number of groups of those pages, is useful in working out the best ways to save on memory. This adds one info print on how many groups of pages were used to allocate the ftrace dyn_ftrace structures, and also shows the number of pages and groups in the dyn_ftrace_total_info (which is used for debugging). 
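For reference, the file can be read like any other tracefs file; a minimal reader is sketched below (the path assumes debugfs is mounted at /sys/kernel/debug):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/tracing/dyn_ftrace_total_info", "r");

	if (!f) {
		perror("dyn_ftrace_total_info");
		return 1;
	}

	/* after this patch the line reads "<count> pages:<pages> groups: <groups>" */
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}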
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 ++++++++++++++ kernel/trace/trace.c | 21 +++++++++++++++------ kernel/trace/trace.h | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9456346ec66..d2d488c43a6a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2991,6 +2991,8 @@ static void ftrace_shutdown_sysctl(void) static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; static inline int ops_traces_mod(struct ftrace_ops *ops) { @@ -3115,6 +3117,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) goto again; } + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; pg->size = cnt; @@ -3170,6 +3175,8 @@ ftrace_allocate_pages(unsigned long num_to_init) start_pg = pg->next; kfree(pg); pg = start_pg; + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; @@ -6173,6 +6180,8 @@ void ftrace_release_mod(struct module *mod) free_pages((unsigned long)pg->records, order); tmp_page = pg->next; kfree(pg); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } } @@ -6514,6 +6523,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) @@ -6569,6 +6580,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a0ee9178365..5ea8c7c0f2d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,14 +7583,23 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned long *p = filp->private_data; - char buf[64]; /* Not too big for a shallow stack */ + ssize_t ret; + char *buf; int r; - r = scnprintf(buf, 63, "%ld", *p); - buf[r++] = '\n'; + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -8782,7 +8791,7 @@ static __init int tracer_init_tracefs(void) #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); + NULL, &tracing_dyn_info_fops); #endif create_trace_instances(d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..8b590f10bc72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +extern unsigned long ftrace_number_of_pages; 
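As an illustration of the intended consumer (not part of this patch), a userspace collector can simply watch the file with inotify: the FS_MODIFY event generated here is delivered to watchers as IN_MODIFY. The debugfs mount point below is an assumption, and error handling is kept minimal.

#include <sys/inotify.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd, wd;

	fd = inotify_init();
	if (fd < 0) {
		perror("inotify_init");
		return 1;
	}

	wd = inotify_add_watch(fd,
			       "/sys/kernel/debug/tracing/tracing_max_latency",
			       IN_MODIFY);
	if (wd < 0) {
		perror("inotify_add_watch");
		return 1;
	}

	for (;;) {
		/* each read returns one or more struct inotify_event records */
		if (read(fd, buf, sizeof(buf)) < 0)
			break;
		printf("new max latency recorded; grab the trace now\n");
	}

	close(fd);
	return 0;
}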
+extern unsigned long ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } -- cgit From 91edde2e6ae1dd5e33812f076f3fe4cb7ccbfdd0 Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:21 +0200 Subject: ftrace: Implement fs notification for tracing_max_latency This patch implements the feature that the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency will receive notifications through the fsnotify framework when a new latency is available. One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode. This facility works with the hwlat, preempt/irqsoff, and wakeup tracers. The tracers may call the latency_fsnotify() from places such as __schedule() or do_idle(); this makes it impossible to call queue_work() directly without risking a deadlock. The same would happen with a softirq, kernel thread or tasklet. For this reason we use the irq_work mechanism to call queue_work(). This patch creates a new workqueue. The reason for doing this is that I wanted to use the WQ_UNBOUND and WQ_HIGHPRI flags. My thinking was that WQ_UNBOUND might help with the latency in some important cases. If we use: queue_work(system_highpri_wq, &tr->fsnotify_work); then the work will (almost) always execute on the same CPU but if we are unlucky that CPU could be too busy while there could be another CPU in the system that would be able to process the work soon enough. queue_work_on() could be used to queue the work on another CPU but it seems difficult to select the right CPU. 
Link: http://lkml.kernel.org/r/20191008220824.7911-2-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) [ Added max() to have one compare for max latency ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 18 +++++++++++ kernel/trace/trace_hwlat.c | 11 ++++--- 3 files changed, 98 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ea8c7c0f2d7..f093a433cb42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -45,6 +45,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "trace_output.h" @@ -1497,6 +1500,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; +static const struct file_operations tracing_max_lat_fops; + +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, + tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + d_tracer, &tr->max_latency, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. 
+ */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +/* + * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + * defined(CONFIG_FSNOTIFY) + */ +#else + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", 0644, d_tracer, \ + &tr->max_latency, &tracing_max_lat_fops) + +#endif #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -1536,6 +1607,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -8594,8 +8666,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tr->max_latency, &tracing_max_lat_fops); + trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b590f10bc72..718eb998c13e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -264,6 +266,11 @@ struct trace_array { #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif #endif struct trace_pid_list __rcu *filtered_pids; /* @@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +void latency_fsnotify(struct trace_array *tr); + +#else + +static void latency_fsnotify(struct trace_array *tr) { } + +#endif + #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 862f4b0139fc..63526670605a 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -237,6 +237,7 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { struct hwlat_sample s; + u64 latency; ret = 1; @@ -253,11 +254,13 @@ static int get_sample(void) s.nmi_count = nmi_count; trace_hwlat_sample(&s); + latency = max(sample, outer_sample); + /* Keep a running maximum ever recorded hardware latency */ - if (sample > tr->max_latency) - tr->max_latency = sample; - if (outer_sample > tr->max_latency) - tr->max_latency = outer_sample; + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } } out: -- cgit From 793937236d1ee032d2ee5ccc27bdd280a04e766e Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:22 +0200 Subject: preemptirq_delay_test: Add the burst feature and a sysfs trigger This burst feature enables the user to generate a burst of preempt/irqsoff latencies. This makes it possible to test whether we are able to detect latencies that systematically occur very close to each other. The maximum burst size is 10. We also create 10 identical test functions, so that we get 10 different backtraces; this is useful when we want to test whether we can detect all the latencies in a burst. 
Otherwise, there would be no easy way of differentiating between which latency in a burst was captured by the tracer. In addition, there is a sysfs trigger, so that it's not necessary to reload the module to repeat the test. The trigger will appear as /sys/kernel/preemptirq_delay_test/trigger in sysfs. Link: http://lkml.kernel.org/r/20191008220824.7911-3-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +- kernel/trace/preemptirq_delay_test.c | 144 ++++++++++++++++++++++++++++++----- 2 files changed, 128 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 624a05e99b0b..d25314bc7a1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -760,9 +760,9 @@ config PREEMPTIRQ_DELAY_TEST configurable delay. The module busy waits for the duration of the critical section. - For example, the following invocation forces a one-time irq-disabled - critical section for 500us: - modprobe preemptirq_delay_test test_mode=irq delay=500000 + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 If unsure, say N diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index d8765c952fab..31c0fad4cb9e 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -10,18 +10,25 @@ #include #include #include +#include #include #include #include #include +#include static ulong delay = 100; -static char test_mode[10] = "irq"; +static char test_mode[12] = "irq"; +static uint burst_size = 1; -module_param_named(delay, delay, ulong, S_IRUGO); -module_param_string(test_mode, test_mode, 10, S_IRUGO); -MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); -MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); + +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) static void busy_wait(ulong time) { @@ -34,37 +41,136 @@ static void busy_wait(ulong time) } while ((end - start) < (time * 1000)); } -static int preemptirq_delay_run(void *data) +static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} - if (!strcmp(test_mode, "irq")) { - local_irq_save(flags); - busy_wait(delay); - local_irq_restore(flags); - } else if (!strcmp(test_mode, "preempt")) { - preempt_disable(); - busy_wait(delay); - preempt_enable(); +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); } +} + +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. + */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + + for (i = 0; i < s; i++) + (testfuncs[i])(i); return 0; } -static int __init preemptirq_delay_init(void) +static struct task_struct *preemptirq_start_test(void) { char task_name[50]; - struct task_struct *test_task; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + return kthread_run(preemptirq_delay_run, NULL, task_name); +} + + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + preemptirq_start_test(); + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + struct task_struct *test_task; + int retval; + + test_task = preemptirq_start_test(); + retval = PTR_ERR_OR_ZERO(test_task); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if (retval) + kobject_put(preemptirq_delay_kobj); - test_task = kthread_run(preemptirq_delay_run, NULL, task_name); - return PTR_ERR_OR_ZERO(test_task); + return retval; } static void __exit preemptirq_delay_exit(void) { - return; + kobject_put(preemptirq_delay_kobj); } module_init(preemptirq_delay_init) -- cgit From 9c34fc4b7e903117cc27712b9e6c8690debb7e95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:20 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by 
CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Add additional header output for PREEMPT_RT. Link: http://lkml.kernel.org/r/20191015191821.11479-34-bigeasy@linutronix.de Cc: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f093a433cb42..db7d06a26861 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3726,6 +3726,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", +#elif defined(CONFIG_PREEMPT_RT) + "preempt_rt", #else "unknown", #endif -- cgit From 6dff4d7dd3e0158688683a17dd792861aa9d61e2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 13:10:12 +0100 Subject: tracing: Make internal ftrace events static The event_class_ftrace_##call and event_##call do not seem to be used outside of trace_export.c so make them both static to avoid a number of sparse warnings: kernel/trace/trace_entries.h:59:1: warning: symbol 'event_class_ftrace_function' was not declared. Should it be static? kernel/trace/trace_entries.h:59:1: warning: symbol '__event_function' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol 'event_class_ftrace_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol '__event_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol 'event_class_ftrace_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol '__event_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol 'event_class_ftrace_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol '__event_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol 'event_class_ftrace_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol '__event_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol 'event_class_ftrace_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol '__event_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol 'event_class_ftrace_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol '__event_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol 'event_class_ftrace_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol '__event_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol 'event_class_ftrace_print' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol '__event_print' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol 'event_class_ftrace_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol '__event_raw_data' was not declared. Should it be static? 
kernel/trace/trace_entries.h:262:1: warning: symbol 'event_class_ftrace_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol '__event_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol 'event_class_ftrace_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol '__event_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol 'event_class_ftrace_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol '__event_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol 'event_class_ftrace_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol '__event_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol 'event_class_ftrace_hwlat' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol '__event_hwlat' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191015121012.18824-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..2e6d2e9741cc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct trace_event_class __refdata event_class_ftrace_##call = { \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ @@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct trace_event_call __used \ +static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -- cgit From 2d6425af61166e026e7476db64f70f1266127b1d Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:23 -0700 Subject: tracing: Declare newly exported APIs in include/linux/trace.h Declare the newly introduced and exported APIs in the header file - include/linux/trace.h. Moving previous declarations from kernel/trace/trace.h to include/linux/trace.h. 
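For orientation, a minimal sketch (not from the patch) of the kind of kernel-side user this declaration move enables; the struct trace_array pointer is assumed to come from the instance API exported elsewhere in this series, and my_instance_printk() is a made-up name:

#include <linux/kernel.h>
#include <linux/trace.h>

/* Hypothetical helper: log a value into a private ftrace instance "tr". */
static void my_instance_printk(struct trace_array *tr, int val)
{
	/* trace_array_printk() is now declared in include/linux/trace.h */
	trace_array_printk(tr, _THIS_IP_, "saw val=%d\n", val);
}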
Link: http://lkml.kernel.org/r/1565805327-579-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 718eb998c13e..90cba68c8b50 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,8 +874,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...); int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1890,7 +1889,6 @@ extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); -void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); -- cgit From e585e6469d6f476b82aa148dc44aaf7ae269a4e2 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:24 -0700 Subject: tracing: Verify if trace array exists before destroying it. A trace array can be destroyed from userspace or kernel. Verify if the trace array exists before proceeding to destroy/remove it. Link: http://lkml.kernel.org/r/1565805327-579-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Removed unneeded braces ] Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++++- kernel/trace/trace.c | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..6e2fd40a6ed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); - module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod) if (err) return err; + /* Make module executable after ftrace is enabled */ + mutex_lock(&module_mutex); + module_enable_x(mod); + mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db7d06a26861..fa4f742fc449 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8556,17 +8556,26 @@ static int __remove_instance(struct trace_array *tr) return 0; } -int trace_array_destroy(struct trace_array *tr) +int trace_array_destroy(struct trace_array *this_tr) { + struct trace_array *tr; int ret; - if (!tr) + if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = __remove_instance(tr); + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. 
*/ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); -- cgit From 953ae45a0c25e09428d4a03d7654f97ab8a36647 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:25 -0700 Subject: tracing: Adding NULL checks for trace_array descriptor pointer As part of commit f45d1225adb0 ("tracing: Kernel access to Ftrace instances") we exported certain functions. Here, we are adding some additional NULL checks to ensure safe usage by users of these APIs. Link: http://lkml.kernel.org/r/1565805327-579-4-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4f742fc449..79fe4d6ecbd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3297,6 +3297,9 @@ int trace_array_printk(struct trace_array *tr, if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; + if (!tr) + return -ENOENT; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..2a3ac2365445 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) char *event = NULL, *sub = NULL, *match; int ret; + if (!tr) + return -ENOENT; /* * The buf format can be : * *: means any event by that name. -- cgit From b83b43ffc6e4b514ca034a0fbdee01322e2f7022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 15 Oct 2019 09:00:55 -0400 Subject: fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub The C compiler is allowing more checks to make sure that function pointers are assigned to the correct prototype function. Unfortunately, the function graph tracer uses a special name with its assigned ftrace_graph_return function pointer that maps to a stub function used by the function tracer (ftrace_stub). The ftrace_graph_return variable is compared to the ftrace_stub in some archs to know if the function graph tracer is enabled or not. This means we can not just simply create a new function stub that compares it without modifying all the archs. Instead, have the linker script create a function_graph_stub that maps to ftrace_stub, and this way we can define the prototype for it to match the prototype of ftrace_graph_return, and make the compiler checks all happy! Link: http://lkml.kernel.org/r/20191015090055.789a0aed@gandalf.local.home Cc: linux-sh@vger.kernel.org Cc: Yoshinori Sato Cc: Rich Felker Reported-by: Sami Tolvanen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 7950a0356042..fa3ce10d0405 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) return 0; } +/* + * Simply points to ftrace_stub, but with the proper protocol. 
+ * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_graph_stub(struct ftrace_graph_ret *); + /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit From 80042c8f06bf5a7b87a63deaa3deb56f2cd52645 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:56 +0300 Subject: tracing: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in the code. Link: http://lkml.kernel.org/r/20191007135656.37734-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ kernel/trace/trace_branch.c | 8 ++++---- kernel/trace/trace_stat.c | 6 ++---- kernel/trace/trace_stat.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2d488c43a6a..82ef8d60a42b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->time < b->time) return -1; @@ -479,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2) } #else /* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3ea65cdff30d..88e158d27965 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m) return 0; } -static inline long get_incorrect_percent(struct ftrace_branch_data *p) +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) { long percent; @@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx) return p; } -static int annotated_branch_stat_cmp(void *p1, void *p2) +static int annotated_branch_stat_cmp(const void *p1, const void *p2) { - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; long percent_a, percent_b; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 9ab0a1a7ad5e..874f1274cf99 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -72,9 +72,7 @@ static void 
destroy_session(struct stat_session *session) kfree(session); } -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; @@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) * This one will force an insertion as right-most node * in the rbtree. */ -static int dummy_cmp(void *p1, void *p2) +static int dummy_cmp(const void *p1, const void *p2) { return -1; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8786d17caf49..31d7dc5bf1db 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -16,7 +16,7 @@ struct tracer_stat { void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ - int (*stat_cmp)(void *p1, void *p2); + cmp_func_t stat_cmp; /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); /* Release an entry */ -- cgit From 0c3c86bdc691c794a6154f8515b7fa82c82dfc4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:17 -0700 Subject: tracing/hwlat: Fix a few trivial nits Update the source file name in the comments, and fix a grammatical error. Link: http://lkml.kernel.org/r/157073346821.17189.8946944856026592247.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 63526670605a..6638d63f0921 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * trace_hwlatdetect.c - A simple Hardware Latency detector. + * trace_hwlat.c - A simple Hardware Latency detector. * * Use this tracer to detect large system latencies induced by the behavior of * certain underlying system hardware or firmware, independent of Linux itself. @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) return; /* * If for some reason the user modifies the CPU affinity - * of this thread, than stop migrating for the duration + * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) -- cgit From 6ee40511cb838f9ced002dff7131bca87e3ccbdd Mon Sep 17 00:00:00 2001 From: Yuming Han Date: Thu, 24 Oct 2019 11:34:30 +0800 Subject: tracing: use kvcalloc for tgid_map array allocation Fail to allocate memory for tgid_map, because it requires order-6 page. 
detail as: c3 sh: page allocation failure: order:6, mode:0x140c0c0(GFP_KERNEL), nodemask=(null) c3 sh cpuset=/ mems_allowed=0 c3 CPU: 3 PID: 5632 Comm: sh Tainted: G W O 4.14.133+ #10 c3 Hardware name: Generic DT based system c3 Backtrace: c3 [] (dump_backtrace) from [](show_stack+0x18/0x1c) c3 [] (show_stack) from [](dump_stack+0x84/0xa4) c3 [] (dump_stack) from [](warn_alloc+0xc4/0x19c) c3 [] (warn_alloc) from [](__alloc_pages_nodemask+0xd18/0xf28) c3 [] (__alloc_pages_nodemask) from [](kmalloc_order+0x20/0x38) c3 [] (kmalloc_order) from [](kmalloc_order_trace+0x24/0x108) c3 [] (kmalloc_order_trace) from [](set_tracer_flag+0xb0/0x158) c3 [] (set_tracer_flag) from [](trace_options_core_write+0x7c/0xcc) c3 [] (trace_options_core_write) from [](__vfs_write+0x40/0x14c) c3 [] (__vfs_write) from [](vfs_write+0xc4/0x198) c3 [] (vfs_write) from [](SyS_write+0x6c/0xd0) c3 [] (SyS_write) from [](ret_fast_syscall+0x0/0x54) Switch to use kvcalloc to avoid unexpected allocation failures. Link: http://lkml.kernel.org/r/1571888070-24425-1-git-send-email-chunyan.zhang@unisoc.com Signed-off-by: Yuming Han Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 79fe4d6ecbd8..42659ce6ac0c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4686,7 +4686,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { -- cgit From c7411a1a126f649be71526a36d4afac9e5aefa13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Oct 2019 17:31:44 +0900 Subject: tracing/kprobe: Check whether the non-suffixed symbol is notrace Check whether the non-suffixed symbol is notrace, since suffixed symbols are generated by the compilers for optimization. Based on these suffixed symbols, notrace check might not work because some of them are just a partial code of the original function. (e.g. cold-cache (unlikely) code is separated from original function as FUNCTION.cold.XX) For example, without this fix, # echo p device_add.cold.67 > /sys/kernel/debug/tracing/kprobe_events sh: write error: Invalid argument # cat /sys/kernel/debug/tracing/error_log [ 135.491035] trace_kprobe: error: Failed to register probe event Command: p device_add.cold.67 ^ # dmesg | tail -n 1 [ 135.488599] trace_kprobe: Could not probe notrace function device_add.cold.67 With this, # echo p device_add.cold.66 > /sys/kernel/debug/tracing/kprobe_events # cat /sys/kernel/debug/kprobes/list ffffffff81599de9 k device_add.cold.66+0x0 [DISABLED] Actually, kprobe blacklist already did similar thing, see within_kprobe_blacklist(). 
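The underlying idea is to strip the compiler-generated suffix at the first '.' and re-run the notrace check on the base symbol. A minimal sketch of just that string handling (illustration only, not the patch itself; KSYM_NAME_LEN and strscpy() are the usual kernel definitions):

#include <linux/kallsyms.h>
#include <linux/string.h>

/* e.g. "device_add.cold.67" -> "device_add" */
static void kprobe_base_symbol(const char *symname, char base[KSYM_NAME_LEN])
{
	char *p;

	strscpy(base, symname, KSYM_NAME_LEN);
	p = strchr(base, '.');
	if (p)
		*p = '\0';
}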
Link: http://lkml.kernel.org/r/157233790394.6706.18243942030937189679.stgit@devnote2 Fixes: 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..7f890262c8a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call, #if defined(CONFIG_KPROBES_ON_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) -static bool within_notrace_func(struct trace_kprobe *tk) +static bool __within_notrace_func(unsigned long addr) { - unsigned long offset, size, addr; + unsigned long offset, size; - addr = trace_kprobe_address(tk); if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; @@ -452,6 +451,28 @@ static bool within_notrace_func(struct trace_kprobe *tk) */ return !ftrace_location_range(addr, addr + size - 1); } + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} #else #define within_notrace_func(tk) (false) #endif -- cgit From ef56e047b2bd4dabb801fd073dfcab5f40de5f78 Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:38 +0100 Subject: tracing: Use seq_buf_hex_dump() to dump buffers Without this, buffers can be printed with __print_array macro that has no formatting options and can be hard to read. The other way is to mimic formatting capability with multiple calls of trace event with one call per row which gives performance impact and different timestamp in each row. 
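As a usage sketch (not part of the patch), output code holding a trace_seq could hand a buffer to the new helper roughly as below; the declaration is assumed to be made available by the header side of this series, DUMP_PREFIX_OFFSET is the usual hexdump prefix constant, and the function name and buffer are invented:

#include <linux/printk.h>		/* DUMP_PREFIX_OFFSET */
#include <linux/trace_events.h>

/* Hypothetical: render "buf" as 16-byte rows inside a trace_seq. */
static const char *dump_event_payload(struct trace_seq *p,
				      const void *buf, size_t len)
{
	return trace_print_hex_dump_seq(p, "raw: ", DUMP_PREFIX_OFFSET,
					16, 1, buf, len, false);
}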
Link: http://lkml.kernel.org/r/1573130738-29390-2-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 15 +++++++++++++++ kernel/trace/trace_seq.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d54ce252b05a..d9b4b7c22db4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count, } EXPORT_SYMBOL(trace_print_array_seq); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 6b1c562ffdaf..344e4c1aa09c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); -- cgit From 9b4712044d059e7842aaeeafd7c7a7ee88c589db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Nov 2019 18:42:19 +0100 Subject: tracing: Remove stray tab in TRACE_EVAL_MAP_FILE's help text There was a stray tab in the help text of the aforementioned config option which showed like this: The "print fmt" of the trace events will show the enum/sizeof names instead of their values. This can cause problems for user space tools ... in menuconfig. Remove it and end a sentence with a fullstop. No functional changes. Link: http://lkml.kernel.org/r/20191112174219.10933-1-bp@alien8.de Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d25314bc7a1c..b872716bb2a0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -771,7 +771,7 @@ config TRACE_EVAL_MAP_FILE depends on TRACING help The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools + instead of their values. This can cause problems for user space tools that use this string to parse the raw data as user space does not know how to convert the string to its value. @@ -792,7 +792,7 @@ config TRACE_EVAL_MAP_FILE they are needed for the "eval_map" file. 
Enabling this option will increase the memory footprint of the running kernel. - If unsure, say N + If unsure, say N. config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" -- cgit From d7495343228f30d8206e92dccfd1c41adcfa142d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Nov 2019 14:46:51 -0800 Subject: cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root() 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") added WARN which triggers if cgroup_id(root_cgrp) is not 1. This is fine on 64bit ino archs but on 32bit archs cgroup ID is ((gen << 32) | ino) and gen starts at 1, so the root id is 0x1_0000_0001 instead of 1 always triggering the WARN. What we wanna make sure is that the ino part is 1. Fix it. Reported-by: Naresh Kamboju Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c12dcf7dc432..53098c1d45e2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1966,7 +1966,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; - WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); -- cgit From 36b3615dc3b625c8b587f34e413a600f7ac16403 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 22:43:58 -0500 Subject: tracing: Add missing "inline" in stub function of latency_fsnotify() The latency_fsnotify() stub when the function is not defined, was missing the "inline". Link: https://lore.kernel.org/r/20191115140213.74c5efe7@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90cba68c8b50..2df8aed6a8f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -801,7 +801,7 @@ void latency_fsnotify(struct trace_array *tr); #else -static void latency_fsnotify(struct trace_array *tr) { } +static inline void latency_fsnotify(struct trace_array *tr) { } #endif -- cgit From 0567d6809182df53da03636fad36c507c5cf07a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:39:35 -0500 Subject: ftrace: Add modify_ftrace_direct() Add a new function modify_ftrace_direct() that will allow a user to update an existing direct caller to a new trampoline, without missing hits due to unregistering one and then adding another. 
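To make the intended flow concrete, a hedged sketch of the calling sequence; my_func(), my_tramp1() and my_tramp2() are placeholders, and register_ftrace_direct()/unregister_ftrace_direct() are the pre-existing direct-call API that this file already exports:

#include <linux/ftrace.h>

/* Placeholders for a traced function and two trampolines (normally asm). */
extern void my_func(void);
extern void my_tramp1(void);
extern void my_tramp2(void);

static int switch_trampoline(void)
{
	unsigned long ip = (unsigned long)my_func;
	int ret;

	ret = register_ftrace_direct(ip, (unsigned long)my_tramp1);
	if (ret)
		return ret;

	/* retarget callers of my_func() without a window where hits are missed */
	ret = modify_ftrace_direct(ip, (unsigned long)my_tramp1,
				   (unsigned long)my_tramp2);
	if (ret)
		unregister_ftrace_direct(ip, (unsigned long)my_tramp1);

	return ret;
}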
Link: https://lore.kernel.org/r/20191109022907.6zzo6orhxpt5n2sv@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 82ef8d60a42b..834f3556ea1e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5160,6 +5160,84 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static struct ftrace_ops stub_ops = { + .func = ftrace_stub, +}; + +/** + * modify_ftrace_direct - Modify an existing direct call to call something else + * @ip: The instruction pointer to modify + * @old_addr: The address that the current @ip calls directly + * @new_addr: The address that the @ip should call + * + * This modifies a ftrace direct caller at an instruction pointer without + * having to disable it first. The direct call will switch over to the + * @new_addr without missing anything. + * + * Returns: zero on success. Non zero on error, which includes: + * -ENODEV : the @ip given has no direct caller attached + * -EINVAL : the @old_addr does not match the current direct caller + */ +int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) + goto out_unlock; + + ip = rec->ip; + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + ret = -EINVAL; + if (entry->direct != old_addr) + goto out_unlock; + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_unlock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_unlock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + ret = 0; + + out_unlock: + mutex_unlock(&direct_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit From e9838bd51169af87ae248336d4c3fc59184a0e46 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Nov 2019 18:12:01 +0100 Subject: irq_work: Fix IRQ_WORK_BUSY bit clearing While attempting to clear the busy bit at the end of a work execution, atomic_cmpxchg() expects the value of the flags with the pending bit cleared as the old value. However by mistake the value of the flags is passed without clearing the pending bit first. As a result, clearing the busy bit fails and irq_work_sync() may stall: watchdog: BUG: soft lockup - CPU#0 stuck for 22s! 
[blktrace:4948] CPU: 0 PID: 4948 Comm: blktrace Not tainted 5.4.0-rc7-00003-gfeb4a51323bab #1 RIP: 0010:irq_work_sync+0x4/0x10 Call Trace: relay_close_buf+0x19/0x50 relay_close+0x64/0x100 blk_trace_free+0x1f/0x50 __blk_trace_remove+0x1e/0x30 blk_trace_ioctl+0x11b/0x140 blkdev_ioctl+0x6c1/0xa40 block_ioctl+0x39/0x40 do_vfs_ioctl+0xa5/0x700 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1d0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 So clear the appropriate bit before passing the old flags to cmpxchg(). Fixes: feb4a51323ba ("irq_work: Slightly simplify IRQ_WORK_PENDING clearing") Reported-by: kernel test robot Reported-by: Leonard Crestez Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Leonard Crestez Link: https://lkml.kernel.org/r/20191113171201.14032-1-frederic@kernel.org --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 49c53f80a13a..828cc30774bc 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -158,6 +158,7 @@ static void irq_work_run_list(struct llist_head *list) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ + flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } -- cgit From 20a15ee040f23bd553d4e6bbb1f8724ccd282abc Mon Sep 17 00:00:00 2001 From: luanshi Date: Wed, 13 Nov 2019 22:41:33 +0800 Subject: genirq: Fix function documentation of __irq_alloc_descs() The function got renamed at some point, but the kernel-doc was not updated. Signed-off-by: Liguang Zhang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1573656093-8643-1-git-send-email-zhangliguang@linux.alibaba.com --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9be995fc3c5a..5b8fdd659e54 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -750,7 +750,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) EXPORT_SYMBOL_GPL(irq_free_descs); /** - * irq_alloc_descs - allocate and initialize a range of irq descriptors + * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. -- cgit From 6e1ff0773f49c7d38e8b4a9df598def6afb9f415 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 14 Nov 2019 21:10:52 +0000 Subject: sched/uclamp: Fix incorrect condition uclamp_update_active() should perform the update when p->uclamp[clamp_id].active is true. But when the logic was inverted in [1], the if condition wasn't inverted correctly too. 
[1] https://lore.kernel.org/lkml/20190902073836.GO2369@hirez.programming.kicks-ass.net/ Reported-by: Suren Baghdasaryan Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: babbe170e053 ("sched/uclamp: Update CPU's refcount on TG's clamp changes") Link: https://lkml.kernel.org/r/20191114211052.15116-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33cd25051f3a..44123b4d14e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1065,7 +1065,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (!p->uclamp[clamp_id].active) { + if (p->uclamp[clamp_id].active) { uclamp_rq_dec_id(rq, p, clamp_id); uclamp_rq_inc_id(rq, p, clamp_id); } -- cgit From 3ca270fc9edb258d5bfa271bcf851614e9e6e7d4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:38 +0800 Subject: perf/core: Provide a kernel-internal interface to recalibrate event period Currently, perf_event_period() is used by user tools via ioctl. Based on naming convention, exporting perf_event_period() for kernel users (such as KVM) who may recalibrate the event period for their assigned counter according to their requirements. The perf_event_period() is an external accessor, just like the perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). Suggested-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/events/core.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..e1b83d2731da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5106,16 +5106,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value) return event->pmu->check_period(event, value); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) +static int _perf_event_period(struct perf_event *event, u64 value) { - u64 value; - if (!is_sampling_event(event)) return -EINVAL; - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - if (!value) return -EINVAL; @@ -5133,6 +5128,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) return 0; } +int perf_event_period(struct perf_event *event, u64 value) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_period(event, value); + perf_event_ctx_unlock(event, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(perf_event_period); + static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) @@ -5176,8 +5184,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); + { + u64 value; + + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) + return -EFAULT; + return _perf_event_period(event, value); + } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); -- cgit From 
52ba4b0b99770e892f43da1238f437155acb8b58 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:39 +0800 Subject: perf/core: Provide a kernel-internal interface to pause perf_event Exporting perf_event_pause() as an external accessor for kernel users (such as KVM) who may do both disable perf_event and read count with just one time to hold perf_event_ctx_lock. Also the value could be reset optionally. Suggested-by: Peter Zijlstra Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/events/core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e1b83d2731da..fc9f5ebf4849 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5029,6 +5029,24 @@ static void _perf_event_reset(struct perf_event *event) perf_event_update_userpage(event); } +/* Assume it's not an event with inherit set. */ +u64 perf_event_pause(struct perf_event *event, bool reset) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(event->attr.inherit); + _perf_event_disable(event); + count = local64_read(&event->count); + if (reset) + local64_set(&event->count, 0); + perf_event_ctx_unlock(event, ctx); + + return count; +} +EXPORT_SYMBOL_GPL(perf_event_pause); + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block -- cgit From 3ca47e958a64b1116a2c35e65dcf467fc53d52de Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Apr 2019 17:43:50 +0200 Subject: y2038: remove CONFIG_64BIT_TIME The CONFIG_64BIT_TIME option is defined on all architectures, and can be removed for simplicity now. Signed-off-by: Arnd Bergmann --- kernel/time/hrtimer.c | 2 +- kernel/time/time.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..9e20873148c6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1940,7 +1940,7 @@ out: return ret; } -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..96b8c02657ed 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -267,7 +267,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ @@ -881,7 +881,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit From 2a785996cc5e2fc1d1d29d196f530905f68d2dc2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Nov 2019 11:10:01 +0100 Subject: y2038: uapi: change __kernel_time_t to __kernel_old_time_t This is mainly a patch for clarification, and to let us remove the time_t definition from the kernel to prevent new users from creeping in that might not be y2038-safe. 
All remaining uses of 'time_t' or '__kernel_time_t' are part of the user API that cannot be changed by that either have a replacement or that do not suffer from the y2038 overflow. Acked-by: Deepa Dinamani Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 96b8c02657ed..833abae3364f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) +SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { - time_t i = (time_t)ktime_get_real_seconds(); + __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) * architectures that need it). */ -SYSCALL_DEFINE1(stime, time_t __user *, tptr) +SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; -- cgit From bdd565f817a74b9e30edec108f7cb1dbc762b8a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:46:48 +0200 Subject: y2038: rusage: use __kernel_old_timeval There are two 'struct timeval' fields in 'struct rusage'. Unfortunately the definition of timeval is now ambiguous when used in user space with a libc that has a 64-bit time_t, and this also changes the 'rusage' definition in user space in a way that is incompatible with the system call interface. While there is no good solution to avoid all ambiguity here, change the definition in the kernel headers to be compatible with the kernel ABI, using __kernel_old_timeval as an unambiguous base type. In previous discussions, there was also a plan to add a replacement for rusage based on 64-bit timestamps and nanosecond resolution, i.e. 'struct __kernel_timespec'. I have patches for that as well, if anyone thinks we should do that. Reviewed-by: Cyrill Gorcunov Signed-off-by: Arnd Bergmann --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a611d1d58c7d..d3aef31e24dc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1763,8 +1763,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); out: - r->ru_utime = ns_to_timeval(utime); - r->ru_stime = ns_to_timeval(stime); + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); -- cgit From 75d319c06e6a76f67549c0ae1007dc3167804f4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:56:17 +0200 Subject: y2038: syscalls: change remaining timeval to __kernel_old_timeval All of the remaining syscalls that pass a timeval (gettimeofday, utime, futimesat) can trivially be changed to pass a __kernel_old_timeval instead, which has a compatible layout, but avoids ambiguity with the timeval type in user space. Acked-by: Christian Brauner Acked-by: Rafael J. 
Wysocki Signed-off-by: Arnd Bergmann --- kernel/power/power.h | 2 +- kernel/time/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 44bee462ff57..7cdc64dc2373 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,7 +179,7 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -struct timeval; +struct __kernel_old_timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); diff --git a/kernel/time/time.c b/kernel/time/time.c index 833abae3364f..a0e7b9909f2d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME32 */ #endif -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { -- cgit From 5e0fb1b57bea8d11fe77da2bc80f4c9a67e28318 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2018 20:04:11 +0200 Subject: y2038: time: avoid timespec usage in settimeofday() The compat_get_timeval() and timeval_valid() interfaces are deprecated and getting removed along with the definition of struct timeval itself. Change the two implementations of the settimeofday() system call to open-code these helpers and completely avoid references to timeval. The timeval_valid() call is not needed any more here, only a check to avoid overflowing tv_nsec during the multiplication, as there is another range check in do_sys_settimeofday64(). Tested-by: syzbot+dccce9b26ba09ca49966@syzkaller.appspotmail.com Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index a0e7b9909f2d..58e312e7380f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return 0; } -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) @@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (compat_get_timeval(&user_tv, tv)) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) -- cgit From c1745f84be2657f5702388133551b759b9237f59 Mon Sep 17 00:00:00 2001 
From: Arnd Bergmann Date: Fri, 25 Oct 2019 10:46:22 +0200 Subject: y2038: itimer: compat handling to itimer.c The structure is only used in one place, moving it there simplifies the interface and helps with later changes to this code. Rename it to match the other time32 structures in the process. Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- kernel/compat.c | 24 ------------------------ kernel/time/itimer.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a2bc1d6ceb57..95005f849c68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -90,30 +90,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) -{ - struct compat_itimerval v32; - - if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) - return -EFAULT; - o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; - o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; - return 0; -} - -int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) -{ - struct compat_itimerval v32; - - v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; - v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; - return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; -} - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 77f1e5635cc1..c52ebb40b60b 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -112,19 +112,34 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) } #ifdef CONFIG_COMPAT +struct old_itimerval32 { + struct old_timeval32 it_interval; + struct old_timeval32 it_value; +}; + +static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +{ + struct old_itimerval32 v32; + + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; +} + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) + struct old_itimerval32 __user *, it) { struct itimerval kit; int error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) + if (!error && put_old_itimerval32(it, &kit)) error = -EFAULT; return error; } #endif - /* * The timer is automagically restarted, when interval != 0 */ @@ -310,15 +325,28 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, } #ifdef CONFIG_COMPAT +static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +{ + struct old_itimerval32 v32; + + if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) + return -EFAULT; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; +} + COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) + struct old_itimerval32 __user *, in, + struct old_itimerval32 __user *, out) { struct itimerval kin, kout; int error; if (in) { - if (get_compat_itimerval(&kin, in)) + if (get_old_itimerval32(&kin, in)) return -EFAULT; } else { memset(&kin, 0, sizeof(kin)); @@ -327,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, error = do_setitimer(which, &kin, out ? &kout : NULL); if (error || !out) return error; - if (put_compat_itimerval(out, &kout)) + if (put_old_itimerval32(out, &kout)) return -EFAULT; return 0; } -- cgit From 4c22ea2b91203564fdf392b3d3cae249b652a8ae Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 16:59:39 +0200 Subject: y2038: use compat_{get,set}_itimer on alpha The itimer handling for the old alpha osf_setitimer/osf_getitimer system calls is identical to the compat version of getitimer/setitimer, so just use those directly. Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index c52ebb40b60b..4664c6addf69 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -111,7 +111,7 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) struct old_itimerval32 { struct old_timeval32 it_interval; struct old_timeval32 it_value; @@ -324,7 +324,7 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return 0; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; -- cgit From ddbc7d0657e9fd38b69f16bd0310703367b52d29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 21:37:43 +0200 Subject: y2038: move itimer reset into itimer.c Preparing for a change to the itimer internals, stop using the do_setitimer() symbol and instead use a new higher-level interface. The do_getitimer()/do_setitimer functions can now be made static, allowing the compiler to potentially produce better object code. 
Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 4664c6addf69..ce9cd19ce72e 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -73,7 +73,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, value->it_interval = ns_to_timeval(interval); } -int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; @@ -197,7 +197,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; @@ -249,6 +249,17 @@ again: return 0; } +#ifdef CONFIG_SECURITY_SELINUX +void clear_itimer(void) +{ + struct itimerval v = {}; + int i; + + for (i = 0; i < 3; i++) + do_setitimer(i, &v, NULL); +} +#endif + #ifdef __ARCH_WANT_SYS_ALARM /** -- cgit From bd40a175769d411b2a37e1c087082ac7ee2c15bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Nov 2019 15:27:39 +0100 Subject: y2038: itimer: change implementation to timespec64 There is no 64-bit version of getitimer/setitimer since that is not actually needed. However, the implementation is built around the deprecated 'struct timeval' type. Change the code to use timespec64 internally to reduce the dependencies on timeval and associated helper functions. Minor adjustments in the code are needed to make the native and compat version work the same way, and to keep the range check working after the conversion. 
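The key unit change behind those adjustments: user-visible microseconds are now scaled to nanoseconds internally, so a value that passes the existing "tv_usec < USEC_PER_SEC" check still yields a normalized tv_nsec below NSEC_PER_SEC. A trivial sketch of that scaling (not the patch itself):

#include <linux/time64.h>	/* NSEC_PER_USEC */

static inline s64 itimer_usec_to_nsec(s64 usec)
{
	/* e.g. 500000 us -> 500000000 ns, still below NSEC_PER_SEC */
	return usec * NSEC_PER_USEC;
}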
Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 158 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ce9cd19ce72e..5872db9bd5f7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ * Returns the delta between the expiry time and now, which can be * less than zero or 1usec for an pending expired timer */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) +static struct timespec64 itimer_get_remtime(struct hrtimer *timer) { ktime_t rem = __hrtimer_get_remaining(timer, true); @@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } else rem = 0; - return ktime_to_timeval(rem); + return ktime_to_timespec64(rem); } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) + struct itimerspec64 *const value) { u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -69,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); - value->it_value = ns_to_timeval(val); - value->it_interval = ns_to_timeval(interval); + value->it_value = ns_to_timespec64(val); + value->it_interval = ns_to_timespec64(interval); } -static int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerspec64 *value) { struct task_struct *tsk = current; @@ -82,7 +82,7 @@ static int do_getitimer(int which, struct itimerval *value) spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); + ktime_to_timespec64(tsk->signal->it_real_incr); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: @@ -97,17 +97,26 @@ static int do_getitimer(int which, struct itimerval *value) return 0; } +static int put_itimerval(struct itimerval __user *o, + const struct itimerspec64 *i) +{ + struct itimerval v; + + v.it_interval.tv_sec = i->it_interval.tv_sec; + v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v.it_value.tv_sec = i->it_value.tv_sec; + v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v, sizeof(struct itimerval)) ? -EFAULT : 0; +} + + SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) { - int error = -EFAULT; - struct itimerval get_buffer; + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } + if (!error && put_itimerval(value, &get_buffer)) + error = -EFAULT; return error; } @@ -117,24 +126,25 @@ struct old_itimerval32 { struct old_timeval32 it_value; }; -static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +static int put_old_itimerval32(struct old_itimerval32 __user *o, + const struct itimerspec64 *i) { struct old_itimerval32 v32; v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; + v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct old_itimerval32 __user *, it) + struct old_itimerval32 __user *, value) { - struct itimerval kit; - int error = do_getitimer(which, &kit); + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (!error && put_old_itimerval32(it, &kit)) + if (!error && put_old_itimerval32(value, &get_buffer)) error = -EFAULT; return error; } @@ -156,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) + const struct itimerspec64 *const value, + struct itimerspec64 *const ovalue) { u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -166,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, * Use the to_ktime conversion because that clamps the maximum * value to KTIME_MAX and avoid multiplication overflows. */ - nval = ktime_to_ns(timeval_to_ktime(value->it_value)); - ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); + nval = timespec64_to_ns(&value->it_value); + ninterval = timespec64_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); @@ -186,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - ovalue->it_value = ns_to_timeval(oval); - ovalue->it_interval = ns_to_timeval(ointerval); + ovalue->it_value = ns_to_timespec64(oval); + ovalue->it_interval = ns_to_timespec64(ointerval); } } @@ -197,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerspec64 *value, + struct itimerspec64 *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - /* - * Validate the timevals in value. 
- */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - switch (which) { case ITIMER_REAL: again: @@ -218,7 +222,7 @@ again: if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); + = ktime_to_timespec64(tsk->signal->it_real_incr); } /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { @@ -226,10 +230,10 @@ again: hrtimer_cancel_wait_running(timer); goto again; } - expires = timeval_to_ktime(value->it_value); + expires = timespec64_to_ktime(value->it_value); if (expires != 0) { tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); + timespec64_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else tsk->signal->it_real_incr = 0; @@ -252,7 +256,7 @@ again: #ifdef CONFIG_SECURITY_SELINUX void clear_itimer(void) { - struct itimerval v = {}; + struct itimerspec64 v = {}; int i; for (i = 0; i < 3; i++) @@ -276,15 +280,15 @@ void clear_itimer(void) */ static unsigned int alarm_setitimer(unsigned int seconds) { - struct itimerval it_new, it_old; + struct itimerspec64 it_new, it_old; #if BITS_PER_LONG < 64 if (seconds > INT_MAX) seconds = INT_MAX; #endif it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_nsec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); @@ -292,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds) * We can't return 0 if we have an alarm pending ... And we'd * better return too much than too little anyway */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || + it_old.it_value.tv_nsec >= 500000) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; @@ -310,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif +static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i) +{ + struct itimerval v; + + if (copy_from_user(&v, i, sizeof(struct itimerval))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v.it_value) || + !timeval_valid(&v.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v.it_interval.tv_sec; + o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v.it_value.tv_sec; + o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { - struct itimerval set_buffer, get_buffer; + struct itimerspec64 set_buffer, get_buffer; int error; if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; + error = get_itimerval(&set_buffer, value); + if (error) + return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." 
@@ -330,43 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (error || !ovalue) return error; - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + if (put_itimerval(ovalue, &get_buffer)) return -EFAULT; return 0; } #if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v32.it_value) || + !timeval_valid(&v32.it_interval)) + return -EINVAL; + o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; + o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; return 0; } COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct old_itimerval32 __user *, in, - struct old_itimerval32 __user *, out) + struct old_itimerval32 __user *, value, + struct old_itimerval32 __user *, ovalue) { - struct itimerval kin, kout; + struct itimerspec64 set_buffer, get_buffer; int error; - if (in) { - if (get_old_itimerval32(&kin, in)) - return -EFAULT; + if (value) { + error = get_old_itimerval32(&set_buffer, value); + if (error) + return error; } else { - memset(&kin, 0, sizeof(kin)); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) return error; - if (put_old_itimerval32(out, &kout)) + if (put_old_itimerval32(ovalue, &get_buffer)) return -EFAULT; return 0; } -- cgit From 942437c97fd9ff23a17c13118f50bd0490f6868c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Jul 2019 11:46:10 +0200 Subject: y2038: allow disabling time32 system calls At the moment, the compilation of the old time32 system calls depends purely on the architecture. As systems with new libc based on 64-bit time_t are getting deployed, even architectures that previously supported these (notably x86-32 and arm32 but also many others) no longer depend on them, and removing them from a kernel image results in a smaller kernel binary, the same way we can leave out many other optional system calls. More importantly, on an embedded system that needs to keep working beyond year 2038, any user space program calling these system calls is likely a bug, so removing them from the kernel image does provide an extra debugging help for finding broken applications. I've gone back and forth on hiding this option unless CONFIG_EXPERT is set. This version leaves it visible based on the logic that eventually it will be turned off indefinitely. 
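For reference, each COND_SYSCALL() entry below relies on the usual weak-stub pattern: when the real implementation is configured out, a weak stub returning -ENOSYS is linked in instead, so user space can detect the absence and fall back to the 64-bit time variants. A simplified illustration of the idea, not the exact kernel macro (argument lists are omitted; the real stubs match the syscall wrapper signatures):

#include <errno.h>

/* generic "not implemented" return, as the kernel's sys_ni_syscall() does */
long sys_ni_syscall(void)
{
	return -ENOSYS;
}

/* weak default for one optional entry; a configured-in nanosleep_time32
 * provides the strong definition and overrides this stub at link time
 */
__attribute__((weak)) long sys_nanosleep_time32(void)
{
	return sys_ni_syscall();
}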
Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- kernel/sys_ni.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 34b76895b81e..3b69a560a7ac 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -410,6 +410,29 @@ COND_SYSCALL(send); COND_SYSCALL(bdflush); COND_SYSCALL(uselib); +/* optional: time32 */ +COND_SYSCALL(time32); +COND_SYSCALL(stime32); +COND_SYSCALL(utime32); +COND_SYSCALL(adjtimex_time32); +COND_SYSCALL(sched_rr_get_interval_time32); +COND_SYSCALL(nanosleep_time32); +COND_SYSCALL(rt_sigtimedwait_time32); +COND_SYSCALL_COMPAT(rt_sigtimedwait_time32); +COND_SYSCALL(timer_settime32); +COND_SYSCALL(timer_gettime32); +COND_SYSCALL(clock_settime32); +COND_SYSCALL(clock_gettime32); +COND_SYSCALL(clock_getres_time32); +COND_SYSCALL(clock_nanosleep_time32); +COND_SYSCALL(utimes_time32); +COND_SYSCALL(futimesat_time32); +COND_SYSCALL(pselect6_time32); +COND_SYSCALL_COMPAT(pselect6_time32); +COND_SYSCALL(ppoll_time32); +COND_SYSCALL_COMPAT(ppoll_time32); +COND_SYSCALL(utimensat_time32); +COND_SYSCALL(clock_adjtime32); /* * The syscalls below are not found in include/uapi/asm-generic/unistd.h -- cgit From 58a74a2925a5b4125dd4f4e728490b9642534c81 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 15 Nov 2019 11:17:30 +0200 Subject: tracing: Increase SYNTH_FIELDS_MAX for synthetic_events Increase the maximum allowed count of synthetic event fields from 16 to 32 in order to allow for larger-than-usual events. Link: http://lkml.kernel.org/r/20191115091730.9192-1-dedekind1@gmail.com Reviewed-by: Tom Zanussi Signed-off-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..f49d1a36d3ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -23,7 +23,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 16 +#define SYNTH_FIELDS_MAX 32 #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ -- cgit From ca16d5bee59807bf04deaab0a8eccecd5061528c Mon Sep 17 00:00:00 2001 From: Yang Tao Date: Wed, 6 Nov 2019 22:55:35 +0100 Subject: futex: Prevent robust futex exit race Robust futexes utilize the robust_list mechanism to allow the kernel to release futexes which are held when a task exits. The exit can be voluntary or caused by a signal or fault. This prevents that waiters block forever. The futex operations in user space store a pointer to the futex they are either locking or unlocking in the op_pending member of the per task robust list. After a lock operation has succeeded the futex is queued in the robust list linked list and the op_pending pointer is cleared. After an unlock operation has succeeded the futex is removed from the robust list linked list and the op_pending pointer is cleared. The robust list exit code checks for the pending operation and any futex which is queued in the linked list. It carefully checks whether the futex value is the TID of the exiting task. If so, it sets the OWNER_DIED bit and tries to wake up a potential waiter. This is race free for the lock operation but unlock has two race scenarios where waiters might not be woken up. These issues can be observed with regular robust pthread mutexes. PI aware pthread mutexes are not affected. 
(1) Unlocking task is killed after unlocking the futex value in user space before being able to wake a waiter. pthread_mutex_unlock() | V atomic_exchange_rel (&mutex->__data.__lock, 0) <------------------------killed lll_futex_wake () | | |(__lock = 0) |(enter kernel) | V do_exit() exit_mm() mm_release() exit_robust_list() handle_futex_death() | |(__lock = 0) |(uval = 0) | V if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) return 0; The sanity check which ensures that the user space futex is owned by the exiting task prevents the wakeup of waiters which in consequence block infinitely. (2) Waiting task is killed after a wakeup and before it can acquire the futex in user space. OWNER WAITER futex_wait() pthread_mutex_unlock() | | | |(__lock = 0) | | | V | futex_wake() ------------> wakeup() | |(return to userspace) |(__lock = 0) | V oldval = mutex->__data.__lock <-----------------killed atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, | id | assume_other_futex_waiters, 0) | | | (enter kernel)| | V do_exit() | | V handle_futex_death() | |(__lock = 0) |(uval = 0) | V if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) return 0; The sanity check which ensures that the user space futex is owned by the exiting task prevents the wakeup of waiters, which seems to be correct as the exiting task does not own the futex value, but the consequence is that other waiters wont be woken up and block infinitely. In both scenarios the following conditions are true: - task->robust_list->list_op_pending != NULL - user space futex value == 0 - Regular futex (not PI) If these conditions are met then it is reasonably safe to wake up a potential waiter in order to prevent the above problems. As this might be a false positive it can cause spurious wakeups, but the waiter side has to handle other types of unrelated wakeups, e.g. signals gracefully anyway. So such a spurious wakeup will not affect the correctness of these operations. This workaround must not touch the user space futex value and cannot set the OWNER_DIED bit because the lock value is 0, i.e. uncontended. Setting OWNER_DIED in this case would result in inconsistent state and subsequently in malfunction of the owner died handling in user space. The rest of the user space state is still consistent as no other task can observe the list_op_pending entry in the exiting tasks robust list. The eventually woken up waiter will observe the uncontended lock value and take it over. [ tglx: Massaged changelog and comment. Made the return explicit and not depend on the subsequent check and added constants to hand into handle_futex_death() instead of plain numbers. Fixed a few coding style issues. 
] Fixes: 0771dfefc9e5 ("[PATCH] lightweight robust futexes: core") Signed-off-by: Yang Tao Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1573010582-35297-1-git-send-email-wang.yi59@zte.com.cn Link: https://lkml.kernel.org/r/20191106224555.943191378@linutronix.de --- kernel/futex.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 43229f8999fc..49eaf5be851a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3452,11 +3452,16 @@ err_unlock: return ret; } +/* Constants for the pending_op argument of handle_futex_death */ +#define HANDLE_DEATH_PENDING true +#define HANDLE_DEATH_LIST false + /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) +static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, + bool pi, bool pending_op) { u32 uval, uninitialized_var(nval), mval; int err; @@ -3469,6 +3474,42 @@ retry: if (get_user(uval, uaddr)) return -1; + /* + * Special case for regular (non PI) futexes. The unlock path in + * user space has two race scenarios: + * + * 1. The unlock path releases the user space futex value and + * before it can execute the futex() syscall to wake up + * waiters it is killed. + * + * 2. A woken up waiter is killed before it can acquire the + * futex in user space. + * + * In both cases the TID validation below prevents a wakeup of + * potential waiters which can cause these waiters to block + * forever. + * + * In both cases the following conditions are met: + * + * 1) task->robust_list->list_op_pending != NULL + * @pending_op == true + * 2) User space futex value == 0 + * 3) Regular futex: @pi == false + * + * If these conditions are met, it is safe to attempt waking up a + * potential waiter without touching the user space futex value and + * trying to set the OWNER_DIED bit. The user space futex value is + * uncontended and the rest of the user space mutex state is + * consistent, so a woken waiter will just take over the + * uncontended futex. Setting the OWNER_DIED bit would create + * inconsistent state and malfunction of the user space owner died + * handling. 
+ */ + if (pending_op && !pi && !uval) { + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; + } + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) return 0; @@ -3588,10 +3629,11 @@ void exit_robust_list(struct task_struct *curr) * A pending lock might already be on the list, so * don't process it twice: */ - if (entry != pending) + if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi)) + curr, pi, HANDLE_DEATH_LIST)) return; + } if (rc) return; entry = next_entry; @@ -3605,9 +3647,10 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } - if (pending) + if (pending) { handle_futex_death((void __user *)pending + futex_offset, - curr, pip); + curr, pip, HANDLE_DEATH_PENDING); + } } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -3784,7 +3827,8 @@ void compat_exit_robust_list(struct task_struct *curr) if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); - if (handle_futex_death(uaddr, curr, pi)) + if (handle_futex_death(uaddr, curr, pi, + HANDLE_DEATH_LIST)) return; } if (rc) @@ -3803,7 +3847,7 @@ void compat_exit_robust_list(struct task_struct *curr) if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); - handle_futex_death(uaddr, curr, pip); + handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } -- cgit From 1c7f9b673dc0a15753274c4e7f5ebfd4468fc69f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:13:20 -0500 Subject: ftrace: Fix accounting bug with direct->count in register_ftrace_direct() The direct->count wasn't being updated properly, where it only was updated when the first entry was added, but should be updated every time. Fixes: 013bf0da04748 ("ftrace: Add ftrace_find_direct_func()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 834f3556ea1e..32e4e5ffdd97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,8 +5093,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_direct_func_count--; } } else { - if (!direct->count) - direct->count++; + direct->count++; } out_unlock: mutex_unlock(&direct_mutex); -- cgit From 406acdd32d3e7d5a6dcb7f67798e89068fbe0d77 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:19:04 -0500 Subject: ftrace: Add another check for match in register_ftrace_direct() As an instruction pointer passed into register_ftrace_direct() may just exist on the ftrace call site, but may not be the start of the call site itself, register_ftrace_direct() still needs to update test if a direct call exists on the normalized site, as only one direct call is allowed at any one time. Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32e4e5ffdd97..9fe33ebaf914 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5030,7 +5030,12 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; /* Make sure the ip points to the exact record */ - ip = rec->ip; + if (ip != rec->ip) { + ip = rec->ip; + /* Need to check this ip for a direct. 
*/ + if (find_rec_direct(ip)) + goto out_unlock; + } ret = -ENOMEM; if (ftrace_hash_empty(direct_functions) || -- cgit From 128161f47bc3797b0d068da13e311770685d6e4f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:14:45 -0500 Subject: ftrace: Add helper find_direct_entry() to consolidate code Both unregister_ftrace_direct() and modify_ftrace_direct() needs to normalize the ip passed in to match the rec->ip, as it is acceptable to have the ip on the ftrace call site but not the start. There are also common validity checks with the record found by the ip, these should be done for both unregister_ftrace_direct() and modify_ftrace_direct(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 59 +++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9fe33ebaf914..ef79c8393f53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5112,30 +5112,40 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) { struct ftrace_func_entry *entry; - struct ftrace_direct_func *direct; struct dyn_ftrace *rec; - int ret = -ENODEV; - mutex_lock(&direct_mutex); + rec = lookup_rec(*ip, *ip); + if (!rec) + return NULL; - entry = __ftrace_lookup_ip(direct_functions, ip); + entry = __ftrace_lookup_ip(direct_functions, rec->ip); if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + return NULL; + } - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - goto out_unlock; - } + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + /* Passed in ip just needs to be on the call site */ + *ip = rec->ip; + + return entry; +} + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_direct_func *direct; + struct ftrace_func_entry *entry; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; if (direct_functions->count == 1) unregister_ftrace_function(&direct_ops); @@ -5187,24 +5197,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) - goto out_unlock; - ip = rec->ip; - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; ret = -EINVAL; if (entry->direct != old_addr) -- cgit From 49e9d1a9faf2f71fdfd80a30697ee9a15070626d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Nov 2019 19:01:25 +0100 Subject: workqueue: Add RCU annotation for pwq list walk An additional check has been recently added to ensure that a RCU related lock is held while the RCU list is iterated. 
The `pwqs' are sometimes iterated without a RCU lock but with the &wq->mutex acquired leading to a warning. Teach list_for_each_entry_rcu() that the RCU usage is okay if &wq->mutex is acquired during the list traversal. Fixes: 28875945ba98d ("rcu: Add support for consolidated-RCU reader checking") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4dc8270326d7..914b845ad4ff 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -426,7 +426,8 @@ static void show_pwq(struct pool_workqueue *pwq); * ignored. */ #define for_each_pwq(pwq, wq) \ - list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ + lockdep_is_held(&wq->mutex)) \ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else -- cgit From b7b3fc8dd95bc02bd30680da258e09dda55270db Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 15 Nov 2019 13:37:22 +0100 Subject: bpf: Support doubleword alignment in bpf_jit_binary_alloc Currently passing alignment greater than 4 to bpf_jit_binary_alloc does not work: in such cases it silently aligns only to 4 bytes. On s390, in order to load a constant from memory in a large (>512k) BPF program, one must use lgrl instruction, whose memory operand must be aligned on an 8-byte boundary. This patch makes it possible to request 8-byte alignment from bpf_jit_binary_alloc, and also makes it issue a warning when an unsupported alignment is requested. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com --- kernel/bpf/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1fde0303280..99693f3c4e99 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include /* Registers */ @@ -815,6 +816,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, struct bpf_binary_header *hdr; u32 size, hole, start, pages; + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. -- cgit From 5964b2000f283ff5df366f718e0f083ebbaae977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:03 -0800 Subject: bpf: Add bpf_arch_text_poke() helper Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch nops/calls in kernel text into calls into BPF trampoline and to patch calls/nops inside BPF programs too. 
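The expected calling pattern, as the trampoline code later in this series uses it, passes the patch site together with the old and new targets; architectures without support keep the weak default and get -ENOTSUPP. A hedged sketch of that pattern (the wrapper is hypothetical, and the poke-type names plus the declaration in include/linux/bpf.h are assumed to match this series):

#include <linux/bpf.h>	/* bpf_arch_text_poke(), enum bpf_text_poke_type */

/* illustrative helper: retarget one call site at 'ip' */
static int redirect_site(void *ip, void *old_image, void *new_image)
{
	if (!old_image)
		/* first attach: turn the nop at the site into a call */
		return bpf_arch_text_poke(ip, BPF_MOD_NOP_TO_CALL,
					  NULL, new_image);
	if (!new_image)
		/* last detach: turn the call back into a nop */
		return bpf_arch_text_poke(ip, BPF_MOD_CALL_TO_NOP,
					  old_image, NULL);
	/* otherwise point the existing call at the new image */
	return bpf_arch_text_poke(ip, BPF_MOD_CALL_TO_CALL,
				  old_image, new_image);
}

bpf_trampoline_update() in the trampoline patch below follows exactly this nop-to-call, call-to-call, call-to-nop progression.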
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-4-ast@kernel.org --- kernel/bpf/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 99693f3c4e99..434a0d920153 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2144,6 +2144,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *addr1, void *addr2) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On 32-bit architecture they're meaningful. The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be casted to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of the pointer to a different type within BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions have (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both kprobe and kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. 
Hence generated BPF trampoline is re-generated every time new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside generated assembly code. Introduction of preemptible and sleepable BPF programs will completely remove the need to call to __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in detach path the half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to raw kernel function arguments while BPF_TRACE_FEXIT programs have access to kernel return value as well. Often kprobe BPF program remembers function arguments in a map while kretprobe fetches arguments from a map and analyzes them together with return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence BPF trampoline doesn't provide builtin recursion prevention. It's a job of BPF program itself and will be addressed in the follow up patches. BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future. For example to remove retpoline cost from XDP programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- kernel/bpf/Makefile | 1 + kernel/bpf/btf.c | 77 ++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/syscall.c | 53 +++++++++- kernel/bpf/trampoline.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 42 ++++++++ 6 files changed, 419 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/trampoline.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1d9adb212f9..3f671bf617e8 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4639c4ba9a9b..9e1164e5b429 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3517,13 +3517,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, args++; nr_args--; } - if (arg >= nr_args) { + + if (prog->expected_attach_type == BPF_TRACE_FEXIT && + arg == nr_args) { + /* function return type */ + t = btf_type_by_id(btf_vmlinux, t->type); + } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", - tname, arg); + tname, arg + 1); return false; + } else { + t = btf_type_by_id(btf_vmlinux, args[arg].type); } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); /* skip 
modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf_vmlinux, t->type); @@ -3784,6 +3789,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) return btf_id; } +static int __get_type_size(struct btf *btf, u32 btf_id, + const struct btf_type **bad_type) +{ + const struct btf_type *t; + + if (!btf_id) + /* void */ + return 0; + t = btf_type_by_id(btf, btf_id); + while (t && btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!t) + return -EINVAL; + if (btf_type_is_ptr(t)) + /* kernel size of pointer. Not BPF's size of pointer*/ + return sizeof(void *); + if (btf_type_is_int(t) || btf_type_is_enum(t)) + return t->size; + *bad_type = t; + return -EINVAL; +} + +int btf_distill_func_proto(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *func, + const char *tname, + struct btf_func_model *m) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs; + int ret; + + args = (const struct btf_param *)(func + 1); + nargs = btf_type_vlen(func); + if (nargs >= MAX_BPF_FUNC_ARGS) { + bpf_log(log, + "The function %s has %d arguments. Too many.\n", + tname, nargs); + return -EINVAL; + } + ret = __get_type_size(btf, func->type, &t); + if (ret < 0) { + bpf_log(log, + "The function %s return type %s is unsupported.\n", + tname, btf_kind_str[BTF_INFO_KIND(t->info)]); + return -EINVAL; + } + m->ret_size = ret; + + for (i = 0; i < nargs; i++) { + ret = __get_type_size(btf, args[i].type, &t); + if (ret < 0) { + bpf_log(log, + "The function %s arg%d type %s is unsupported.\n", + tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); + return -EINVAL; + } + m->arg_size[i] = ret; + } + m->nr_args = nargs; + return 0; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 434a0d920153..da5a8b8e278f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2015,6 +2015,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif + bpf_trampoline_put(aux->trampoline); for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d9ce95e5a8d..e2e37bea86bc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_tracing_prog_fops = { + .release = bpf_tracing_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +static int bpf_tracing_prog_attach(struct bpf_prog *prog) +{ + int tr_fd, err; + + if (prog->expected_attach_type != BPF_TRACE_FENTRY && + prog->expected_attach_type != BPF_TRACE_FEXIT) { + err = -EINVAL; + goto out_put_prog; + } + + err = bpf_trampoline_link_prog(prog); + if (err) + goto out_put_prog; + + tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, + prog, O_CLOEXEC); + if (tr_fd < 0) { + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + err = tr_fd; + goto out_put_prog; + } + return tr_fd; + +out_put_prog: + bpf_prog_put(prog); + return err; +} + struct bpf_raw_tracepoint { struct bpf_raw_event_map *btp; struct bpf_prog *prog; @@ -1850,14 +1893,16 @@ static int 
bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type == BPF_PROG_TYPE_TRACING) { if (attr->raw_tracepoint.name) { - /* raw_tp name should not be specified in raw_tp - * programs that were verified via in-kernel BTF info + /* The attach point for this category of programs + * should be specified via btf_id during program load. */ err = -EINVAL; goto out_put_prog; } - /* raw_tp name is taken from type name instead */ - tp_name = prog->aux->attach_func_name; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + tp_name = prog->aux->attach_func_name; + else + return bpf_tracing_prog_attach(prog); } else { if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c new file mode 100644 index 000000000000..10ae59d65f13 --- /dev/null +++ b/kernel/bpf/trampoline.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ +#include +#include +#include + +/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ +#define TRAMPOLINE_HASH_BITS 10 +#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) + +static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; + +/* serializes access to trampoline_table */ +static DEFINE_MUTEX(trampoline_mutex); + +struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +{ + struct bpf_trampoline *tr; + struct hlist_head *head; + void *image; + int i; + + mutex_lock(&trampoline_mutex); + head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist) { + if (tr->key == key) { + refcount_inc(&tr->refcnt); + goto out; + } + } + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out; + + /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) { + kfree(tr); + tr = NULL; + goto out; + } + + tr->key = key; + INIT_HLIST_NODE(&tr->hlist); + hlist_add_head(&tr->hlist, head); + refcount_set(&tr->refcnt, 1); + mutex_init(&tr->mutex); + for (i = 0; i < BPF_TRAMP_MAX; i++) + INIT_HLIST_HEAD(&tr->progs_hlist[i]); + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + tr->image = image; +out: + mutex_unlock(&trampoline_mutex); + return tr; +} + +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 + * bytes on x86. 
Pick a number to fit into PAGE_SIZE / 2 + */ +#define BPF_MAX_TRAMP_PROGS 40 + +static int bpf_trampoline_update(struct bpf_trampoline *tr) +{ + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; + int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; + int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; + struct bpf_prog **progs, **fentry, **fexit; + u32 flags = BPF_TRAMP_F_RESTORE_REGS; + struct bpf_prog_aux *aux; + int err; + + if (fentry_cnt + fexit_cnt == 0) { + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP, + old_image, NULL); + tr->selector = 0; + goto out; + } + + /* populate fentry progs */ + fentry = progs = progs_to_run; + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) + *progs++ = aux->prog; + + /* populate fexit progs */ + fexit = progs; + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) + *progs++ = aux->prog; + + if (fexit_cnt) + flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + + err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + fentry, fentry_cnt, + fexit, fexit_cnt, + tr->func.addr); + if (err) + goto out; + + if (tr->selector) + /* progs already running at this address */ + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL, + old_image, new_image); + else + /* first time registering */ + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL, + NULL, new_image); + if (err) + goto out; + tr->selector++; +out: + return err; +} + +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) +{ + switch (t) { + case BPF_TRACE_FENTRY: + return BPF_TRAMP_FENTRY; + default: + return BPF_TRAMP_FEXIT; + } +} + +int bpf_trampoline_link_prog(struct bpf_prog *prog) +{ + enum bpf_tramp_prog_type kind; + struct bpf_trampoline *tr; + int err = 0; + + tr = prog->aux->trampoline; + kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + mutex_lock(&tr->mutex); + if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT] + >= BPF_MAX_TRAMP_PROGS) { + err = -E2BIG; + goto out; + } + if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + /* prog already linked */ + err = -EBUSY; + goto out; + } + hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + tr->progs_cnt[kind]++; + err = bpf_trampoline_update(prog->aux->trampoline); + if (err) { + hlist_del(&prog->aux->tramp_hlist); + tr->progs_cnt[kind]--; + } +out: + mutex_unlock(&tr->mutex); + return err; +} + +/* bpf_trampoline_unlink_prog() should never fail. 
*/ +int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +{ + enum bpf_tramp_prog_type kind; + struct bpf_trampoline *tr; + int err; + + tr = prog->aux->trampoline; + kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + mutex_lock(&tr->mutex); + hlist_del(&prog->aux->tramp_hlist); + tr->progs_cnt[kind]--; + err = bpf_trampoline_update(prog->aux->trampoline); + mutex_unlock(&tr->mutex); + return err; +} + +void bpf_trampoline_put(struct bpf_trampoline *tr) +{ + if (!tr) + return; + mutex_lock(&trampoline_mutex); + if (!refcount_dec_and_test(&tr->refcnt)) + goto out; + WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) + goto out; + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) + goto out; + bpf_jit_free_exec(tr->image); + hlist_del(&tr->hlist); + kfree(tr); +out: + mutex_unlock(&trampoline_mutex); +} + +/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that + * are needed for trampoline. The macro is split into + * call _bpf_prog_enter + * call prog->bpf_func + * call __bpf_prog_exit + */ +u64 notrace __bpf_prog_enter(void) +{ + u64 start = 0; + + rcu_read_lock(); + preempt_disable(); + if (static_branch_unlikely(&bpf_stats_enabled_key)) + start = sched_clock(); + return start; +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +{ + struct bpf_prog_stats *stats; + + if (static_branch_unlikely(&bpf_stats_enabled_key) && + /* static_key could be enabled in __bpf_prog_enter + * and disabled in __bpf_prog_exit. + * And vice versa. + * Hence check that 'start' is not zero. + */ + start) { + stats = this_cpu_ptr(prog->aux->stats); + u64_stats_update_begin(&stats->syncp); + stats->cnt++; + stats->nsecs += sched_clock() - start; + u64_stats_update_end(&stats->syncp); + } + preempt_enable(); + rcu_read_unlock(); +} + +int __weak +arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + return -ENOTSUPP; +} + +static int __init init_trampolines(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_table[i]); + return 0; +} +late_initcall(init_trampolines); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2f2374967b36..8f89cfa93e88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct bpf_trampoline *tr; const struct btf_type *t; const char *tname; + int ret = 0; + long addr; if (prog->type != BPF_PROG_TYPE_TRACING) return 0; @@ -9432,6 +9435,45 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + tr = bpf_trampoline_lookup(btf_id); + if (!tr) + return -ENOMEM; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + mutex_lock(&tr->mutex); + if (tr->func.addr) { + prog->aux->trampoline = tr; + goto out; + } + ret = btf_distill_func_proto(&env->log, btf_vmlinux, t, + tname, &tr->func.model); + 
if (ret < 0) + goto out; + addr = kallsyms_lookup_name(tname); + if (!addr) { + verbose(env, + "The address of function %s cannot be found\n", + tname); + ret = -ENOENT; + goto out; + } + tr->func.addr = (void *)addr; + prog->aux->trampoline = tr; +out: + mutex_unlock(&tr->mutex); + if (ret) + bpf_trampoline_put(tr); + return ret; default: return -EINVAL; } -- cgit From 9cc31b3a092d9bf2a18f09ad77e727ddb42a5b1e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:14 -0800 Subject: bpf: Fix race in btf_resolve_helper_id() btf_resolve_helper_id() caching logic is a bit racy, since under root the verifier can verify several programs in parallel. Fix it with READ/WRITE_ONCE. Fix the type as well, since error is also recorded. Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-15-ast@kernel.org --- kernel/bpf/btf.c | 26 +++++++++++++++++++++++++- kernel/bpf/verifier.c | 8 +++----- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9e1164e5b429..033d071eb59c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3721,7 +3721,8 @@ again: return -EINVAL; } -u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, + int arg) { char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; const struct btf_param *args; @@ -3789,6 +3790,29 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) return btf_id; } +int btf_resolve_helper_id(struct bpf_verifier_log *log, + const struct bpf_func_proto *fn, int arg) +{ + int *btf_id = &fn->btf_id[arg]; + int ret; + + if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) + return -EINVAL; + + ret = READ_ONCE(*btf_id); + if (ret) + return ret; + /* ok to race the search. The result is the same */ + ret = __btf_resolve_helper_id(log, fn->func, arg); + if (!ret) { + /* Function argument cannot be type 'void' */ + bpf_log(log, "BTF resolution bug\n"); + return -EFAULT; + } + WRITE_ONCE(*btf_id, ret); + return ret; +} + static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **bad_type) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8f89cfa93e88..e78ec7990767 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4147,11 +4147,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { - if (!fn->btf_id[i]) - fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); - meta.btf_id = fn->btf_id[i]; - } + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); if (err) return err; -- cgit From 91cc1a99740e2ed1d903b5906afb470cc5a07379 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:15 -0800 Subject: bpf: Annotate context types Annotate BPF program context types with program-side type and kernel-side type. This type information is used by the verifier. btf_get_prog_ctx_type() is used in the later patches to verify that BTF type of ctx in BPF program matches to kernel expected ctx type. 
For example, the XDP program type is: BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff) That means that XDP program should be written as: int xdp_prog(struct xdp_md *ctx) { ... } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-16-ast@kernel.org --- kernel/bpf/btf.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 4 +- kernel/bpf/verifier.c | 2 +- 3 files changed, 114 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 033d071eb59c..4b7c8bd423d6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2,6 +2,8 @@ /* Copyright (c) 2018 Facebook */ #include +#include +#include #include #include #include @@ -16,6 +18,9 @@ #include #include #include +#include +#include +#include /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus @@ -3439,13 +3444,98 @@ errout: extern char __weak _binary__btf_vmlinux_bin_start[]; extern char __weak _binary__btf_vmlinux_bin_end[]; +extern struct btf *btf_vmlinux; + +#define BPF_MAP_TYPE(_id, _ops) +static union { + struct bpf_ctx_convert { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + prog_ctx_type _id##_prog; \ + kern_ctx_type _id##_kern; +#include +#undef BPF_PROG_TYPE + } *__t; + /* 't' is written once under lock. Read many times. */ + const struct btf_type *t; +} bpf_ctx_convert; +enum { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + __ctx_convert##_id, +#include +#undef BPF_PROG_TYPE +}; +static u8 bpf_ctx_convert_map[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + [_id] = __ctx_convert##_id, +#include +#undef BPF_PROG_TYPE +}; +#undef BPF_MAP_TYPE + +static const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_type *ctx_struct; + const struct btf_member *ctx_type; + const char *tname, *ctx_tname; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return NULL; + } + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_struct(t)) { + /* Only pointer to struct is supported for now. + * That means that BPF_PROG_TYPE_TRACEPOINT with BTF + * is not supported yet. + * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. + */ + bpf_log(log, "BPF program ctx type is not a struct\n"); + return NULL; + } + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) { + bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + return NULL; + } + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_struct is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); + if (!ctx_struct) + /* should not happen */ + return NULL; + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + if (!ctx_tname) { + /* should not happen */ + bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); + return NULL; + } + /* only compare that prog's ctx type name is the same as + * kernel expects. No need to compare field by field. 
+ * It's ok for bpf prog to do: + * struct __sk_buff {}; + * int socket_filter_bpf_prog(struct __sk_buff *skb) + * { // no fields of skb are ever used } + */ + if (strcmp(ctx_tname, tname)) + return NULL; + return ctx_type; +} struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err; + int err, i; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3479,6 +3569,26 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + /* find struct bpf_ctx_convert for type checking later */ + for (i = 1; i <= btf->nr_types; i++) { + const struct btf_type *t; + const char *tname; + + t = btf_type_by_id(btf, i); + if (!__btf_type_is_struct(t)) + continue; + tname = __btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, "bpf_ctx_convert")) { + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = t; + break; + } + } + if (i > btf->nr_types) { + err = -ENOENT; + goto errout; + } + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -3492,8 +3602,6 @@ errout: return ERR_PTR(err); } -extern struct btf *btf_vmlinux; - bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e2e37bea86bc..05a0ee75eca0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; static const struct bpf_map_ops * const bpf_map_types[] = { -#define BPF_PROG_TYPE(_id, _ops) +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, #include @@ -1189,7 +1189,7 @@ err_put: } static const struct bpf_prog_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e78ec7990767..7395d6bebefd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,7 +23,7 @@ #include "disasm.h" static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include -- cgit From 8c1b6e69dcc1e11bd24111e3734dd740aaf3fda1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:16 -0800 Subject: bpf: Compare BTF types of functions arguments with actual types Make the verifier check that BTF types of function arguments match actual types passed into top-level BPF program and into BPF-to-BPF calls. If types match such BPF programs and sub-programs will have full support of BPF trampoline. If types mismatch the trampoline has to be conservative. It has to save/restore five program arguments and assume 64-bit scalars. 
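Concretely, the comparison applies to BPF-to-BPF calls such as the sketch below when the program is built with BTF: the verifier checks that the register holding the first argument really is PTR_TO_CTX where the BTF says struct __sk_buff *, and that the second one is a scalar. If LLVM has optimized an argument of a static function away, the subprogram's type info is marked unreliable and the trampoline falls back to treating all arguments as u64. A hedged BPF C sketch (function and section names are illustrative; header paths as shipped with libbpf):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* callee: BTF records a pointer argument and a scalar argument */
static __attribute__((noinline))
int count_if_small(struct __sk_buff *skb, int limit)
{
	return skb->len < limit ? 1 : 0;
}

SEC("classifier")
int classify(struct __sk_buff *skb)
{
	/* R1 carries PTR_TO_CTX; must match the BTF type of arg 1 */
	return count_if_small(skb, 128);
}

char _license[] SEC("license") = "GPL";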
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-17-ast@kernel.org --- kernel/bpf/btf.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 18 +++++++++-- 3 files changed, 98 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4b7c8bd423d6..4620267b186e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3985,6 +3985,88 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_verifier_state *st = env->cur_state; + struct bpf_func_state *func = st->frame[st->curframe]; + struct bpf_reg_state *reg = func->regs; + struct bpf_verifier_log *log = &env->log; + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs, btf_id; + const char *tname; + + if (!prog->aux->func_info) + return 0; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return 0; + + if (prog->aux->func_info_aux[subprog].unreliable) + return 0; + + t = btf_type_by_id(btf, btf_id); + if (!t || !btf_type_is_func(t)) { + bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n", + subprog); + return -EINVAL; + } + tname = btf_name_by_offset(btf, t->name_off); + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid type of func %s\n", tname); + return -EINVAL; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > 5) { + bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); + goto out; + } + /* check that BTF function arguments match actual types that the + * verifier sees. + */ + for (i = 0; i < nargs; i++) { + t = btf_type_by_id(btf, args[i].type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (reg[i + 1].type == SCALAR_VALUE) + continue; + bpf_log(log, "R%d is not a scalar\n", i + 1); + goto out; + } + if (btf_type_is_ptr(t)) { + if (reg[i + 1].type == SCALAR_VALUE) { + bpf_log(log, "R%d is not a pointer\n", i + 1); + goto out; + } + /* If program is passing PTR_TO_CTX into subprogram + * check that BTF type matches. + */ + if (reg[i + 1].type == PTR_TO_CTX && + !btf_get_prog_ctx_type(log, btf, t, prog->type)) + goto out; + /* All other pointers are ok */ + continue; + } + bpf_log(log, "Unrecognized argument type %s\n", + btf_kind_str[BTF_INFO_KIND(t->info)]); + goto out; + } + return 0; +out: + /* LLVM optimizations can remove arguments from static functions. 
*/ + bpf_log(log, + "Type info disagrees with actual arguments due to compiler optimizations\n"); + prog->aux->func_info_aux[subprog].unreliable = true; + return 0; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 05a0ee75eca0..43ba647de720 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1328,6 +1328,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); kvfree(aux->func_info); + kfree(aux->func_info_aux); free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7395d6bebefd..11910149ca2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3970,6 +3970,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* only increment it after check_reg_arg() finished */ state->curframe++; + if (btf_check_func_arg_match(env, subprog)) + return -EINVAL; + /* and go analyze first insn of the callee */ *insn_idx = target_insn; @@ -6564,6 +6567,7 @@ static int check_btf_func(struct bpf_verifier_env *env, u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; + struct bpf_func_info_aux *info_aux = NULL; const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; @@ -6597,6 +6601,9 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); if (!krecord) return -ENOMEM; + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + if (!info_aux) + goto err_free; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -6648,29 +6655,31 @@ static int check_btf_func(struct bpf_verifier_env *env, ret = -EINVAL; goto err_free; } - prev_offset = krecord[i].insn_off; urecord += urec_size; } prog->aux->func_info = krecord; prog->aux->func_info_cnt = nfuncs; + prog->aux->func_info_aux = info_aux; return 0; err_free: kvfree(krecord); + kfree(info_aux); return ret; } static void adjust_btf_func(struct bpf_verifier_env *env) { + struct bpf_prog_aux *aux = env->prog->aux; int i; - if (!env->prog->aux->func_info) + if (!aux->func_info) return; for (i = 0; i < env->subprog_cnt; i++) - env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; + aux->func_info[i].insn_off = env->subprog_info[i].start; } #define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ @@ -7651,6 +7660,9 @@ static int do_check(struct bpf_verifier_env *env) 0 /* frameno */, 0 /* subprogno, zero == main subprog */); + if (btf_check_func_arg_match(env, 0)) + return -EINVAL; + for (;;) { struct bpf_insn *insn; u8 class; -- cgit From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. 
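As a rough user-space sketch (not part of this patch) of how the extended load interface described further below is meant to be driven, with placeholder fds and BTF ids and no error handling:

	#include <linux/bpf.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static __u64 ptr_to_u64(const void *p)
	{
		return (__u64)(unsigned long)p;
	}

	/* Load an FEXIT program that attaches to an already loaded BPF program
	 * (target_prog_fd) at the function identified by target_btf_id. */
	static int load_fexit_on_bpf_prog(const struct bpf_insn *insns, __u32 insn_cnt,
					  int target_prog_fd, __u32 target_btf_id)
	{
		union bpf_attr attr = {};

		attr.prog_type = BPF_PROG_TYPE_TRACING;
		attr.expected_attach_type = BPF_TRACE_FEXIT;
		attr.insns = ptr_to_u64(insns);
		attr.insn_cnt = insn_cnt;
		attr.license = ptr_to_u64("GPL");
		attr.attach_prog_fd = target_prog_fd;	/* target BPF program */
		attr.attach_btf_id = target_btf_id;	/* its main func or subprog */

		return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	}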
The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifiers sees the BPF trampoline has to fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd points to previously loaded BPF program the attach_btf_id is BTF type id of main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- kernel/bpf/btf.c | 70 ++++++++++++++++++++++++++++++++++++++----- kernel/bpf/core.c | 2 ++ kernel/bpf/syscall.c | 19 +++++++++--- kernel/bpf/verifier.c | 83 +++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 147 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4620267b186e..40efde5eedcb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3530,6 +3530,20 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *t, + enum bpf_prog_type prog_type) +{ + const struct btf_member *prog_ctx_type, *kern_ctx_type; + + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + if (!prog_ctx_type) + return -ENOENT; + kern_ctx_type = prog_ctx_type + 1; + return kern_ctx_type->type; +} + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; @@ -3602,15 +3616,29 @@ errout: return ERR_PTR(err); } +struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) +{ + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + + if (tgt_prog) { + return tgt_prog->aux->btf; + } else { + return btf_vmlinux; + } +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; + int ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3619,7 +3647,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - nr_args = btf_type_vlen(t); + /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ + nr_args = t ? btf_type_vlen(t) : 5; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -3628,18 +3657,24 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog->expected_attach_type == BPF_TRACE_FEXIT && arg == nr_args) { + if (!t) + /* Default prog with 5 args. 
6th arg is retval. */ + return true; /* function return type */ - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } else { - t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!t) + /* Default prog with 5 args */ + return true; + t = btf_type_by_id(btf, args[arg].type); } /* skip modifiers */ while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t)) /* accessing a scalar */ return true; @@ -3647,7 +3682,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, - __btf_name_by_offset(btf_vmlinux, t->name_off), + __btf_name_by_offset(btf, t->name_off), btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } @@ -3662,10 +3697,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; info->btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); + if (tgt_prog) { + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + if (ret > 0) { + info->btf_id = ret; + return true; + } else { + return false; + } + } + t = btf_type_by_id(btf, t->type); /* skip modifiers */ while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3674,7 +3718,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], - __btf_name_by_offset(btf_vmlinux, t->name_off)); + __btf_name_by_offset(btf, t->name_off)); return true; } @@ -3954,6 +3998,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, u32 i, nargs; int ret; + if (!func) { + /* BTF function prototype doesn't match the verifier types. + * Fall back to 5 u64 args. 
+ */ + for (i = 0; i < 5; i++) + m->arg_size[i] = 8; + m->ret_size = 8; + m->nr_args = 5; + return 0; + } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); if (nargs >= MAX_BPF_FUNC_ARGS) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da5a8b8e278f..b5945c3aaa8e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2031,6 +2031,8 @@ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; + if (aux->linked_prog) + bpf_prog_put(aux->linked_prog); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43ba647de720..c88c815c2154 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1577,7 +1577,7 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id) + u32 btf_id, u32 prog_fd) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: @@ -1585,7 +1585,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, return -EINVAL; break; default: - if (btf_id) + if (btf_id || prog_fd) return -EINVAL; break; } @@ -1636,7 +1636,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD attach_btf_id +#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1679,7 +1679,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id)) + attr->attach_btf_id, + attr->attach_prog_fd)) return -EINVAL; /* plain bpf_prog allocation */ @@ -1689,6 +1690,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; + if (attr->attach_prog_fd) { + struct bpf_prog *tgt_prog; + + tgt_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + goto free_prog_nouncharge; + } + prog->aux->linked_prog = tgt_prog; + } prog->aux->offload_requested = !!attr->prog_ifindex; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11910149ca2f..e9dc95a18d44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9390,13 +9390,17 @@ static void print_verification_stats(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; + bool conservative = true; const char *tname; - int ret = 0; + struct btf *btf; long addr; + u64 key; if (prog->type != BPF_PROG_TYPE_TRACING) return 0; @@ -9405,19 +9409,47 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) verbose(env, "Tracing programs must provide btf_id\n"); return -EINVAL; } - t = btf_type_by_id(btf_vmlinux, btf_id); + btf = bpf_prog_get_target_btf(prog); + if (!btf) { + verbose(env, + "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); + return -EINVAL; + } + t = btf_type_by_id(btf, btf_id); if (!t) { verbose(env, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } - tname 
= btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); if (!tname) { verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } + if (tgt_prog) { + struct bpf_prog_aux *aux = tgt_prog->aux; + + for (i = 0; i < aux->func_info_cnt; i++) + if (aux->func_info[i].type_id == btf_id) { + subprog = i; + break; + } + if (subprog == -1) { + verbose(env, "Subprog %s doesn't exist\n", tname); + return -EINVAL; + } + conservative = aux->func_info_aux[subprog].unreliable; + key = ((u64)aux->id) << 32 | btf_id; + } else { + key = btf_id; + } switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: + if (tgt_prog) { + verbose(env, + "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); + return -EINVAL; + } if (!btf_type_is_typedef(t)) { verbose(env, "attach_btf_id %u is not a typedef\n", btf_id); @@ -9429,11 +9461,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } tname += sizeof(prefix) - 1; - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) /* should never happen in valid vmlinux build */ return -EINVAL; - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ return -EINVAL; @@ -9452,30 +9484,51 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - tr = bpf_trampoline_lookup(btf_id); + tr = bpf_trampoline_lookup(key); if (!tr) return -ENOMEM; prog->aux->attach_func_name = tname; + /* t is either vmlinux type or another program's type */ prog->aux->attach_func_proto = t; mutex_lock(&tr->mutex); if (tr->func.addr) { prog->aux->trampoline = tr; goto out; } - ret = btf_distill_func_proto(&env->log, btf_vmlinux, t, + if (tgt_prog && conservative) { + prog->aux->attach_func_proto = NULL; + t = NULL; + } + ret = btf_distill_func_proto(&env->log, btf, t, tname, &tr->func.model); if (ret < 0) goto out; - addr = kallsyms_lookup_name(tname); - if (!addr) { - verbose(env, - "The address of function %s cannot be found\n", - tname); - ret = -ENOENT; - goto out; + if (tgt_prog) { + if (!tgt_prog->jited) { + /* for now */ + verbose(env, "Can trace only JITed BPF progs\n"); + ret = -EINVAL; + goto out; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { + /* prevent cycles */ + verbose(env, "Cannot recursively attach\n"); + ret = -EINVAL; + goto out; + } + addr = (long) tgt_prog->aux->func[subprog]->bpf_func; + } else { + addr = kallsyms_lookup_name(tname); + if (!addr) { + verbose(env, + "The address of function %s cannot be found\n", + tname); + ret = -ENOENT; + goto out; + } } tr->func.addr = (void *)addr; prog->aux->trampoline = tr; -- cgit From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. 
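Roughly (a sketch, not CRIU's actual code; fork() stands in for the clone() CRIU performs, and writing ns_last_pid requires CAP_SYS_ADMIN):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	static pid_t spawn_with_pid(pid_t want)
	{
		char buf[16];
		int fd, n;

		/* Ask the allocator to hand out 'want' next... */
		fd = open("/proc/sys/kernel/ns_last_pid", O_WRONLY);
		if (fd < 0)
			return -1;
		n = snprintf(buf, sizeof(buf), "%d", want - 1);
		if (write(fd, buf, n) != n) {
			close(fd);
			return -1;
		}
		close(fd);

		/* ...and hope no other task forks in between: this is the race. */
		return fork();
	}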
Extending clone3() to support *set_tid makes it possible restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger then the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 24 ++++++++++++++++- kernel/pid.c | 72 +++++++++++++++++++++++++++++++++++++++----------- kernel/pid_namespace.c | 2 -- 3 files changed, 80 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 954e875e72b1..417570263f1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, + args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; @@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, { int err; struct clone_args args; + pid_t *kset_tid = kargs->set_tid; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; @@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (err) return err; + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + return -EINVAL; + + if (unlikely(!args.set_tid && args.set_tid_size > 0)) + return -EINVAL; + + if (unlikely(args.set_tid && args.set_tid_size == 0)) + return -EINVAL; + /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal @@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, + .set_tid_size = args.set_tid_size, }; + if (args.set_tid && + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + (kargs->set_tid_size 
* sizeof(pid_t)))) + return -EFAULT; + + kargs->set_tid = kset_tid; + return 0; } @@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) int err; struct kernel_clone_args kargs; + pid_t set_tid[MAX_PID_NS_LEVEL]; + + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) diff --git a/kernel/pid.c b/kernel/pid.c index 7b5f6c963d72..2278e249141d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -157,7 +157,8 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size) { struct pid *pid; enum pid_type type; @@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns) struct upid *upid; int retval = -ENOMEM; + /* + * set_tid_size contains the size of the set_tid array. Starting at + * the most nested currently active PID namespace it tells alloc_pid() + * which PID to set for a process in that most nested PID namespace + * up to set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but set_tid_size must + * never be greater than the current ns->level + 1. + */ + if (set_tid_size > ns->level + 1) + return ERR_PTR(-EINVAL); + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); @@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { - int pid_min = 1; + int tid = 0; + + if (set_tid_size) { + tid = set_tid[ns->level - i]; + + retval = -EINVAL; + if (tid < 1 || tid >= pid_max) + goto out_free; + /* + * Also fail if a PID != 1 is requested and + * no PID 1 exists. + */ + if (tid != 1 && !tmp->child_reaper) + goto out_free; + retval = -EPERM; + if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + goto out_free; + set_tid_size--; + } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); - /* - * init really needs pid 1, but after reaching the maximum - * wrap back to RESERVED_PIDS - */ - if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) - pid_min = RESERVED_PIDS; - - /* - * Store a null pointer so find_pid_ns does not find - * a partially initialized PID (see below). - */ - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + if (tid) { + nr = idr_alloc(&tmp->idr, NULL, tid, + tid + 1, GFP_ATOMIC); + /* + * If ENOSPC is returned it means that the PID is + * alreay in use. Return EEXIST in that case. + */ + if (nr == -ENOSPC) + nr = -EEXIST; + } else { + int pid_min = 1; + /* + * init really needs pid 1, but after reaching the + * maximum wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + } spin_unlock_irq(&pidmap_lock); idr_preload_end(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a6a79f85c81a..d40017e79ebe 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -26,8 +26,6 @@ static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 /* Write once array, filled from the beginning. 
*/ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; -- cgit From 4a169a95d885fe5c050bac1a21d43c86ba955bcf Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Fri, 15 Nov 2019 15:11:49 -0700 Subject: genirq: Introduce irq_chip_get/set_parent_state calls On certain QTI chipsets some GPIOs are direct-connect interrupts to the GIC to be used as regular interrupt lines. When the GPIOs are not used for interrupt generation the interrupt line is disabled. But disabling the interrupt at GIC does not prevent the interrupt to be reported as pending at GIC_ISPEND. Later, when drivers call enable_irq() on the interrupt, an unwanted interrupt occurs. Introduce get and set methods for irqchip's parent to clear it's pending irq state. This then can be invoked by the GPIO interrupt controller on the parents in it hierarchy to clear the interrupt before enabling the interrupt. Signed-off-by: Maulik Shah Signed-off-by: Lina Iyer Signed-off-by: Marc Zyngier Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/1573855915-9841-7-git-send-email-ilina@codeaurora.org [updated commit text and minor code fixes] --- kernel/irq/chip.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b76703b2c0af..b3fa2d87d2f3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1297,6 +1297,50 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ +/** + * irq_chip_set_parent_state - set the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_set_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool val) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_set_irqchip_state) + return 0; + + return data->chip->irq_set_irqchip_state(data, which, val); +} +EXPORT_SYMBOL_GPL(irq_chip_set_parent_state); + +/** + * irq_chip_get_parent_state - get the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: one of IRQCHIP_STATE_* the caller wants to know + * @state: a pointer to a boolean where the state is to be stored + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_get_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_get_irqchip_state) + return 0; + + return data->chip->irq_get_irqchip_state(data, which, state); +} +EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit From 7763baace1b738d65efa46d68326c9406311c6bf Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 15 Nov 2019 10:39:08 +0000 Subject: sched/uclamp: Fix overzealous type replacement Some uclamp helpers had their return type changed from 'unsigned int' to 'enum uclamp_id' by commit 0413d7f33e60 ("sched/uclamp: Always use 'enum uclamp_id' for clamp_id values") but it happens that some do return a value in the [0, SCHED_CAPACITY_SCALE] range, which should really be unsigned int. The affected helpers are uclamp_none(), uclamp_rq_max_value() and uclamp_eff_value(). Fix those up. Note that this doesn't lead to any obj diff using a relatively recent aarch64 compiler (8.3-2019.03). 
The current code of e.g. uclamp_eff_value() properly returns an 11 bit value (bits_per(1024)) and doesn't seem to do anything funny. I'm still marking this as fixing the above commit to be on the safe side. Signed-off-by: Valentin Schneider Reviewed-by: Qais Yousef Acked-by: Vincent Guittot Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: patrick.bellasi@matbug.net Cc: qperret@google.com Cc: surenb@google.com Cc: tj@kernel.org Fixes: 0413d7f33e60 ("sched/uclamp: Always use 'enum uclamp_id' for clamp_id values") Link: https://lkml.kernel.org/r/20191115103908.27610-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/sched.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 513a4794ff36..3ceff1c93ef1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -810,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -853,7 +853,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, } static inline -enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; @@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) return uc_req; } -enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 05c282775f21..280a3c735935 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2300,7 +2300,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, -- cgit From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit (32k). Due to using 32-bit counter, it's possible in practice to overflow refcounter and make it wrap around to 0, causing erroneous map free, while there are still references to it, causing use-after-free problems. But having a failing refcounting operations are problematic in some cases. One example is mmap() interface. After establishing initial memory-mapping, user is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happening without any control from the users of mmap subsystem. 
Rather mmap subsystem sends notifications to original creator of memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain accurate refcount for bpf_map (see next patch in this series). The problem is that open() callback is not supposed to fail, because memory-mapped resource is set up and properly referenced. This is posing a problem for using memory-mapping with BPF maps. One solution to this is to maintain separate refcount for just memory-mappings and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach to solve this is to use non-failing refcount_t type, which uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX, stays there. This utlimately causes memory leak, but prevents use after free. But given refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies semantics and allows above described scenarios to not worry about failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char 
name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com --- kernel/bpf/inode.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/syscall.c | 51 +++++++++++++++++++++---------------------------- kernel/bpf/verifier.c | 6 +----- kernel/bpf/xskmap.c | 6 ++---- 5 files changed, 27 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index a70f7209cda3..2f17f24258dc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - raw = bpf_map_inc(raw, true); + bpf_map_inc_with_uref(raw); break; default: WARN_ON_ONCE(1); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index fab4fb134547..4cbe987be35b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -98,7 +98,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) - inner_map = bpf_map_inc(inner_map, false); + bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c88c815c2154..20030751b7a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -311,7 +311,7 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { - if (atomic_dec_and_test(&map->usercnt)) { + if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } @@ -322,7 +322,7 @@ static void bpf_map_put_uref(struct bpf_map *map) */ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { - if (atomic_dec_and_test(&map->refcnt)) { + if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); @@ -575,8 +575,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - atomic_set(&map->refcnt, 1); - atomic_set(&map->usercnt, 1); + atomic64_set(&map->refcnt, 1); + atomic64_set(&map->usercnt, 1); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -653,21 +653,19 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -/* prog's and map's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) +void bpf_map_inc(struct bpf_map *map) { - if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&map->refcnt); - return ERR_PTR(-EBUSY); - } - if (uref) - atomic_inc(&map->usercnt); - return map; + atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); +void bpf_map_inc_with_uref(struct bpf_map *map) +{ + atomic64_inc(&map->refcnt); + atomic64_inc(&map->usercnt); +} +EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { 
struct fd f = fdget(ufd); @@ -677,38 +675,30 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - map = bpf_map_inc(map, true); + bpf_map_inc_with_uref(map); fdput(f); return map; } /* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; - refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_map_put(map, false); - return ERR_PTR(-EBUSY); - } - + refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); - if (uref) - atomic_inc(&map->usercnt); + atomic64_inc(&map->usercnt); return map; } -struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, uref); + map = __bpf_map_inc_not_zero(map, false); spin_unlock_bh(&map_idr_lock); return map; @@ -1455,6 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } +/* prog's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9dc95a18d44..9f59f7a19dd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8179,11 +8179,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_used_maps() */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) { - fdput(f); - return PTR_ERR(map); - } + bpf_map_inc(map); aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index da16c30868f3..90c4fce1c981 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -11,10 +11,8 @@ int xsk_map_inc(struct xsk_map *map) { - struct bpf_map *m = &map->map; - - m = bpf_map_inc(m, false); - return PTR_ERR_OR_ZERO(m); + bpf_map_inc(&map->map); + return 0; } void xsk_map_put(struct xsk_map *map) -- cgit From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: bpf: Convert bpf_prog refcnt to atomic64_t Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove artificial 32k limit. This allows to make bpf_prog's refcounting non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running allyesconfig kernel build. 
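Schematically, the caller-side simplification (the inode.c hunk below is one instance of it):

	/* before: the bump could hit BPF_MAX_REFCNT and fail */
	prog = bpf_prog_inc(prog);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	/* after: a 64-bit refcount cannot realistically overflow, so it can't fail */
	bpf_prog_inc(prog);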
Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com --- kernel/bpf/inode.c | 5 +++-- kernel/bpf/syscall.c | 30 +++++++++--------------------- kernel/events/core.c | 7 ++----- 3 files changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2f17f24258dc..ecf42bec38c0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,7 +31,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - raw = bpf_prog_inc(raw); + bpf_prog_inc(raw); break; case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); @@ -534,7 +534,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (!bpf_prog_get_ok(prog, &type, false)) return ERR_PTR(-EINVAL); - return bpf_prog_inc(prog); + bpf_prog_inc(prog); + return prog; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 20030751b7a2..52fe4bacb330 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1339,7 +1339,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { + if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1445,16 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -/* prog's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) +void bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_sub(i, &prog->aux->refcnt); - return ERR_PTR(-EBUSY); - } - return prog; + atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); @@ -1465,13 +1458,13 @@ void bpf_prog_sub(struct bpf_prog *prog, int i) * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! */ - WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); + WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +void bpf_prog_inc(struct bpf_prog *prog) { - return bpf_prog_add(prog, 1); + atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); @@ -1480,12 +1473,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_prog_put(prog, false); - return ERR_PTR(-EBUSY); - } + refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); @@ -1523,7 +1511,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, goto out; } - prog = bpf_prog_inc(prog); + bpf_prog_inc(prog); out: fdput(f); return prog; @@ -1714,7 +1702,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->orig_prog = NULL; prog->jited = 0; - atomic_set(&prog->aux->refcnt, 1); + atomic64_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..73c616876597 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10477,12 +10477,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; -- cgit From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows to avoid typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accomodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration regarding how memory-mapping subsystem functions. Memory-mapping subsystem provides few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. Thus number of close() calls is equal to number of open() calls plus one more. 
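For illustration, a rough user-space sketch (not part of the patch) of the intended usage, using the raw syscall rather than a libbpf wrapper and uapi headers that carry the new BPF_F_MMAPABLE flag; error handling is minimal:

	#include <linux/bpf.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Create a plain mmap()'able array and return a writable view of its
	 * values. The initial mapping takes a map reference in bpf_map_mmap();
	 * any later splits/remaps go through the open()/close() callbacks
	 * described above. */
	static __u64 *map_array_rw(int *out_fd)
	{
		union bpf_attr attr = {};
		void *p;
		int fd;

		attr.map_type    = BPF_MAP_TYPE_ARRAY;
		attr.key_size    = sizeof(__u32);
		attr.value_size  = sizeof(__u64);
		attr.max_entries = 512;
		attr.map_flags   = BPF_F_MMAPABLE;

		fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
		if (fd < 0)
			return NULL;

		p = mmap(NULL, 512 * sizeof(__u64), PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			close(fd);
			return NULL;
		}
		*out_fd = fd;
		return p;	/* p[0] = 42; updates the map element directly */
	}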
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- kernel/bpf/arraymap.c | 58 ++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 148 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1c65ce0098a9..a42097c36b0c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,7 +14,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr) (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_flags & BPF_F_MMAPABLE) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } array_size = sizeof(*array); - if (percpu) + if (percpu) { array_size += (u64) max_entries * sizeof(void *); - else - array_size += (u64) max_entries * elem_size; + } else { + /* rely on vmalloc() to return page-aligned memory and + * ensure array->value is exactly page-aligned + */ + if (attr->map_flags & BPF_F_MMAPABLE) { + array_size = PAGE_ALIGN(array_size); + array_size += PAGE_ALIGN((u64) max_entries * elem_size); + } else { + array_size += (u64) max_entries * elem_size; + } + } /* make sure there is no u32 overflow later in round_up() */ cost = array_size; @@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + if (attr->map_flags & BPF_F_MMAPABLE) { + void *data; + + /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ + data = bpf_map_area_mmapable_alloc(array_size, numa_node); + if (!data) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + array = data + PAGE_ALIGN(sizeof(struct bpf_array)) + - offsetof(struct bpf_array, value); + } else { + array = bpf_map_area_alloc(array_size, numa_node); + } if (!array) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static void *array_map_vmalloc_addr(struct bpf_array *array) +{ + return (void *)round_down((unsigned long)array, PAGE_SIZE); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - bpf_map_area_free(array); + if (array->map.map_flags & BPF_F_MMAPABLE) + bpf_map_area_free(array_map_vmalloc_addr(array)); + else + bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } +int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> 
PAGE_SHIFT; + + if (!(map->map_flags & BPF_F_MMAPABLE)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = { .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, + .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 52fe4bacb330..bac3becf9f90 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size, int numa_node) +static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -142,18 +142,33 @@ void *bpf_map_area_alloc(size_t size, int numa_node) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + /* kmalloc()'ed memory can't be mmap()'ed */ + if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); if (area != NULL) return area; } - + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | + __GFP_RETRY_MAYFAIL | flags); + } return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, __builtin_return_address(0)); } +void *bpf_map_area_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, false); +} + +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, true); +} + void bpf_map_area_free(void *area) { kvfree(area); @@ -425,6 +440,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } +/* called for any extra memory-mapped regions (except initial) */ +static void bpf_map_mmap_open(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt++; + mutex_unlock(&map->freeze_mutex); + } +} + +/* called for all unmapped memory region (including initial) */ +static void bpf_map_mmap_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt--; + mutex_unlock(&map->freeze_mutex); + } + + bpf_map_put_with_uref(map); +} + +static const struct vm_operations_struct bpf_map_default_vmops = { + .open = bpf_map_mmap_open, + .close = bpf_map_mmap_close, +}; + +static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct bpf_map *map = filp->private_data; + int err; + + if (!map->ops->map_mmap || map_value_has_spin_lock(map)) + return -ENOTSUPP; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + mutex_lock(&map->freeze_mutex); + + if ((vma->vm_flags & VM_WRITE) && map->frozen) { + err = -EPERM; + goto out; + } + + /* set default open/close callbacks */ + vma->vm_ops = &bpf_map_default_vmops; + vma->vm_private_data = map; + + err = 
map->ops->map_mmap(map, vma); + if (err) + goto out; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) + map->writecnt++; +out: + mutex_unlock(&map->freeze_mutex); + return err; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -432,6 +515,7 @@ const struct file_operations bpf_map_fops = { .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, + .mmap = bpf_map_mmap, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -577,6 +661,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); + mutex_init(&map->freeze_mutex); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -1163,6 +1248,13 @@ static int map_freeze(const union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + + mutex_lock(&map->freeze_mutex); + + if (map->writecnt) { + err = -EBUSY; + goto err_put; + } if (READ_ONCE(map->frozen)) { err = -EBUSY; goto err_put; @@ -1174,6 +1266,7 @@ static int map_freeze(const union bpf_attr *attr) WRITE_ONCE(map->frozen, true); err_put: + mutex_unlock(&map->freeze_mutex); fdput(f); return err; } -- cgit From 3318544b721d3072fdd1f85ee0f1f214c0b211ee Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 22 Oct 2019 18:46:38 +0200 Subject: sched/fair: Fix rework of find_idlest_group() The task, for which the scheduler looks for the idlest group of CPUs, must be discounted from all statistics in order to get a fair comparison between groups. This includes utilization, load, nr_running and idle_cpus. Such unfairness can be easily highlighted with the unixbench execl 1 task. This test continuously call execve() and the scheduler looks for the idlest group/CPU on which it should place the task. Because the task runs on the local group/CPU, the latter seems already busy even if there is nothing else running on it. As a result, the scheduler will always select another group/CPU than the local one. This recovers most of the performance regression on my system from the recent load-balancer rewrite. [ mingo: Minor cleanups. ] Reported-by: kernel test robot Tested-by: kernel test robot Signed-off-by: Vincent Guittot Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: hdanton@sina.com Cc: parth@linux.ibm.com Cc: pauld@redhat.com Cc: quentin.perret@arm.com Cc: riel@surriel.com Cc: srikar@linux.vnet.ibm.com Cc: valentin.schneider@arm.com Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Link: https://lkml.kernel.org/r/1571762798-25900-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 81eba554db8d..2fc08e7d9cd6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5391,6 +5391,37 @@ static unsigned long cpu_load(struct rq *rq) return cfs_rq_load_avg(&rq->cfs); } +/* + * cpu_load_without - compute CPU load without any contributions from *p + * @cpu: the CPU which load is requested + * @p: the task which load should be discounted + * + * The load of a CPU is defined by the load of tasks currently enqueued on that + * CPU as well as tasks which are currently sleeping after an execution on that + * CPU. 
+ * + * This method returns the load of the specified CPU by discounting the load of + * the specified task, whenever the task is currently contributing to the CPU + * load. + */ +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int load; + + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_load(rq); + + cfs_rq = &rq->cfs; + load = READ_ONCE(cfs_rq->avg.load_avg); + + /* Discount task's util from CPU's util */ + lsub_positive(&load, task_h_load(p)); + + return load; +} + static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -8141,11 +8172,56 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) struct sg_lb_stats; +/* + * task_running_on_cpu - return 1 if @p is running on @cpu. + */ + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return 0; + + if (task_on_rq_queued(p)) + return 1; + + return 0; +} + +/** + * idle_cpu_without - would a given CPU be idle without p ? + * @cpu: the processor on which idleness is tested. + * @p: task which should be ignored. + * + * Return: 1 if the CPU would be idle. 0 otherwise. + */ +static int idle_cpu_without(int cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle && rq->curr != p) + return 0; + + /* + * rq->nr_running can't be used but an updated version without the + * impact of p on cpu must be used instead. The updated nr_running + * be computed and tested before calling idle_cpu_without(). + */ + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; +} + /* * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. - * @denv: The ched_domain level to look for idlest group. + * @sd: The sched_domain level to look for idlest group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. + * @p: The task for which we look for the idlest group/CPU. */ static inline void update_sg_wakeup_stats(struct sched_domain *sd, struct sched_group *group, @@ -8158,21 +8234,22 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, for_each_cpu(i, sched_group_span(group)) { struct rq *rq = cpu_rq(i); + unsigned int local; - sgs->group_load += cpu_load(rq); + sgs->group_load += cpu_load_without(rq, p); sgs->group_util += cpu_util_without(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + local = task_running_on_cpu(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; - nr_running = rq->nr_running; + nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; /* - * No need to call idle_cpu() if nr_running is not 0 + * No need to call idle_cpu_without() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu_without(i, p)) sgs->idle_cpus++; - } /* Check if task fits in the group */ -- cgit From a9723389cc759c891d481de271ac73eeaa123bcb Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 12 Nov 2019 15:50:43 +0100 Subject: sched/fair: Add comments for group_type and balancing at SD_NUMA level Add comments to describe each state of goup_type and to add some details about the load balance at NUMA level. [ Valentin Schneider: Updates to the comments. ] [ mingo: Other updates to the comments. 
] Reported-by: Mel Gorman Signed-off-by: Vincent Guittot Acked-by: Valentin Schneider Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1573570243-1903-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fc08e7d9cd6..1f93d96dd06b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6980,17 +6980,40 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; /* - * group_type describes the group of CPUs at the moment of the load balance. + * 'group_type' describes the group of CPUs at the moment of load balancing. + * * The enum is ordered by pulling priority, with the group with lowest priority - * first so the groupe_type can be simply compared when selecting the busiest - * group. see update_sd_pick_busiest(). + * first so the group_type can simply be compared when selecting the busiest + * group. See update_sd_pick_busiest(). */ enum group_type { + /* The group has spare capacity that can be used to run more tasks. */ group_has_spare = 0, + /* + * The group is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ group_fully_busy, + /* + * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity + * and must be migrated to a more powerful CPU. + */ group_misfit_task, + /* + * SD_ASYM_PACKING only: One local CPU with higher capacity is available, + * and the task should be migrated to it instead of running on the + * current CPU. + */ group_asym_packing, + /* + * The tasks' affinity constraints previously prevented the scheduler + * from balancing the load across the system. + */ group_imbalanced, + /* + * The CPU is overloaded and can't provide expected CPU cycles to all + * tasks. + */ group_overloaded }; @@ -8589,7 +8612,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * Try to use spare capacity of local group without overloading it or - * emptying busiest + * emptying busiest. + * XXX Spreading tasks across NUMA nodes is not always the best policy + * and special care should be taken for SD_NUMA domain level before + * spreading the tasks. For now, load_balance() fully relies on + * NUMA_BALANCING and fbq_classify_group/rq to override the decision. */ if (local->group_type == group_has_spare) { if (busiest->group_type > group_fully_busy) { -- cgit From bef69dd87828ef5d8ecdab8d857cd3a33cf98675 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 18 Nov 2019 14:21:19 +0100 Subject: sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util() update_cfs_rq_load_avg() calls cfs_rq_util_change() every time PELT decays, which might be inefficient when the cpufreq driver has rate limitation. When a task is attached on a CPU, we have this call path: update_load_avg() update_cfs_rq_load_avg() cfs_rq_util_change -- > trig frequency update attach_entity_load_avg() cfs_rq_util_change -- > trig frequency update The 1st frequency update will not take into account the utilization of the newly attached task and the 2nd one might be discarded because of rate limitation of the cpufreq driver. 
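A deliberately simplified user-space illustration of why the second request can be lost (the rate limiting really lives in the cpufreq governor/driver; the helper name, the 10 ms limit and the utilization numbers below are made up):

#include <stdio.h>

static unsigned long long now_ns;
static unsigned long long last_update_ns;
static unsigned int applied_util;

#define RATE_LIMIT_NS 10000000ULL	/* 10 ms, illustrative */

static void cpufreq_update(unsigned int util)
{
	if (now_ns - last_update_ns < RATE_LIMIT_NS)
		return;			/* request silently dropped */
	last_update_ns = now_ns;
	applied_util = util;
}

int main(void)
{
	now_ns = RATE_LIMIT_NS;		/* first request is accepted */

	cpufreq_update(100);	/* 1st update: without the attached task */
	cpufreq_update(400);	/* 2nd update: with the task, but dropped */

	printf("frequency request based on util=%u\n", applied_util);
	return 0;
}

The stale value (100) sticks until the next PELT update, which is the behaviour the change below avoids by issuing a single frequency update from the caller.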
update_cfs_rq_load_avg() is only called by update_blocked_averages() and update_load_avg() so we can move the call to cfs_rq_util_change/cpufreq_update_util() into these two functions. It's also interesting to note that update_load_avg() already calls cfs_rq_util_change() directly for the !SMP case. This change will also ensure that cpufreq_update_util() is called even when there is no more CFS rq in the leaf_cfs_rq_list to update, but only IRQ, RT or DL PELT signals. [ mingo: Minor updates. ] Reported-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Rafael J. Wysocki Cc: Linus Torvalds Cc: Thomas Gleixner Cc: juri.lelli@redhat.com Cc: linux-pm@vger.kernel.org Cc: mgorman@suse.de Cc: rostedt@goodmis.org Cc: sargun@sargun.me Cc: srinivas.pandruvada@linux.intel.com Cc: tj@kernel.org Cc: xiexiuqi@huawei.com Cc: xiezhipeng1@huawei.com Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path") Link: https://lkml.kernel.org/r/1574083279-799-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 111 +++++++++++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 209cad663adf..08a233e97a01 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3508,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed) - cfs_rq_util_change(cfs_rq, 0); - return decayed; } @@ -3620,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); update_tg_load_avg(cfs_rq, 0); - } else if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq, 0); + } } #ifndef CONFIG_64BIT @@ -7482,6 +7483,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7501,29 +7524,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) return true; } -static void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; - const struct sched_class *curr_class; - struct rq_flags rf; - bool done = true; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - - /* - * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure - * that RT, DL and IRQ signals have been updated before updating CFS. 
- */ - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; + bool decayed = false; + int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -7532,9 +7537,13 @@ static void update_blocked_averages(int cpu) for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq, 0); + if (cfs_rq == &rq->cfs) + decayed = true; + } + /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) @@ -7549,11 +7558,10 @@ static void update_blocked_averages(int cpu) /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) - done = false; + *done = false; } - update_blocked_load_status(rq, !done); - rq_unlock_irqrestore(rq, &rf); + return decayed; } /* @@ -7603,29 +7611,16 @@ static unsigned long task_h_load(struct task_struct *p) cfs_rq_load_avg(cfs_rq) + 1); } #else -static inline void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - const struct sched_class *curr_class; - struct rq_flags rf; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - - /* - * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure - * that RT, DL and IRQ signals have been updated before updating CFS. 
- */ - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); + bool decayed; - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; - update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); - rq_unlock_irqrestore(rq, &rf); + return decayed; } static unsigned long task_h_load(struct task_struct *p) @@ -7634,6 +7629,24 @@ static unsigned long task_h_load(struct task_struct *p) } #endif +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} + /********** Helpers for find_busiest_group ************************/ /* -- cgit From 36b3db03b4741b8935b68fffc7e69951d8d70a89 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 15 Nov 2019 18:08:18 +0200 Subject: perf/core: Fix the mlock accounting, again Commit: 5e6c3c7b1ec2 ("perf/aux: Fix tracking of auxiliary trace buffer allocation") tried to guess the correct combination of arithmetic operations that would undo the AUX buffer's mlock accounting, and failed, leaking the bottom part when an allocation needs to be charged partially to both user->locked_vm and mm->pinned_vm, eventually leaving the user with no locked bonus: $ perf record -e intel_pt//u -m1,128 uname [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.061 MB perf.data ] $ perf record -e intel_pt//u -m1,128 uname Permission error mapping pages. Consider increasing /proc/sys/kernel/perf_event_mlock_kb, or try again with a smaller value of -m/--mmap_pages. (current value: 1,128) Fix this by subtracting both locked and pinned counts when AUX buffer is unmapped. 
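As a rough user-space model of the accounting this restores (plain integers stand in for user->locked_vm and mm->pinned_vm; the helper names and the 128/32 split are made up for illustration): a buffer of nr pages is charged partly to locked_vm and partly to pinned_vm at mmap time, so the unmap path has to give both parts back rather than only one of them.

#include <assert.h>

struct acct {
	long locked;	/* models user->locked_vm */
	long pinned;	/* models mm->pinned_vm   */
};

/* mmap side: 'over' pages exceed the per-user mlock allowance and are
 * charged to pinned_vm; this is what rb->aux_mmap_locked remembers. */
static void aux_charge(struct acct *a, long nr, long over)
{
	a->locked += nr - over;
	a->pinned += over;
}

/* unmap side, as in the fix: undo both charges, not just one of them
 * depending on whether aux_mmap_locked happens to be zero. */
static void aux_uncharge(struct acct *a, long nr, long aux_mmap_locked)
{
	a->locked -= nr - aux_mmap_locked;
	a->pinned -= aux_mmap_locked;
}

int main(void)
{
	struct acct a = { 0, 0 };

	aux_charge(&a, 128, 32);	/* 96 pages locked, 32 pinned */
	aux_uncharge(&a, 128, 32);
	assert(a.locked == 0 && a.pinned == 0);	/* nothing leaks */
	return 0;
}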
Reported-by: Thomas Richter Tested-by: Thomas Richter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 00a014670ed0..8f66a4833ded 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5607,10 +5607,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - if (!rb->aux_mmap_locked) - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - else - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); -- cgit From ea806eb3eab35528b578a061b2c4b28f0f92c465 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 17 Nov 2019 17:04:15 -0500 Subject: ftrace: Add a helper function to modify_ftrace_direct() to allow arch optimization If a direct ftrace callback is at a location that does not have any other ftrace helpers attached to it, it is possible to simply just change the text to call the new caller (if the architecture supports it). But this requires special architecture code. Currently, modify_ftrace_direct() uses a trick to add a stub ftrace callback to the location forcing it to call the ftrace iterator. Then it can change the direct helper to call the new function in C, and then remove the stub. Removing the stub will have the location now call the new location that the direct helper is using. The new helper function does the registering the stub trick, but is a weak function, allowing an architecture to override it to do something a bit more direct. 
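A minimal module-style sketch of the call sequence this helper sits under; it is not the in-tree sample, the choice of wake_up_process() as the patched call site is only an example, and my_tramp1/my_tramp2 are assumed to be architecture-specific assembly trampolines provided elsewhere:

#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/sched.h>

extern void my_tramp1(void);	/* assumed asm trampolines */
extern void my_tramp2(void);

static unsigned long my_ip;	/* call site being redirected */
static unsigned long cur_tramp;

static int __init direct_demo_init(void)
{
	int ret;

	my_ip = (unsigned long)wake_up_process;
	cur_tramp = (unsigned long)my_tramp1;
	ret = register_ftrace_direct(my_ip, cur_tramp);
	if (ret)
		return ret;

	/*
	 * Retarget the same call site.  If nothing else traces
	 * wake_up_process(), an architecture that overrides
	 * ftrace_modify_direct_caller() can patch the call directly
	 * instead of taking the temporary stub_ops detour.
	 */
	ret = modify_ftrace_direct(my_ip, cur_tramp,
				   (unsigned long)my_tramp2);
	if (!ret)
		cur_tramp = (unsigned long)my_tramp2;
	return 0;
}

static void __exit direct_demo_exit(void)
{
	unregister_ftrace_direct(my_ip, cur_tramp);
}

module_init(direct_demo_init);
module_exit(direct_demo_exit);
MODULE_LICENSE("GPL");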
Link: https://lore.kernel.org/r/20191115215125.mbqv7taqnx376yed@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 120 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef79c8393f53..caae523f4ef3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,12 +1020,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; - unsigned long direct; /* for direct lookup only */ -}; - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -5112,7 +5106,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, + struct dyn_ftrace **recp) { struct ftrace_func_entry *entry; struct dyn_ftrace *rec; @@ -5132,6 +5127,9 @@ static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) /* Passed in ip just needs to be on the call site */ *ip = rec->ip; + if (recp) + *recp = rec; + return entry; } @@ -5143,7 +5141,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5179,6 +5177,75 @@ static struct ftrace_ops stub_ops = { .func = ftrace_stub, }; +/** + * ftrace_modify_direct_caller - modify ftrace nop directly + * @entry: The ftrace hash entry of the direct helper for @rec + * @rec: The record representing the function site to patch + * @old_addr: The location that the site at @rec->ip currently calls + * @new_addr: The location that the site at @rec->ip should call + * + * An architecture may overwrite this function to optimize the + * changing of the direct callback on an ftrace nop location. + * This is called with the ftrace_lock mutex held, and no other + * ftrace callbacks are on the associated record (@rec). Thus, + * it is safe to modify the ftrace record, where it should be + * currently calling @old_addr directly, to call @new_addr. + * + * Safety checks should be made to make sure that the code at + * @rec->ip is currently calling @old_addr. And this must + * also update entry->direct to @new_addr. + */ +int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + unsigned long ip = rec->ip; + int ret; + + /* + * The ftrace_lock was used to determine if the record + * had more than one registered user to it. If it did, + * we needed to prevent that from changing to do the quick + * switch. But if it did not (only a direct caller was attached) + * then this function is called. But this function can deal + * with attached callers to the rec that we care about, and + * since this function uses standard ftrace calls that take + * the ftrace_lock mutex, we need to release it. + */ + mutex_unlock(&ftrace_lock); + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. 
+ */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_lock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_lock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + out_lock: + mutex_lock(&ftrace_lock); + + return ret; +} + /** * modify_ftrace_direct - Modify an existing direct call to call something else * @ip: The instruction pointer to modify @@ -5197,11 +5264,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + mutex_lock(&ftrace_lock); + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5210,33 +5279,20 @@ int modify_ftrace_direct(unsigned long ip, goto out_unlock; /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. - * This means that @ip does not call the direct call, and - * we can simply modify it. + * If there's no other ftrace callback on the rec->ip location, + * then it can be changed directly by the architecture. + * If there is another caller, then we just need to change the + * direct caller helper to point to @new_addr. */ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_unlock; - - ret = register_ftrace_function(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_unlock; + if (ftrace_rec_count(rec) == 1) { + ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); + } else { + entry->direct = new_addr; + ret = 0; } - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. - */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - ret = 0; - out_unlock: + mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); return ret; } -- cgit From 46f9469247c6f4697cbbf37e4b3961120bf07f29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Nov 2019 10:41:29 -0500 Subject: ftrace: Rename ftrace_graph_stub to ftrace_stub_graph The ftrace_graph_stub was created and points to ftrace_stub as a way to assign the functon graph tracer function pointer to a stub function with a different prototype than what ftrace_stub has and not trigger the C verifier. The ftrace_graph_stub was created via the linker script vmlinux.lds.h. Unfortunately, powerpc already uses the name ftrace_graph_stub for its internal implementation of the function graph tracer, and even though powerpc would still build, the change via the linker script broke function tracer on powerpc from working. By using the name ftrace_stub_graph, which does not exist anywhere else in the kernel, this should not be a problem. 
Link: https://lore.kernel.org/r/1573849732.5937.136.camel@lca.pw Fixes: b83b43ffc6e4 ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Reorted-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fa3ce10d0405..67e0c462b059 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -336,10 +336,10 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_graph_stub(struct ftrace_graph_ret *); +extern void ftrace_stub_graph(struct ftrace_graph_ret *); /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -619,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = ftrace_graph_stub; + ftrace_graph_return = ftrace_stub_graph; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit From b2e2f0e6a6f910c906c083584b6e0afd12266f22 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 19 Nov 2019 22:21:13 +0800 Subject: bpf: Make array_map_mmap static Fix sparse warning: kernel/bpf/arraymap.c:481:5: warning: symbol 'array_map_mmap' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191119142113.15388-1-yuehaibing@huawei.com --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a42097c36b0c..633c8c701ff6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -478,7 +478,7 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } -int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; -- cgit From ba31c1a48538992316cc71ce94fa9cd3e7b427c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:36 +0100 Subject: futex: Move futex exit handling into futex code The futex exit handling is #ifdeffed into mm_release() which is not pretty to begin with. But upcoming changes to address futex exit races need to add more functionality to this exit code. Split it out into a function, move it into futex code and make the various futex exit functions static. Preparatory only and no functional change. Folded build fix from Borislav. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.049705556@linutronix.de --- kernel/fork.c | 25 +++---------------------- kernel/futex.c | 33 +++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..bd7c218691d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1286,20 +1286,7 @@ static int wait_for_vfork_done(struct task_struct *child, void mm_release(struct task_struct *tsk, struct mm_struct *mm) { /* Get rid of any futexes when releasing the mm */ -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } -#endif - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); -#endif + futex_mm_release(tsk); uprobe_free_utask(tsk); @@ -2062,14 +2049,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_BLOCK p->plug = NULL; #endif -#ifdef CONFIG_FUTEX - p->robust_list = NULL; -#ifdef CONFIG_COMPAT - p->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&p->pi_state_list); - p->pi_state_cache = NULL; -#endif + futex_init_task(p); + /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/futex.c b/kernel/futex.c index 49eaf5be851a..f8f00d47c821 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -325,6 +325,12 @@ static inline bool should_fail_futex(bool fshared) } #endif /* CONFIG_FAIL_FUTEX */ +#ifdef CONFIG_COMPAT +static void compat_exit_robust_list(struct task_struct *curr); +#else +static inline void compat_exit_robust_list(struct task_struct *curr) { } +#endif + static inline void futex_get_mm(union futex_key *key) { mmgrab(key->private.mm); @@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ -void exit_pi_state_list(struct task_struct *curr) +static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; @@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr) } raw_spin_unlock_irq(&curr->pi_lock); } - +#else +static inline void exit_pi_state_list(struct task_struct *curr) { } #endif /* @@ -3588,7 +3595,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, * * We silently return on any sign of list-walking problem. 
*/ -void exit_robust_list(struct task_struct *curr) +static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; @@ -3653,6 +3660,24 @@ void exit_robust_list(struct task_struct *curr) } } +void futex_mm_release(struct task_struct *tsk) +{ + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); + tsk->robust_list = NULL; + } + +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) { + compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } +#endif + + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); +} + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { @@ -3780,7 +3805,7 @@ static void __user *futex_uaddr(struct robust_list __user *entry, * * We silently return on any sign of list-walking problem. */ -void compat_exit_robust_list(struct task_struct *curr) +static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; -- cgit From 3d4775df0a89240f671861c6ab6e8d59af8e9e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:37 +0100 Subject: futex: Replace PF_EXITPIDONE with a state The futex exit handling relies on PF_ flags. That's suboptimal as it requires a smp_mb() and an ugly lock/unlock of the exiting tasks pi_lock in the middle of do_exit() to enforce the observability of PF_EXITING in the futex code. Add a futex_state member to task_struct and convert the PF_EXITPIDONE logic over to the new state. The PF_EXITING dependency will be cleaned up in a later step. This prepares for handling various futex exit issues later. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.149449274@linutronix.de --- kernel/exit.c | 18 ++---------------- kernel/futex.c | 25 +++++++++++++------------ 2 files changed, 15 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..d11bdcaac2e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,16 +746,7 @@ void __noreturn do_exit(long code) */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; + futex_exit_done(tsk); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } @@ -846,12 +837,7 @@ void __noreturn do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. 
- */ - tsk->flags |= PF_EXITPIDONE; + futex_exit_done(tsk); if (tsk->io_context) exit_io_context(tsk); diff --git a/kernel/futex.c b/kernel/futex.c index f8f00d47c821..41c75277d7d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, u32 uval2; /* - * If PF_EXITPIDONE is not yet set, then try again. + * If the futex exit state is not yet FUTEX_STATE_DEAD, wait + * for it to finish. */ - if (tsk && !(tsk->flags & PF_EXITPIDONE)) + if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) return -EAGAIN; /* @@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * *uaddr = 0xC0000000; tsk = get_task(PID); * } if (!tsk->flags & PF_EXITING) { * ... attach(); - * tsk->flags |= PF_EXITPIDONE; } else { - * if (!(tsk->flags & PF_EXITPIDONE)) + * tsk->futex_state = } else { + * FUTEX_STATE_DEAD; if (tsk->futex_state != + * FUTEX_STATE_DEAD) * return -EAGAIN; * return -ESRCH; <--- FAIL * } @@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, } /* - * We need to look at the task state flags to figure out, - * whether the task is exiting. To protect against the do_exit - * change of the task flags, we do this protected by - * p->pi_lock: + * We need to look at the task state to figure out, whether the + * task is exiting. To protect against the change of the task state + * in futex_exit_release(), we do this protected by p->pi_lock: */ raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->flags & PF_EXITING)) { + if (unlikely(p->futex_state != FUTEX_STATE_OK)) { /* - * The task is on the way out. When PF_EXITPIDONE is - * set, we know that the task has finished the - * cleanup: + * The task is on the way out. When the futex state is + * FUTEX_STATE_DEAD, we know that the task has finished + * the cleanup: */ int ret = handle_exit_race(uaddr, uval, p); -- cgit From 4610ba7ad877fafc0a25a30c6c82015304120426 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:38 +0100 Subject: exit/exec: Seperate mm_release() mm_release() contains the futex exit handling. mm_release() is called from do_exit()->exit_mm() and from exec()->exec_mm(). In the exit_mm() case PF_EXITING and the futex state is updated. In the exec_mm() case these states are not touched. As the futex exit code needs further protections against exit races, this needs to be split into two functions. Preparatory only, no functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.240518241@linutronix.de --- kernel/exit.c | 2 +- kernel/fork.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d11bdcaac2e1..cd893b530902 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -437,7 +437,7 @@ static void exit_mm(void) struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(current, mm); + exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); diff --git a/kernel/fork.c b/kernel/fork.c index bd7c218691d4..096f9d840bb8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1283,7 +1283,7 @@ static int wait_for_vfork_done(struct task_struct *child, * restoring the old one. . . 
* Eric Biederman 10 January 1998 */ -void mm_release(struct task_struct *tsk, struct mm_struct *mm) +static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { /* Get rid of any futexes when releasing the mm */ futex_mm_release(tsk); @@ -1320,6 +1320,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } +void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + mm_release(tsk, mm); +} + +void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + mm_release(tsk, mm); +} + /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. -- cgit From 150d71584b12809144b8145b817e83b81158ae5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:39 +0100 Subject: futex: Split futex_mm_release() for exit/exec To allow separate handling of the futex exit state in the futex exit code for exit and exec, split futex_mm_release() into two functions and invoke them from the corresponding exit/exec_mm_release() callsites. Preparatory only, no functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.332094221@linutronix.de --- kernel/fork.c | 5 ++--- kernel/futex.c | 7 ++++++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 096f9d840bb8..f1eb4d1f1a3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1285,9 +1285,6 @@ static int wait_for_vfork_done(struct task_struct *child, */ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - /* Get rid of any futexes when releasing the mm */ - futex_mm_release(tsk); - uprobe_free_utask(tsk); /* Get rid of any cached register state */ @@ -1322,11 +1319,13 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) { + futex_exit_release(tsk); mm_release(tsk, mm); } void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) { + futex_exec_release(tsk); mm_release(tsk, mm); } diff --git a/kernel/futex.c b/kernel/futex.c index 41c75277d7d1..909e4d3c3099 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3661,7 +3661,7 @@ static void exit_robust_list(struct task_struct *curr) } } -void futex_mm_release(struct task_struct *tsk) +void futex_exec_release(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); @@ -3679,6 +3679,11 @@ void futex_mm_release(struct task_struct *tsk) exit_pi_state_list(tsk); } +void futex_exit_release(struct task_struct *tsk) +{ + futex_exec_release(tsk); +} + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { -- cgit From f24f22435dcc11389acc87e5586239c1819d217c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:40 +0100 Subject: futex: Set task::futex_state to DEAD right after handling futex exit Setting task::futex_state in do_exit() is rather arbitrarily placed for no reason. Move it into the futex code. Note, this is only done for the exit cleanup as the exec cleanup cannot set the state to FUTEX_STATE_DEAD because the task struct is still in active use. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.439511191@linutronix.de --- kernel/exit.c | 1 - kernel/futex.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index cd893b530902..f3b8fa1b8945 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -837,7 +837,6 @@ void __noreturn do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(); - futex_exit_done(tsk); if (tsk->io_context) exit_io_context(tsk); diff --git a/kernel/futex.c b/kernel/futex.c index 909e4d3c3099..426dd71e170d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3682,6 +3682,7 @@ void futex_exec_release(struct task_struct *tsk) void futex_exit_release(struct task_struct *tsk) { futex_exec_release(tsk); + futex_exit_done(tsk); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, -- cgit From 18f694385c4fd77a09851fd301236746ca83f3cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:41 +0100 Subject: futex: Mark the begin of futex exit explicitly Instead of relying on PF_EXITING use an explicit state for the futex exit and set it in the futex exit function. This moves the smp barrier and the lock/unlock serialization into the futex code. As with the DEAD state this is restricted to the exit path as exec continues to use the same task struct. This allows to simplify that logic in a next step. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.539409004@linutronix.de --- kernel/exit.c | 13 +------------ kernel/futex.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f3b8fa1b8945..d351fd09e739 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,23 +746,12 @@ void __noreturn do_exit(long code) */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); - futex_exit_done(tsk); + futex_exit_recursive(tsk); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_signals(tsk); /* sets PF_EXITING */ - /* - * Ensure that all new tsk->pi_lock acquisitions must observe - * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). - */ - smp_mb(); - /* - * Ensure that we must observe the pi_state in exit_mm() -> - * mm_release() -> exit_pi_state_list(). - */ - raw_spin_lock_irq(&tsk->pi_lock); - raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", diff --git a/kernel/futex.c b/kernel/futex.c index 426dd71e170d..3488fb024a20 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3679,10 +3679,45 @@ void futex_exec_release(struct task_struct *tsk) exit_pi_state_list(tsk); } +/** + * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD + * @tsk: task to set the state on + * + * Set the futex exit state of the task lockless. The futex waiter code + * observes that state when a task is exiting and loops until the task has + * actually finished the futex cleanup. The worst case for this is that the + * waiter runs through the wait loop until the state becomes visible. + * + * This is called from the recursive fault handling path in do_exit(). + * + * This is best effort. Either the futex exit code has run already or + * not. 
If the OWNER_DIED bit has been set on the futex then the waiter can + * take it over. If not, the problem is pushed back to user space. If the + * futex exit code did not run yet, then an already queued waiter might + * block forever, but there is nothing which can be done about that. + */ +void futex_exit_recursive(struct task_struct *tsk) +{ + tsk->futex_state = FUTEX_STATE_DEAD; +} + void futex_exit_release(struct task_struct *tsk) { + tsk->futex_state = FUTEX_STATE_EXITING; + /* + * Ensure that all new tsk->pi_lock acquisitions must observe + * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner(). + */ + smp_mb(); + /* + * Ensure that we must observe the pi_state in exit_pi_state_list(). + */ + raw_spin_lock_irq(&tsk->pi_lock); + raw_spin_unlock_irq(&tsk->pi_lock); + futex_exec_release(tsk); - futex_exit_done(tsk); + + tsk->futex_state = FUTEX_STATE_DEAD; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, -- cgit From 4a8e991b91aca9e20705d434677ac013974e0e30 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:42 +0100 Subject: futex: Sanitize exit state handling Instead of having a smp_mb() and an empty lock/unlock of task::pi_lock move the state setting into to the lock section. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.645603214@linutronix.de --- kernel/futex.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3488fb024a20..f618562b4f5f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3703,16 +3703,19 @@ void futex_exit_recursive(struct task_struct *tsk) void futex_exit_release(struct task_struct *tsk) { - tsk->futex_state = FUTEX_STATE_EXITING; - /* - * Ensure that all new tsk->pi_lock acquisitions must observe - * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner(). - */ - smp_mb(); /* - * Ensure that we must observe the pi_state in exit_pi_state_list(). + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. + * + * This ensures that all subsequent checks of tsk->futex_state in + * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with + * tsk->pi_lock held. + * + * It guarantees also that a pi_state which was queued right before + * the state change under tsk->pi_lock by a concurrent waiter must + * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); + tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); futex_exec_release(tsk); -- cgit From af8cbda2cfcaa5515d61ec500498d46e9a8247e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:43 +0100 Subject: futex: Provide state handling for exec() as well exec() attempts to handle potentially held futexes gracefully by running the futex exit handling code like exit() does. The current implementation has no protection against concurrent incoming waiters. The reason is that the futex state cannot be set to FUTEX_STATE_DEAD after the cleanup because the task struct is still active and just about to execute the new binary. While its arguably buggy when a task holds a futex over exec(), for consistency sake the state handling can at least cover the actual futex exit cleanup section. This provides state consistency protection accross the cleanup. 
As the futex state of the task becomes FUTEX_STATE_OK after the cleanup has been finished, this cannot prevent subsequent attempts to attach to the task in case that the cleanup was not successfull in mopping up all leftovers. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.753355618@linutronix.de --- kernel/futex.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f618562b4f5f..0c9850af2724 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3661,7 +3661,7 @@ static void exit_robust_list(struct task_struct *curr) } } -void futex_exec_release(struct task_struct *tsk) +static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); @@ -3701,7 +3701,7 @@ void futex_exit_recursive(struct task_struct *tsk) tsk->futex_state = FUTEX_STATE_DEAD; } -void futex_exit_release(struct task_struct *tsk) +static void futex_cleanup_begin(struct task_struct *tsk) { /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. @@ -3717,10 +3717,40 @@ void futex_exit_release(struct task_struct *tsk) raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); +} - futex_exec_release(tsk); +static void futex_cleanup_end(struct task_struct *tsk, int state) +{ + /* + * Lockless store. The only side effect is that an observer might + * take another loop until it becomes visible. + */ + tsk->futex_state = state; +} - tsk->futex_state = FUTEX_STATE_DEAD; +void futex_exec_release(struct task_struct *tsk) +{ + /* + * The state handling is done for consistency, but in the case of + * exec() there is no way to prevent futher damage as the PID stays + * the same. But for the unlikely and arguably buggy case that a + * futex is held on exec(), this provides at least as much state + * consistency protection which is possible. + */ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + /* + * Reset the state to FUTEX_STATE_OK. The task is alive and about + * exec a new binary. + */ + futex_cleanup_end(tsk, FUTEX_STATE_OK); +} + +void futex_exit_release(struct task_struct *tsk) +{ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, -- cgit From 3f186d974826847a07bc7964d79ec4eded475ad9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:44 +0100 Subject: futex: Add mutex around futex exit The mutex will be used in subsequent changes to replace the busy looping of a waiter when the futex owner is currently executing the exit cleanup to prevent a potential live lock. 
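A compressed user-space model of the handshake this mutex enables, with a pthread mutex standing in for futex_exit_mutex (toy code: the real ordering additionally relies on tsk->pi_lock and memory barriers, and the waiter side is only wired up by the follow-up patches below):

#include <pthread.h>
#include <stdio.h>

enum { FUTEX_STATE_OK, FUTEX_STATE_EXITING, FUTEX_STATE_DEAD };

static int futex_state = FUTEX_STATE_OK;
static pthread_mutex_t futex_exit_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *exiting_owner(void *arg)
{
	pthread_mutex_lock(&futex_exit_mutex);		/* cleanup begin */
	__atomic_store_n(&futex_state, FUTEX_STATE_EXITING, __ATOMIC_SEQ_CST);

	/* robust list / PI state cleanup would run here */

	__atomic_store_n(&futex_state, FUTEX_STATE_DEAD, __ATOMIC_SEQ_CST);
	pthread_mutex_unlock(&futex_exit_mutex);	/* cleanup end */
	return NULL;
}

static void *waiter(void *arg)
{
	for (;;) {
		int state = __atomic_load_n(&futex_state, __ATOMIC_SEQ_CST);

		if (state == FUTEX_STATE_DEAD) {
			puts("owner dead, waiter can take over the futex");
			return NULL;
		}
		if (state == FUTEX_STATE_EXITING) {
			/* Block until the cleanup has finished instead of
			 * spinning on the retry path (the live lock case). */
			pthread_mutex_lock(&futex_exit_mutex);
			pthread_mutex_unlock(&futex_exit_mutex);
		}
		/* else: retry the normal attach path */
	}
}

int main(void)
{
	pthread_t owner, blocked;

	pthread_create(&owner, NULL, exiting_owner, NULL);
	pthread_create(&blocked, NULL, waiter, NULL);
	pthread_join(owner, NULL);
	pthread_join(blocked, NULL);
	return 0;
}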
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.845798895@linutronix.de --- kernel/futex.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0c9850af2724..46a81e611065 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3698,11 +3698,22 @@ static void futex_cleanup(struct task_struct *tsk) */ void futex_exit_recursive(struct task_struct *tsk) { + /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ + if (tsk->futex_state == FUTEX_STATE_EXITING) + mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { + /* + * Prevent various race issues against a concurrent incoming waiter + * including live locks by forcing the waiter to block on + * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in + * attach_to_pi_owner(). + */ + mutex_lock(&tsk->futex_exit_mutex); + /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * @@ -3726,6 +3737,11 @@ static void futex_cleanup_end(struct task_struct *tsk, int state) * take another loop until it becomes visible. */ tsk->futex_state = state; + /* + * Drop the exit protection. This unblocks waiters which observed + * FUTEX_STATE_EXITING to reevaluate the state. + */ + mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) -- cgit From ac31c7ff8624409ba3c4901df9237a616c187a5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:45 +0100 Subject: futex: Provide distinct return value when owner is exiting attach_to_pi_owner() returns -EAGAIN for various cases: - Owner task is exiting - Futex value has changed The caller drops the held locks (hash bucket, mmap_sem) and retries the operation. In case of the owner task exiting this can result in a live lock. As a preparatory step for seperating those cases, provide a distinct return value (EBUSY) for the owner exiting case. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191106224556.935606117@linutronix.de --- kernel/futex.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 46a81e611065..4f9d7a4b6dbf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1182,11 +1182,11 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, u32 uval2; /* - * If the futex exit state is not yet FUTEX_STATE_DEAD, wait - * for it to finish. + * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the + * caller that the alleged owner is busy. */ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) - return -EAGAIN; + return -EBUSY; /* * Reread the user space value to handle the following situation: @@ -2092,12 +2092,13 @@ retry_private: if (!ret) goto retry; goto out; + case -EBUSY: case -EAGAIN: /* * Two reasons for this: - * - Owner is exiting and we just wait for the + * - EBUSY: Owner is exiting and we just wait for the * exit to complete. - * - The user space value changed. + * - EAGAIN: The user space value changed. 
*/ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -2843,12 +2844,13 @@ retry_private: goto out_unlock_put_key; case -EFAULT: goto uaddr_faulted; + case -EBUSY: case -EAGAIN: /* * Two reasons for this: - * - Task is exiting and we just wait for the + * - EBUSY: Task is exiting and we just wait for the * exit to complete. - * - The user space value changed. + * - EAGAIN: The user space value changed. */ queue_unlock(hb); put_futex_key(&q.key); -- cgit From 3ef240eaff36b8119ac9e2ea17cbf41179c930ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Nov 2019 22:55:46 +0100 Subject: futex: Prevent exit livelock Oleg provided the following test case: int main(void) { struct sched_param sp = {}; sp.sched_priority = 2; assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0); int lock = vfork(); if (!lock) { sp.sched_priority = 1; assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0); _exit(0); } syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0); return 0; } This creates an unkillable RT process spinning in futex_lock_pi() on a UP machine or if the process is affine to a single CPU. The reason is: parent child set FIFO prio 2 vfork() -> set FIFO prio 1 implies wait_for_child() sched_setscheduler(...) exit() do_exit() .... mm_release() tsk->futex_state = FUTEX_STATE_EXITING; exit_futex(); (NOOP in this case) complete() --> wakes parent sys_futex() loop infinite because tsk->futex_state == FUTEX_STATE_EXITING The same problem can happen just by regular preemption as well: task holds futex ... do_exit() tsk->futex_state = FUTEX_STATE_EXITING; --> preemption (unrelated wakeup of some other higher prio task, e.g. timer) switch_to(other_task) return to user sys_futex() loop infinite as above Just for the fun of it the futex exit cleanup could trigger the wakeup itself before the task sets its futex state to DEAD. To cure this, the handling of the exiting owner is changed so: - A refcount is held on the task - The task pointer is stored in a caller visible location - The caller drops all locks (hash bucket, mmap_sem) and blocks on task::futex_exit_mutex. When the mutex is acquired then the exiting task has completed the cleanup and the state is consistent and can be reevaluated. This is not a pretty solution, but there is no choice other than returning an error code to user space, which would break the state consistency guarantee and open another can of problems including regressions. For stable backports the preparatory commits ac31c7ff8624 .. ba31c1a48538 are required as well, but for anything older than 5.3.y the backports are going to be provided when this hits mainline as the other dependencies for those kernels are definitely not stable material. Fixes: 778e9a9c3e71 ("pi-futex: fix exit races and locking problems") Reported-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: Stable Team Link: https://lkml.kernel.org/r/20191106224557.041676471@linutronix.de --- kernel/futex.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4f9d7a4b6dbf..03c518e9747e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1176,6 +1176,36 @@ out_error: return ret; } +/** + * wait_for_owner_exiting - Block until the owner has exited + * @exiting: Pointer to the exiting task + * + * Caller must hold a refcount on @exiting. 
+ */ +static void wait_for_owner_exiting(int ret, struct task_struct *exiting) +{ + if (ret != -EBUSY) { + WARN_ON_ONCE(exiting); + return; + } + + if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) + return; + + mutex_lock(&exiting->futex_exit_mutex); + /* + * No point in doing state checking here. If the waiter got here + * while the task was in exec()->exec_futex_release() then it can + * have any FUTEX_STATE_* value when the waiter has acquired the + * mutex. OK, if running, EXITING or DEAD if it reached exit() + * already. Highly unlikely and not a problem. Just one more round + * through the futex maze. + */ + mutex_unlock(&exiting->futex_exit_mutex); + + put_task_struct(exiting); +} + static int handle_exit_race(u32 __user *uaddr, u32 uval, struct task_struct *tsk) { @@ -1237,7 +1267,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * it after doing proper sanity checks. */ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, - struct futex_pi_state **ps) + struct futex_pi_state **ps, + struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; struct futex_pi_state *pi_state; @@ -1276,7 +1307,19 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, int ret = handle_exit_race(uaddr, uval, p); raw_spin_unlock_irq(&p->pi_lock); - put_task_struct(p); + /* + * If the owner task is between FUTEX_STATE_EXITING and + * FUTEX_STATE_DEAD then store the task pointer and keep + * the reference on the task struct. The calling code will + * drop all locks, wait for the task to reach + * FUTEX_STATE_DEAD and then drop the refcount. This is + * required to prevent a live lock when the current task + * preempted the exiting task between the two states. + */ + if (ret == -EBUSY) + *exiting = p; + else + put_task_struct(p); return ret; } @@ -1315,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, static int lookup_pi_state(u32 __user *uaddr, u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + union futex_key *key, struct futex_pi_state **ps, + struct task_struct **exiting) { struct futex_q *top_waiter = futex_top_waiter(hb, key); @@ -1330,7 +1374,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, * We are the first waiter - try to look up the owner based on * @uval and attach to it. */ - return attach_to_pi_owner(uaddr, uval, key, ps); + return attach_to_pi_owner(uaddr, uval, key, ps, exiting); } static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) @@ -1358,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * lookup * @task: the task to perform the atomic lock work for. This will * be "current" except in the case of requeue pi. + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Return: @@ -1366,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * - <0 - error * * The hb->lock and futex_key refs shall be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. 
*/ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task, int set_waiters) + struct task_struct *task, + struct task_struct **exiting, + int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); struct futex_q *top_waiter; @@ -1440,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * attach to the owner. If that fails, no harm done, we only * set the FUTEX_WAITERS bit in the user space variable. */ - return attach_to_pi_owner(uaddr, newval, key, ps); + return attach_to_pi_owner(uaddr, newval, key, ps, exiting); } /** @@ -1858,6 +1910,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * @key1: the from futex key * @key2: the to futex key * @ps: address to store the pi_state pointer + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. @@ -1865,16 +1919,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + * * Return: * - 0 - failed to acquire the lock atomically; * - >0 - acquired the lock, return value is vpid of the top_waiter * - <0 - error */ -static int futex_proxy_trylock_atomic(u32 __user *pifutex, - struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, - union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps, int set_waiters) +static int +futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key1, + union futex_key *key2, struct futex_pi_state **ps, + struct task_struct **exiting, int set_waiters) { struct futex_q *top_waiter = NULL; u32 curval; @@ -1911,7 +1969,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, */ vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - set_waiters); + exiting, set_waiters); if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); return vpid; @@ -2040,6 +2098,8 @@ retry_private: } if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + struct task_struct *exiting = NULL; + /* * Attempt to acquire uaddr2 and wake the top waiter. If we * intend to requeue waiters, force setting the FUTEX_WAITERS @@ -2047,7 +2107,8 @@ retry_private: * faults rather in the requeue loop below. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, nr_requeue); + &key2, &pi_state, + &exiting, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -2074,7 +2135,8 @@ retry_private: * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, + &pi_state, &exiting); } switch (ret) { @@ -2104,6 +2166,12 @@ retry_private: hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); + /* + * Handle the case where the owner is in the middle of + * exiting. 
Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: @@ -2810,6 +2878,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; + struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2831,7 +2900,8 @@ retry: retry_private: hb = queue_lock(&q); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); if (unlikely(ret)) { /* * Atomic work succeeded and we got the lock, @@ -2854,6 +2924,12 @@ retry_private: */ queue_unlock(hb); put_futex_key(&q.key); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: -- cgit From 05ff1ba412fd6bd48d56dd4c0baff626533728cc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Nov 2019 10:33:34 +0100 Subject: PM: QoS: Invalidate frequency QoS requests after removal Switching cpufreq drivers (or switching operation modes of the intel_pstate driver from "active" to "passive" and vice versa) does not work on some x86 systems with ACPI after commit 3000ce3c52f8 ("cpufreq: Use per-policy frequency QoS"), because the ACPI _PPC and thermal code uses the same frequency QoS request object for a given CPU every time a cpufreq driver is registered and freq_qos_remove_request() does not invalidate the request after removing it from its QoS list, so freq_qos_add_request() complains and fails when that request is passed to it again. Fix the issue by modifying freq_qos_remove_request() to clear the qos and type fields of the frequency request pointed to by its argument after removing it from its QoS list so as to invalidate it. Fixes: 3000ce3c52f8 ("cpufreq: Use per-policy frequency QoS") Reported-and-tested-by: Doug Smythies Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/power/qos.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 04e83fdfbe80..a45cba7df0ae 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -814,6 +814,8 @@ EXPORT_SYMBOL_GPL(freq_qos_update_request); */ int freq_qos_remove_request(struct freq_qos_request *req) { + int ret; + if (!req) return -EINVAL; @@ -821,7 +823,11 @@ int freq_qos_remove_request(struct freq_qos_request *req) "%s() called for unknown object\n", __func__)) return -EINVAL; - return freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + req->qos = NULL; + req->type = 0; + + return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_request); -- cgit From c55b51a06b01d67a99457bb82a8c31081c7faa23 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:12 +0100 Subject: cpuidle: Allow idle injection to apply exit latency limit In some cases it may be useful to specify an exit latency limit for the idle state to be used during CPU idle time injection. 
Instead of duplicating the information in struct cpuidle_device or propagating the latency limit in the call stack, replace the use_deepest_state field with forced_latency_limit_ns to represent that limit, so that the deepest idle state with exit latency within that limit is forced (i.e. no governors) when it is set. A zero exit latency limit for forced idle means to use governors in the usual way (analogous to use_deepest_state equal to "false" before this change). Additionally, add play_idle_precise() taking two arguments, the duration of forced idle and the idle state exit latency limit, both in nanoseconds, and redefine play_idle() as a wrapper around that new function. This change is preparatory, no functional impact is expected. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Subject, changelog, cpuidle_use_deepest_state() kerneldoc, whitespace ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1aa260702b38..cd05ffa0abfe 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -165,7 +165,7 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_us) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -323,29 +323,29 @@ void play_idle(unsigned long duration_us) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_us); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { -- cgit From 5aa9ba6312e36c18626e73506b92d1513d815435 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:13 +0100 Subject: cpuidle: Pass exit latency limit to cpuidle_use_deepest_state() Modify cpuidle_use_deepest_state() to take an additional exit latency limit argument to be passed to find_deepest_idle_state() and make cpuidle_idle_call() pass dev->forced_idle_latency_limit_ns to it for forced idle. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Rebase and rearrange code, subject & changelog ] Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/idle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cd05ffa0abfe..fc9604ddd802 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -166,6 +166,8 @@ static void cpuidle_idle_call(void) */ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; -- cgit From 9e77716a75bc6cf54965e5ec069ba7c02b32251c Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Wed, 20 Nov 2019 01:33:20 +0100 Subject: fork: fix pidfd_poll()'s return type pidfd_poll() is defined as returning 'unsigned int' but the .poll method is declared as returning '__poll_t', a bitwise type. Fix this by using the proper return type and using the EPOLL constants instead of the POLL ones, as required for __poll_t. Fixes: b53b0b9d9a61 ("pidfd: add polling support") Cc: Joel Fernandes (Google) Cc: stable@vger.kernel.org # 5.3 Signed-off-by: Luc Van Oostenryck Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191120003320.31138-1-luc.vanoostenryck@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 55af6931c6ec..13b38794efb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1708,11 +1708,11 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) /* * Poll support for process exit notification. */ -static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct task_struct *task; struct pid *pid = file->private_data; - int poll_flags = 0; + __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); @@ -1724,7 +1724,7 @@ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) * group, then poll(2) should block, similar to the wait(2) family. */ if (!task || (task->exit_state && thread_group_empty(task))) - poll_flags = POLLIN | POLLRDNORM; + poll_flags = EPOLLIN | EPOLLRDNORM; rcu_read_unlock(); return poll_flags; -- cgit From 56e35f9c5b87ec1ae93e483284e189c84388de16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Nov 2019 18:03:11 +0100 Subject: dma-mapping: drop the dev argument to arch_sync_dma_for_* These are pure cache maintainance routines, so drop the unused struct device argument. 
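As a rough sketch of the resulting architecture hook, assuming only the signature introduced by this patch (the cache helper below is a made-up placeholder, not a real kernel function):

    #include <linux/dma-noncoherent.h>

    void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
                                  enum dma_data_direction dir)
    {
            /* cache maintenance is keyed purely on the physical range now */
            my_arch_cache_writeback(paddr, size, dir);  /* hypothetical helper */
    }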
Signed-off-by: Christoph Hellwig Suggested-by: Daniel Vetter --- kernel/dma/direct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 22a2e0833862..077876ae5c74 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -268,7 +268,7 @@ void dma_direct_sync_single_for_device(struct device *dev, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); } EXPORT_SYMBOL(dma_direct_sync_single_for_device); @@ -286,7 +286,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, sg->length, + arch_sync_dma_for_device(paddr, sg->length, dir); } } @@ -302,8 +302,8 @@ void dma_direct_sync_single_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(dev, paddr, size, dir); - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); } if (unlikely(is_swiotlb_buffer(paddr))) @@ -321,7 +321,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, @@ -329,7 +329,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, } if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu_all(); } EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); @@ -380,7 +380,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_addr; } EXPORT_SYMBOL(dma_direct_map_page); -- cgit From 50f579a2399dee0ad1c86ea8159ab8657b74f95b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Nov 2019 09:18:19 +0300 Subject: dma-debug: clean up put_hash_bucket() The put_hash_bucket() is a bit cleaner if it takes an unsigned long directly instead of a pointer to unsigned long. 
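Caller-side, the get/put pairing then follows this minimal sketch (mirroring add_dma_entry() below): flags is still filled in by reference on the get side, but handed back by value on the put side.

    struct hash_bucket *bucket;
    unsigned long flags;

    bucket = get_hash_bucket(entry, &flags);  /* takes the bucket lock, saves flags */
    hash_bucket_add(bucket, entry);
    put_hash_bucket(bucket, flags);           /* pass the value, not a pointer */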
Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 004496654aaa..64972aa9bfc3 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -255,12 +255,10 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, * Give up exclusive access to the hash bucket */ static void put_hash_bucket(struct hash_bucket *bucket, - unsigned long *flags) + unsigned long flags) __releases(&bucket->lock) { - unsigned long __flags = *flags; - - spin_unlock_irqrestore(&bucket->lock, __flags); + spin_unlock_irqrestore(&bucket->lock, flags); } static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) @@ -359,7 +357,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, /* * Nothing found, go back a hash bucket */ - put_hash_bucket(*bucket, flags); + put_hash_bucket(*bucket, *flags); range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); @@ -609,7 +607,7 @@ static void add_dma_entry(struct dma_debug_entry *entry) bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { @@ -1002,7 +1000,7 @@ static void check_unmap(struct dma_debug_entry *ref) if (!entry) { /* must drop lock before calling dma_mapping_error */ - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); if (dma_mapping_error(ref->dev, ref->dev_addr)) { err_printk(ref->dev, NULL, @@ -1084,7 +1082,7 @@ static void check_unmap(struct dma_debug_entry *ref) hash_bucket_del(entry); dma_entry_free(entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_for_stack(struct device *dev, @@ -1204,7 +1202,7 @@ static void check_sync(struct device *dev, } out: - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_sg_segment(struct device *dev, struct scatterlist *sg) @@ -1319,7 +1317,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } } - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } EXPORT_SYMBOL(debug_dma_mapping_error); @@ -1392,7 +1390,7 @@ static int get_nr_mapped_entries(struct device *dev, if (entry) mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); return mapped_ents; } -- cgit From 4268ac6ae5870af10a7417b22990d615f72f77e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:35:36 +0100 Subject: dma-direct: don't check swiotlb=force in dma_direct_map_resource When mapping resources we can't just use swiotlb ram for bounce buffering. Switch to a direct dma_capable check instead. 
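A hedged driver-side sketch of the affected path, with phys and len assumed to describe a peer device's MMIO window: such a mapping can never be bounced through swiotlb, so only the device's own addressing capability is meaningful to check.

    #include <linux/dma-mapping.h>

    dma_addr_t dma_addr;

    dma_addr = dma_map_resource(dev, phys, len, DMA_BIDIRECTIONAL, 0);
    if (dma_mapping_error(dev, dma_addr))
            return -EIO;
    /* ... program the peer-to-peer transfer using dma_addr ... */
    dma_unmap_resource(dev, dma_addr, len, DMA_BIDIRECTIONAL, 0);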
Fixes: cfced786969c ("dma-mapping: remove the default map_resource implementation") Reported-by: Robin Murphy Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 077876ae5c74..a479bd2d1e8b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } -- cgit From 68a33b1794665ba8a1d1ef1d3bfcc7c587d380a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:38:58 +0100 Subject: dma-direct: exclude dma_direct_map_resource from the min_low_pfn check The valid memory address check in dma_capable only makes sense when mapping normal memory, not when using dma_map_resource to map a device resource. Add a new boolean argument to dma_capable to exclude that check for the dma_map_resource case. Fixes: b12d66278dd6 ("dma-direct: check for overflows on 32 bit DMA addresses") Reported-by: Marek Szyprowski Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 4 ++-- kernel/dma/swiotlb.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a479bd2d1e8b..40f1f0aac4b1 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -363,7 +363,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size); + dma_capable(dev, dma_addr, size, true); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_capable(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size, false))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 673a2cdb2656..9280d6f8271e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -678,7 +678,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Ensure that the address returned is DMA'ble */ *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size))) { + if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; -- cgit From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 22:38:16 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. 
The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 [...] ---- time->Wed Nov 20 12:45:51 2019 type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier" type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD ---- time->Wed Nov 20 12:45:51 2019 type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD ---- [...] Signed-off-by: Daniel Borkmann Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org --- kernel/auditsc.c | 2 +- kernel/bpf/syscall.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4effe01ebbe2..9bf1045fedfa 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2545,7 +2545,7 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } -static void audit_log_task(struct audit_buffer *ab) +void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bac3becf9f90..17f4254495f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -1318,6 +1319,34 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +enum bpf_event { + BPF_EVENT_LOAD, + BPF_EVENT_UNLOAD, +}; + +static const char * const bpf_event_audit_str[] = { + [BPF_EVENT_LOAD] = "LOAD", + [BPF_EVENT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, enum bpf_event event) +{ + bool has_task_context = event == BPF_EVENT_LOAD; + struct audit_buffer *ab; + + if (audit_enabled == AUDIT_OFF) + return; + ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + if (has_task_context) + audit_log_task(ab); + audit_log_format(ab, "%sprog-id=%u event=%s", + has_task_context ? 
" " : "", + prog->aux->id, bpf_event_audit_str[event]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1434,6 +1463,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_EVENT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1843,6 +1873,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_EVENT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) -- cgit From 196e8ca74886c433dcfc64a809707074b936aaf5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 23:04:44 +0100 Subject: bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size Given we recently extended the original bpf_map_area_alloc() helper in commit fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY"), we need to apply the same logic as in ff1c08e1f74b ("bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}()"). To avoid conflicts, extend it for bpf-next. Reported-by: Stephen Rothwell Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 17f4254495f2..b51ecb9644d0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -128,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) +static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -143,6 +143,9 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; + if (size >= SIZE_MAX) + return NULL; + /* kmalloc()'ed memory can't be mmap()'ed */ if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, @@ -160,12 +163,12 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) flags, __builtin_return_address(0)); } -void *bpf_map_area_alloc(size_t size, int numa_node) +void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } -void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } @@ -214,7 +217,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) { u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; struct user_struct *user; -- cgit From 7b8474466ed97be458c825f34a85f2c2b84c3f95 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 21 Nov 2019 00:03:03 +0000 Subject: time: Zero the upper 32-bits in __kernel_timespec on 32-bit On compat interfaces, the high order bits of nanoseconds should be zeroed out. 
This is because the application code or the libc do not guarantee zeroing of these. If used without zeroing, kernel might be at risk of using timespec values incorrectly. Originally it was handled correctly, but lost during is_compat_syscall() cleanup. Revert the condition back to check CONFIG_64BIT. Fixes: 98f76206b335 ("compat: Cleanup in_compat_syscall() callers") Reported-by: Ben Hutchings Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191121000303.126523-1-dima@arista.com --- kernel/time/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..83f403e7a15c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -881,7 +881,8 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || + in_compat_syscall())) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit From 5a1c95580f1d89c8a736bb02ecd82a8858388b8a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 21 Nov 2019 03:44:25 +0100 Subject: sched/cputime: Support other fields on kcpustat_field() Provide support for user, nice, guest and guest_nice fields through kcpustat_field(). Whether we account the delta to a nice or not nice field is decided on top of the nice value snapshot taken at the time we call kcpustat_field(). If the nice value of the task has been changed since the last vtime update, we may have inacurrate distribution of the nice VS unnice cputime. However this is considered as a minor issue compared to the proper fix that would involve interrupting the target on nice updates, which is undesired on nohz_full CPUs. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191121024430.19938-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 54 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e0cd20693ef5..27b5406222fc 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -912,11 +912,21 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } while (read_seqcount_retry(&vtime->seqcount, seq)); } +static u64 kcpustat_user_vtime(struct vtime *vtime) +{ + if (vtime->state == VTIME_USER) + return vtime->utime + vtime_delta(vtime); + else if (vtime->state == VTIME_GUEST) + return vtime->gtime + vtime_delta(vtime); + return 0; +} + static int kcpustat_field_vtime(u64 *cpustat, - struct vtime *vtime, + struct task_struct *tsk, enum cpu_usage_stat usage, int cpu, u64 *val) { + struct vtime *vtime = &tsk->vtime; unsigned int seq; int err; @@ -946,9 +956,37 @@ static int kcpustat_field_vtime(u64 *cpustat, *val = cpustat[usage]; - if (vtime->state == VTIME_SYS) - *val += vtime->stime + vtime_delta(vtime); - + /* + * Nice VS unnice cputime accounting may be inaccurate if + * the nice value has changed since the last vtime update. + * But proper fix would involve interrupting target on nice + * updates which is a no go on nohz_full (although the scheduler + * may still interrupt the target if rescheduling is needed...) 
+ */ + switch (usage) { + case CPUTIME_SYSTEM: + if (vtime->state == VTIME_SYS) + *val += vtime->stime + vtime_delta(vtime); + break; + case CPUTIME_USER: + if (task_nice(tsk) <= 0) + *val += kcpustat_user_vtime(vtime); + break; + case CPUTIME_NICE: + if (task_nice(tsk) > 0) + *val += kcpustat_user_vtime(vtime); + break; + case CPUTIME_GUEST: + if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) + *val += vtime->gtime + vtime_delta(vtime); + break; + case CPUTIME_GUEST_NICE: + if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) + *val += vtime->gtime + vtime_delta(vtime); + break; + default: + break; + } } while (read_seqcount_retry(&vtime->seqcount, seq)); return 0; @@ -965,15 +1003,10 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, if (!vtime_accounting_enabled_cpu(cpu)) return cpustat[usage]; - /* Only support sys vtime for now */ - if (usage != CPUTIME_SYSTEM) - return cpustat[usage]; - rq = cpu_rq(cpu); for (;;) { struct task_struct *curr; - struct vtime *vtime; rcu_read_lock(); curr = rcu_dereference(rq->curr); @@ -982,8 +1015,7 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, return cpustat[usage]; } - vtime = &curr->vtime; - err = kcpustat_field_vtime(cpustat, vtime, usage, cpu, &val); + err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); rcu_read_unlock(); if (!err) -- cgit From 74722bb223d0f236303b60c9509ff924a9713780 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 21 Nov 2019 03:44:26 +0100 Subject: sched/vtime: Bring up complete kcpustat accessor Many callsites want to fetch the values of system, user, user_nice, guest or guest_nice kcpustat fields altogether or at least a pair of these. In that case calling kcpustat_field() for each requested field brings unecessary overhead when we could fetch all of them in a row. So provide kcpustat_cpu_fetch() that fetches the whole kcpustat array in a vtime safe way under the same RCU and seqcount block. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191121024430.19938-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 136 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 116 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 27b5406222fc..d43318a489f2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -912,6 +912,30 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } while (read_seqcount_retry(&vtime->seqcount, seq)); } +static int vtime_state_check(struct vtime *vtime, int cpu) +{ + /* + * We raced against a context switch, fetch the + * kcpustat task again. + */ + if (vtime->cpu != cpu && vtime->cpu != -1) + return -EAGAIN; + + /* + * Two possible things here: + * 1) We are seeing the scheduling out task (prev) or any past one. + * 2) We are seeing the scheduling in task (next) but it hasn't + * passed though vtime_task_switch() yet so the pending + * cputime of the prev task may not be flushed yet. + * + * Case 1) is ok but 2) is not. So wait for a safe VTIME state. + */ + if (vtime->state == VTIME_INACTIVE) + return -EAGAIN; + + return 0; +} + static u64 kcpustat_user_vtime(struct vtime *vtime) { if (vtime->state == VTIME_USER) @@ -933,26 +957,9 @@ static int kcpustat_field_vtime(u64 *cpustat, do { seq = read_seqcount_begin(&vtime->seqcount); - /* - * We raced against context switch, fetch the - * kcpustat task again. 
- */ - if (vtime->cpu != cpu && vtime->cpu != -1) - return -EAGAIN; - - /* - * Two possible things here: - * 1) We are seeing the scheduling out task (prev) or any past one. - * 2) We are seeing the scheduling in task (next) but it hasn't - * passed though vtime_task_switch() yet so the pending - * cputime of the prev task may not be flushed yet. - * - * Case 1) is ok but 2) is not. So wait for a safe VTIME state. - */ - if (vtime->state == VTIME_INACTIVE) - return -EAGAIN; - - err = 0; + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; *val = cpustat[usage]; @@ -1025,4 +1032,93 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, } } EXPORT_SYMBOL_GPL(kcpustat_field); + +static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, + const struct kernel_cpustat *src, + struct task_struct *tsk, int cpu) +{ + struct vtime *vtime = &tsk->vtime; + unsigned int seq; + int err; + + do { + u64 *cpustat; + u64 delta; + + seq = read_seqcount_begin(&vtime->seqcount); + + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; + + *dst = *src; + cpustat = dst->cpustat; + + /* Task is sleeping, dead or idle, nothing to add */ + if (vtime->state < VTIME_SYS) + continue; + + delta = vtime_delta(vtime); + + /* + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. + */ + if (vtime->state == VTIME_SYS) { + cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; + } else if (vtime->state == VTIME_USER) { + if (task_nice(tsk) > 0) + cpustat[CPUTIME_NICE] += vtime->utime + delta; + else + cpustat[CPUTIME_USER] += vtime->utime + delta; + } else { + WARN_ON_ONCE(vtime->state != VTIME_GUEST); + if (task_nice(tsk) > 0) { + cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; + cpustat[CPUTIME_NICE] += vtime->gtime + delta; + } else { + cpustat[CPUTIME_GUEST] += vtime->gtime + delta; + cpustat[CPUTIME_USER] += vtime->gtime + delta; + } + } + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return err; +} + +void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) +{ + const struct kernel_cpustat *src = &kcpustat_cpu(cpu); + struct rq *rq; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) { + *dst = *src; + return; + } + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + *dst = *src; + return; + } + + err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); + rcu_read_unlock(); + + if (!err) + return; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit From c4b75479741c9c3a4f0abff5baa5013d27640ac1 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 20 Nov 2019 19:06:40 +0200 Subject: perf/core: Make the mlock accounting simple again Commit: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") does a lot of things to the mlock accounting arithmetics, while the only thing that actually needed to happen is subtracting the part that is charged to the mm from the part that is charged to the user, so that the former isn't charged twice. 
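A hedged sketch of the logic that remains after the removal, with names as in perf_mmap(): everything above the user lock limit is charged to pinned_vm, and only the remainder stays on the user's locked_vm.

    user_locked = atomic_long_read(&user->locked_vm) + user_extra;

    if (user_locked > user_lock_limit) {
            /* charge the overflow to pinned_vm ... */
            extra = user_locked - user_lock_limit;
            /* ... and only the rest to the user's locked_vm */
            user_extra -= extra;
    }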
Signed-off-by: Alexander Shishkin Acked-by: Song Liu Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Yauheni Kaliuta Cc: songliubraving@fb.com Link: https://lkml.kernel.org/r/20191120170640.54123-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f66a4833ded..7e8980d0b997 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5825,13 +5825,7 @@ accounting: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - if (user_locked <= user_lock_limit) { - /* charge all to locked_vm */ - } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) { - /* charge all to pinned_vm */ - extra = user_extra; - user_extra = 0; - } else { + if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm -- cgit From a7ba70f1787f977f970cd116076c6fce4b9e01cc Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 21 Nov 2019 10:26:44 +0100 Subject: dma-mapping: treat dev->bus_dma_mask as a DMA limit Using a mask to represent bus DMA constraints has a set of limitations. The biggest one being it can only hold a power of two (minus one). The DMA mapping code is already aware of this and treats dev->bus_dma_mask as a limit. This quirk is already used by some architectures although still rare. With the introduction of the Raspberry Pi 4 we've found a new contender for the use of bus DMA limits, as its PCIe bus can only address the lower 3GB of memory (of a total of 4GB). This is impossible to represent with a mask. To make things worse the device-tree code rounds non power of two bus DMA limits to the next power of two, which is unacceptable in this case. In the light of this, rename dev->bus_dma_mask to dev->bus_dma_limit all over the tree and treat it as such. Note that dev->bus_dma_limit should contain the higher accessible DMA address. 
Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 267b23a13b69..6af7ae83c4ad 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -27,10 +27,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); + "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); } WARN_ON_ONCE(1); } @@ -57,15 +57,14 @@ u64 dma_direct_get_required_mask(struct device *dev) } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask) + u64 *phys_limit) { - if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) - dma_mask = dev->bus_dma_mask; + u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); if (force_dma_unencrypted(dev)) - *phys_mask = __dma_to_phys(dev, dma_mask); + *phys_limit = __dma_to_phys(dev, dma_limit); else - *phys_mask = dma_to_phys(dev, dma_mask); + *phys_limit = dma_to_phys(dev, dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -75,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) + if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; - if (*phys_mask <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } @@ -85,7 +84,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, @@ -94,7 +93,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; - u64 phys_mask; + u64 phys_limit; if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -102,7 +101,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -116,7 +115,7 @@ again: page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - phys_mask < DMA_BIT_MASK(64) && + phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; -- cgit From 0e4a459f56c32d3e52ae69a4b447db2f48a65f44 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Wed, 20 Nov 2019 19:43:50 +0900 Subject: tracing: Remove unnecessary DEBUG_FS dependency Tracing replaced debugfs with tracefs. 
Signed-off-by: Kusanagi Kouichi Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20191120104350753.EWCT.12796.ppp.dion.ne.jp@dmta0009.auone-net.jp Signed-off-by: Greg Kroah-Hartman --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..382628b9b759 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -106,7 +106,6 @@ config PREEMPTIRQ_TRACEPOINTS config TRACING bool - select DEBUG_FS select RING_BUFFER select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS -- cgit From a82a4804b4ee3636c8988fea14d44f70f4de45f1 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 16 Nov 2019 10:05:55 -0500 Subject: ring-buffer: Fix typos in function ring_buffer_producer Fix spelling and other typos Link: http://lkml.kernel.org/r/1573916755-32478-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 09b0b49f346e..32149e46551c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -269,10 +269,10 @@ static void ring_buffer_producer(void) #ifndef CONFIG_PREEMPTION /* - * If we are a non preempt kernel, the 10 second run will + * If we are a non preempt kernel, the 10 seconds run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a - * rescedule. + * reschedule. * * Do a cond resched at the same frequency we would wake up * the reader. -- cgit From fc809bc5ceaa665497384064ab2d76713c774bad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:38:07 +0800 Subject: tracing: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Link: http://lkml.kernel.org/r/20191120133807.12741-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b872716bb2a0..f67620499faa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,7 +79,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -311,7 +311,7 @@ config TRACER_SNAPSHOT cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP - bool "Allow snapshot to swap per CPU" + bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT select RING_BUFFER_ALLOW_SWAP help @@ -683,7 +683,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. config TRACEPOINT_BENCHMARK - bool "Add tracepoint that benchmarks tracepoints" + bool "Add tracepoint that benchmarks tracepoints" help This option creates the tracepoint "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that @@ -732,7 +732,7 @@ config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER help - Run a simple self test on the ring buffer on boot up. Late in the + Run a simple self test on the ring buffer on boot up. Late in the kernel boot sequence, the test will start that kicks off a thread per cpu. 
Each thread will write various size events into the ring buffer. Another thread is created to send IPIs -- cgit From 28879787147358e8ffcae397f11748de3dd26577 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:38 -0800 Subject: tracing: Adding new functions for kernel access to Ftrace instances Adding 2 new functions - 1) struct trace_array *trace_array_get_by_name(const char *name); Return pointer to a trace array with given name. If it does not exist, create and return pointer to the new trace array. 2) int trace_array_set_clr_event(struct trace_array *tr, const char *system ,const char *event, bool enable); Enable/Disable events to this trace array. Additionally, - To handle reference counters, export trace_array_put() - Due to introduction of the above 2 new functions, we no longer need to export - ftrace_set_clr_event & trace_array_create APIs. Link: http://lkml.kernel.org/r/1574276919-11119-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Aruna Ramakrishna Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 96 +++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 - kernel/trace/trace_events.c | 27 ++++++++++++- 3 files changed, 102 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42659ce6ac0c..02a23a6e5e00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -301,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the reference counter for this trace array. + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. 
+ * + */ void trace_array_put(struct trace_array *this_tr) { + if (!this_tr) + return; + mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } +EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { @@ -8437,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -struct trace_array *trace_array_create(const char *name) +static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } - ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) @@ -8499,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name) list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + tr->ref++; + return tr; @@ -8510,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name) kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(trace_array_create); static int instance_mkdir(const char *name) { - return PTR_ERR_OR_ZERO(trace_array_create(name)); + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + +out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; +} + +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; } +EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; - if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + /* Reference counter for a newly created trace array = 1. 
*/ + if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2df8aed6a8f0..ca7fccafbcbb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,7 +345,6 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); -extern void trace_array_put(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2a3ac2365445..6b3a69e9aa6a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -827,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } -EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -852,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set) } EXPORT_SYMBOL_GPL(trace_set_clr_event); +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit From 0e24220821b0e0e330a18bfef29ac6396545d62e Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Fri, 15 Nov 2019 23:44:42 +0000 Subject: tracing: Use xarray for syscall trace events Currently, a lot of memory is wasted for architectures like MIPS when init_ftrace_syscalls() allocates the array for syscalls using kcalloc. This is because syscalls numbers start from 4000, 5000 or 6000 and array elements up to that point are unused. Fix this by using a data structure more suited to storing sparsely populated arrays. The XARRAY data structure, implemented using radix trees, is much more memory efficient for storing the syscalls in question. 
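A minimal standalone sketch of the xarray pattern adopted here, assuming the usual kernel headers and the syscall_metadata type from <trace/syscall.h>: only populated indices allocate nodes, which is what makes sparse syscall number ranges cheap.

    #include <linux/xarray.h>
    #include <trace/syscall.h>

    static DEFINE_XARRAY(example_metadata);

    static int example_store(unsigned long nr, struct syscall_metadata *meta)
    {
            void *ret = xa_store(&example_metadata, nr, meta, GFP_KERNEL);

            return xa_is_err(ret) ? xa_err(ret) : 0;
    }

    static struct syscall_metadata *example_load(unsigned long nr)
    {
            return xa_load(&example_metadata, nr);
    }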
Link: http://lkml.kernel.org/r/20191115234314.21599-1-hnaveed@wavecomp.com Signed-off-by: Hassan Naveed Reviewed-by: Paul Burton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_syscalls.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..16fa218556fa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,6 +7,7 @@ #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include +#include #include #include "trace_output.h" @@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call) extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; +static DEFINE_XARRAY(syscalls_metadata_sparse); static struct syscall_metadata **syscalls_metadata; #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME @@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall) static struct syscall_metadata *syscall_nr_to_meta(int nr) { + if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) + return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; @@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void) struct syscall_metadata *meta; unsigned long addr; int i; - - syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), - GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return; + void *ret; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata = kcalloc(NR_syscalls, + sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } } for (i = 0; i < NR_syscalls; i++) { @@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void) continue; meta->syscall_nr = i; - syscalls_metadata[i] = meta; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata[i] = meta; + } else { + ret = xa_store(&syscalls_metadata_sparse, i, meta, + GFP_KERNEL); + WARN(xa_is_err(ret), + "Syscall memory allocation failed\n"); + } + } } -- cgit From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2019 09:54:58 -0800 Subject: Revert "bpf: Emit audit messages upon successful prog load and unload" This commit reverts commit 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") and its follow up commit 7599a896f2e4 ("audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs close review on linux-audit, tests etc. 
Signed-off-by: Jakub Kicinski --- kernel/auditsc.c | 2 +- kernel/bpf/syscall.c | 31 ------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9bf1045fedfa..4effe01ebbe2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2545,7 +2545,7 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } -void audit_log_task(struct audit_buffer *ab) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b51ecb9644d0..4ae52eb05f41 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -1322,34 +1321,6 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } -enum bpf_event { - BPF_EVENT_LOAD, - BPF_EVENT_UNLOAD, -}; - -static const char * const bpf_event_audit_str[] = { - [BPF_EVENT_LOAD] = "LOAD", - [BPF_EVENT_UNLOAD] = "UNLOAD", -}; - -static void bpf_audit_prog(const struct bpf_prog *prog, enum bpf_event event) -{ - bool has_task_context = event == BPF_EVENT_LOAD; - struct audit_buffer *ab; - - if (audit_enabled == AUDIT_OFF) - return; - ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_BPF); - if (unlikely(!ab)) - return; - if (has_task_context) - audit_log_task(ab); - audit_log_format(ab, "%sprog-id=%u event=%s", - has_task_context ? " " : "", - prog->aux->id, bpf_event_audit_str[event]); - audit_log_end(ab); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1466,7 +1437,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); - bpf_audit_prog(prog, BPF_EVENT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1876,7 +1846,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); - bpf_audit_prog(prog, BPF_EVENT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) -- cgit From 107e899874e95dcddc779142942bf285eba38bc5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Nov 2019 16:22:21 -0400 Subject: mm/hmm: define the pre-processor related parts of hmm.h even if disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the function calls are stubbed out with static inlines that always fail. This is the standard way to write a header for an optional component and makes it easier for drivers that only optionally need HMM_MIRROR. 
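The general shape of that pattern, as a hedged sketch with made-up names (the real hmm.h stubs differ in detail): declarations stay visible unconditionally, and when the option is off the functions collapse into static inlines that always fail, so callers need no #ifdefs of their own.

    struct hmm_example;  /* hypothetical context type */

    #if IS_ENABLED(CONFIG_HMM_MIRROR)
    int hmm_example_register(struct hmm_example *ex);
    #else
    static inline int hmm_example_register(struct hmm_example *ex)
    {
            return -EOPNOTSUPP;  /* feature compiled out */
    }
    #endif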
Link: https://lore.kernel.org/r/20191112202231.3856-5-jgg@ziepe.ca Reviewed-by: Jérôme Glisse Tested-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..ca39cfc404e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include -- cgit From 071cdecec57fb5d5df78e6a12114ad7bccea5b0e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 21 Nov 2019 14:36:12 +0100 Subject: xdp: Fix cleanup on map free for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tetsuo pointed out that it was not only the device unregister hook that was broken for devmap_hash types, it was also cleanup on map free. So better fix this as well. While we're at it, there's no reason to allocate the netdev_map array for DEVMAP_HASH, so skip that and adjust the cost accordingly. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191121133612.430414-1-toke@redhat.com --- kernel/bpf/devmap.c | 74 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3867864cdc2f..3d3d61b5985b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -74,7 +74,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; + struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head __percpu *flush_list; struct list_head list; @@ -101,6 +101,12 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries) return hash; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -120,8 +126,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ - cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += sizeof(struct list_head) * num_possible_cpus(); + cost = (u64) sizeof(struct list_head) * num_possible_cpus(); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -129,6 +134,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; + } else { + cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } /* if map size is larger than memlock limit, reject it */ @@ -143,24 +150,22 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *), - dtab->map.numa_node); - if (!dtab->netdev_map) - goto free_percpu; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if 
(!dtab->dev_index_head) - goto free_map_area; + goto free_percpu; spin_lock_init(&dtab->index_lock); + } else { + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); + if (!dtab->netdev_map) + goto free_percpu; } return 0; -free_map_area: - bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -228,21 +233,40 @@ static void dev_map_free(struct bpf_map *map) cond_resched(); } - for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; - dev = dtab->netdev_map[i]; - if (!dev) - continue; + head = dev_map_index_hash(dtab, i); - free_percpu(dev->bulkq); - dev_put(dev->dev); - kfree(dev); + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + hlist_del_rcu(&dev->index_hlist); + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + } + + kfree(dtab->dev_index_head); + } else { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + + bpf_map_area_free(dtab->netdev_map); } free_percpu(dtab->flush_list); - bpf_map_area_free(dtab->netdev_map); - kfree(dtab->dev_index_head); kfree(dtab); } @@ -263,12 +287,6 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, - int idx) -{ - return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; -} - struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); -- cgit From 581738a681b6faae5725c2555439189ca81c0f1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:50 -0800 Subject: bpf: Provide better register bounds after jmp32 instructions With latest llvm (trunk https://github.com/llvm/llvm-project), test_progs, which has +alu32 enabled, failed for strobemeta.o. The verifier output looks like below with edit to replace large decimal numbers with hex ones. 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,umax_value=0xffffffff00000001) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 197: (67) r6 <<= 32 R6_w=inv(id=0,smax_value=0x7fffffff00000000,umax_value=0xffffffff00000000, var_off=(0x0; 0xffffffff00000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 257: (85) call bpf_probe_read_user_str#114 R1 unbounded memory access, make sure to bounds check any array access into a map The value range for register r6 at insn 198 should be really just 0/1. The umax_value=0xffffffff caused later verification failure. 
After jmp instructions, the current verifier already tried to use just obtained information to get better register range. The current mechanism is for 64bit register only. This patch implemented to tighten the range for 32bit sub-registers after jmp32 instructions. With the patch, we have the below range ranges for the above code sequence: 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,smax_value=0x7fffffff00000001,umax_value=0xffffffff00000001, var_off=(0x0; 0xffffffff00000001)) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0x1)) 197: (67) r6 <<= 32 R6_w=inv(id=0,umax_value=0x100000000,var_off=(0x0; 0x100000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=1,var_off=(0x0; 0x1)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 257: (85) call bpf_probe_read_user_str#114 ... At insn 194, the register R0 has better var_off.mask and smax_value. Especially, the var_off.mask ensures later lshift and rshift maintains proper value range. Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170650.449030-1-yhs@fb.com --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f59f7a19dd0..fc85714428c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1007,6 +1007,17 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } +static void __reg_bound_offset32(struct bpf_reg_state *reg) +{ + u64 mask = 0xffffFFFF; + struct tnum range = tnum_range(reg->umin_value & mask, + reg->umax_value & mask); + struct tnum lo32 = tnum_cast(reg->var_off, 4); + struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + + reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5589,6 +5600,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5698,6 +5713,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. 
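To make the failing pattern concrete, here is a hedged BPF C sketch of the kind of code that benefits. It is not the strobemeta source; the map layout, section name and header location are assumptions, and whether the 32-bit compare is actually emitted as a jmp32 depends on building with +alu32 (e.g. -mcpu=v3).

  /* Illustrative program only. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>    /* header path as in current libbpf */

  struct val { char payload[16]; };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, struct val);
  } vals SEC(".maps");

  SEC("raw_tracepoint/sys_enter")
  int probe(void *ctx)
  {
          char buf[16];
          __u32 key = 0, len;
          struct val *v;

          /* The return value is used as a 32-bit quantity, so the compare
           * below can be emitted as "if w0 > 0x1 goto ..." under +alu32.
           * NULL stands in for a real user pointer.
           */
          len = bpf_probe_read_user_str(buf, sizeof(buf), (void *)0);
          if (len > 1)
                  return 0;

          v = bpf_map_lookup_elem(&vals, &key);
          if (!v)
                  return 0;
          /* Before this patch, the 64-bit bounds of len could still be
           * [0, 0xffffffff] at this point and the store was rejected as
           * potentially out of bounds; with the 32-bit refinement the
           * verifier knows len is 0 or 1.
           */
          v->payload[len] = 1;
          return 0;
  }

  char _license[] SEC("license") = "GPL";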
-- cgit From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:55 +0100 Subject: bpf: Move bpf_free_used_maps into sleepable section We later on are going to need a sleepable context as opposed to plain RCU callback in order to untrack programs we need to poke at runtime and tracking as well as image update is performed under mutex. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net --- kernel/bpf/core.c | 23 +++++++++++++++++++++++ kernel/bpf/syscall.c | 20 -------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5945c3aaa8e..0e825c164f1a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2003,12 +2003,35 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } +static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + bpf_free_cgroup_storage(aux); + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + kfree(aux->used_maps); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); + bpf_free_used_maps(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ae52eb05f41..373778da8489 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1302,25 +1302,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } -/* drop refcnt on maps used by eBPF program and free auxilary data */ -static void free_used_maps(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); - } - - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); - - kfree(aux->used_maps); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1415,7 +1396,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); -- cgit From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:56 +0100 Subject: bpf: Move owner type, jited info into array auxiliary data We're going to extend this with further information which is only relevant for prog array at this point. Given this info is not used in critical path, move it into its own structure such that the main array map structure can be kept on diet. 
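The matching include/linux/bpf.h change falls outside this kernel/-limited log; roughly (fields abridged, comments mine, not a verbatim copy of the header) the new layout is:

  /* Rough sketch of the auxiliary-data split. */
  struct bpf_array_aux {
          /* "Ownership" of the prog array: recorded by the first program
           * that uses the map, so all callers and callees agree on prog
           * type and JIT state.
           */
          enum bpf_prog_type type;
          bool jited;
  };

  struct bpf_array {
          struct bpf_map map;             /* hot, touched on every lookup */
          u32 elem_size;
          u32 index_mask;
          struct bpf_array_aux *aux;      /* cold bookkeeping, prog arrays only */
          /* ... element storage ... */
  };

The payoff shows up in bpf_prog_array_compatible() in the diff below, which now dereferences array->aux instead of keeping the owner fields inline in the hot structure.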
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net --- kernel/bpf/arraymap.c | 32 ++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 11 +++++------ kernel/bpf/map_in_map.c | 5 ++--- kernel/bpf/syscall.c | 16 ++++++---------- 4 files changed, 43 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633c8c701ff6..57da950ee55b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -671,10 +671,38 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0e825c164f1a..07af9c1d9cf1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1691,18 +1691,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array, if (fp->kprobe_override) return false; - if (!array->owner_prog_type) { + if (!array->aux->type) { /* There's no owner yet where we could check for * compatibility. */ - array->owner_prog_type = fp->type; - array->owner_jited = fp->jited; - + array->aux->type = fp->type; + array->aux->jited = fp->jited; return true; } - return array->owner_prog_type == fp->type && - array->owner_jited == fp->jited; + return array->aux->type == fp->type && + array->aux->jited == fp->jited; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 4cbe987be35b..5e9366b33f0f 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->owner_prog_type and owner_jited - * is a runtime binding. Doing static check alone - * in the verifier is not enough. + /* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. 
*/ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 373778da8489..b904d56ec686 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -389,13 +389,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; const struct bpf_array *array; - u32 owner_prog_type = 0; - u32 owner_jited = 0; + u32 type = 0, jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - owner_prog_type = array->owner_prog_type; - owner_jited = array->owner_jited; + type = array->aux->type; + jited = array->aux->jited; } seq_printf(m, @@ -415,12 +414,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); - - if (owner_prog_type) { - seq_printf(m, "owner_prog_type:\t%u\n", - owner_prog_type); - seq_printf(m, "owner_jited:\t%u\n", - owner_jited); + if (type) { + seq_printf(m, "owner_prog_type:\t%u\n", type); + seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif -- cgit From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:57 +0100 Subject: bpf: Add initial poke descriptor table for jit images Add initial poke table data structures and management to the BPF prog that can later be used by JITs. Also add an instance of poke specific data for tail call maps; plan for later work is to extend this also for BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net --- kernel/bpf/core.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 07af9c1d9cf1..608b7085e0c9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { free_percpu(fp->aux->stats); + kfree(fp->aux->poke_tab); kfree(fp->aux); } vfree(fp); @@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + static const u32 poke_tab_max = 1024; + u32 slot = prog->aux->size_poke_tab; + u32 size = slot + 1; + + if (size > poke_tab_max) + return -ENOSPC; + if (poke->ip || poke->ip_stable || poke->adj_off) + return -EINVAL; + + switch (poke->reason) { + case BPF_POKE_REASON_TAIL_CALL: + if (!poke->tail_call.map) + return -EINVAL; + break; + default: + return -EINVAL; + } + + tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + if (!tab) + return -ENOMEM; + + memcpy(&tab[slot], poke, sizeof(*poke)); + prog->aux->size_poke_tab = size; + prog->aux->poke_tab = tab; + + return slot; +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, -- cgit From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:58 +0100 Subject: bpf: Add poke dependency tracking for prog array maps This work adds program tracking to prog array maps. 
This is needed such that upon prog array updates/deletions we can fix up all programs which make use of this tail call map. We add ops->map_poke_{un,}track() helpers to maps to maintain the list of programs and ops->map_poke_run() for triggering the actual update. bpf_array_aux is extended to contain the list head and poke_mutex in order to serialize program patching during updates/deletions. bpf_free_used_maps() will untrack the program shortly before dropping the reference to the map. For clearing out the prog array once all urefs are dropped we need to use schedule_work() to have a sleepable context. The prog_array_map_poke_run() is triggered during updates/deletions and walks the maintained prog list. It checks in their poke_tabs whether the map and key is matching and runs the actual bpf_arch_text_poke() for patching in the nop or new jmp location. Depending on the type of update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net --- kernel/bpf/arraymap.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 9 ++- kernel/bpf/syscall.c | 20 ++++-- 3 files changed, 200 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 57da950ee55b..58bdf5fd24cc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). 
+ */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + enum bpf_text_poke_type type; + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + if (!old && new) + type = BPF_MOD_NOP_TO_JUMP; + else if (old && !new) + type = BPF_MOD_JUMP_TO_NOP; + else if (old && new) + type = BPF_MOD_JUMP_TO_JUMP; + else + return; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, type, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? 
(u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; @@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) if (!aux) return ERR_PTR(-ENOMEM); + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); @@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) } container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + return map; } static void prog_array_map_free(struct bpf_map *map) { + struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } kfree(aux); fd_array_map_free(map); } @@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - .map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 608b7085e0c9..49e32acad7d8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) static void bpf_free_used_maps(struct bpf_prog_aux *aux) { + struct bpf_map *map; int i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); + for (i = 0; i < aux->used_map_cnt; i++) { + map = aux->used_maps[i]; + if (map->ops->map_poke_untrack) + map->ops->map_poke_untrack(map, aux); + bpf_map_put(map); + } kfree(aux->used_maps); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b904d56ec686..e3461ec59570 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,12 +25,13 @@ #include #include -#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) -#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || 
IS_FD_HASH(map)) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ + IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) @@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map)) { + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); @@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + goto out; } /* must increment bpf_prog_active to avoid kprobe+bpf triggering from @@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = map->ops->map_delete_elem(map, key); + goto out; } preempt_disable(); -- cgit From d2e4c1e6c2947269346054ac8937ccfe9e0bcc6b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:59 +0100 Subject: bpf: Constant map key tracking for prog array pokes Add tracking of constant keys into tail call maps. The signature of bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3 is a index key. The direct call approach for tail calls can be enabled if the verifier asserted that for all branches leading to the tail call helper invocation, the map pointer and index key were both constant and the same. Tracking of map pointers we already do from prior work via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/ delete calls on maps"). Given the tail call map index key is not on stack but directly in the register, we can add similar tracking approach and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab with the relevant information for the JITing phase. We internally reuse insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order to point into the prog's poke_tab, and keep insn->imm as 0 as indicator that current indirect tail call emission must be used. Note that publishing to the tracker must happen at the end of fixup_bpf_calls() since adding elements to the poke_tab reallocates its memory, so we need to wait until its in final state. Future work can generalize and add similar approach to optimize plain array map lookups. Difference there is that we need to look into the key value that sits on stack. For clarity in bpf_insn_aux_data, map_state has been renamed into map_ptr_state, so we get map_{ptr,key}_state as trackers. 
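A hedged fragment (hypothetical map and packet-field names, not taken from this patch) of what the verifier can now tell apart:

  /* jmp_table is assumed to be a BPF_MAP_TYPE_PROG_ARRAY. */

  /* Constant and identical on every path reaching the call: the key is
   * recorded, and fixup_bpf_calls() can attach a poke descriptor so the
   * JIT may later patch in a direct jump.
   */
  bpf_tail_call(ctx, &jmp_table, 2);

  /* Run-time dependent (or differing between branches): the key state is
   * poisoned and this call keeps the indirect tail call path.
   */
  bpf_tail_call(ctx, &jmp_table, pkt->proto & 0x7);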
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e8db37f6b2ae60402fa40216c96738ee9b316c32.1574452833.git.daniel@iogearbox.net --- kernel/bpf/verifier.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 111 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc85714428c7..a0482e1c4a77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 +#define BPF_MAP_KEY_POISON (1ULL << 63) +#define BPF_MAP_KEY_SEEN (1ULL << 62) + #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ POISON_POINTER_DELTA)) @@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem { static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; + return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, @@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, { BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} + +static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & BPF_MAP_KEY_POISON; +} + +static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) +{ + return !(aux->map_key_state & BPF_MAP_KEY_SEEN); +} + +static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) +{ + return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); +} + +static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) +{ + bool poisoned = bpf_map_key_poisoned(aux); + + aux->map_key_state = state | BPF_MAP_KEY_SEEN | + (poisoned ? 
BPF_MAP_KEY_POISON : 0ULL); } struct bpf_call_arg_meta { @@ -4090,15 +4116,49 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_state)) + if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); - else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, meta->map_ptr->unpriv_array); return 0; } +static int +record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_reg_state *regs = cur_regs(env), *reg; + struct bpf_map *map = meta->map_ptr; + struct tnum range; + u64 val; + + if (func_id != BPF_FUNC_tail_call) + return 0; + if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + range = tnum_range(0, map->max_entries - 1); + reg = ®s[BPF_REG_3]; + + if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + bpf_map_key_store(aux, BPF_MAP_KEY_POISON); + return 0; + } + + val = reg->var_off.value; + if (bpf_map_key_unseen(aux)) + bpf_map_key_store(aux, val); + else if (!bpf_map_key_poisoned(aux) && + bpf_map_key_immediate(aux) != val) + bpf_map_key_store(aux, BPF_MAP_KEY_POISON); + return 0; +} + static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); @@ -4173,6 +4233,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_key(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. 
*/ @@ -9065,6 +9129,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) static int fixup_bpf_calls(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + bool expect_blinding = bpf_jit_blinding_enabled(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -9073,7 +9138,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; - int i, cnt, delta = 0; + int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || @@ -9217,6 +9282,26 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; + if (prog->jit_requested && !expect_blinding && + !bpf_map_key_poisoned(aux) && + !bpf_map_ptr_poisoned(aux) && + !bpf_map_ptr_unpriv(aux)) { + struct bpf_jit_poke_descriptor desc = { + .reason = BPF_POKE_REASON_TAIL_CALL, + .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.key = bpf_map_key_immediate(aux), + }; + + ret = bpf_jit_add_poke_descriptor(prog, &desc); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + return ret; + } + + insn->imm = ret + 1; + continue; + } + if (!bpf_map_ptr_unpriv(aux)) continue; @@ -9231,7 +9316,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_state); + map_ptr = BPF_MAP_PTR(aux->map_ptr_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -9265,7 +9350,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_state); + map_ptr = BPF_MAP_PTR(aux->map_ptr_state); ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { @@ -9345,6 +9430,23 @@ patch_call_imm: insn->imm = fn->func - __bpf_call_base; } + /* Since poke tab is now finalized, publish aux to tracker. */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + if (!map_ptr->ops->map_poke_track || + !map_ptr->ops->map_poke_untrack || + !map_ptr->ops->map_poke_run) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + return ret; + } + } + return 0; } -- cgit From b553a6ec570044fc1ae300c6fb24f9ce204c5894 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 24 Nov 2019 01:39:42 +0100 Subject: bpf: Simplify __bpf_arch_text_poke poke type handling Given that we have BPF_MOD_NOP_TO_{CALL,JUMP}, BPF_MOD_{CALL,JUMP}_TO_NOP and BPF_MOD_{CALL,JUMP}_TO_{CALL,JUMP} poke types and that we also pass in old_addr as well as new_addr, it's a bit redundant and unnecessarily complicates __bpf_arch_text_poke() itself since we can derive the same from the *_addr that were passed in. Hence simplify and use BPF_MOD_{CALL,JUMP} as types which also allows to clean up call-sites. In addition to that, __bpf_arch_text_poke() currently verifies that text matches expected old_insn before we invoke text_poke_bp(). Also add a check on new_insn and skip rewrite if it already matches. 
Reason why this is rather useful is that it avoids making any special casing in prog_array_map_poke_run() when old and new prog were NULL and has the benefit that also for this case we perform a check on text whether it really matches our expectations. Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/fcb00a2b0b288d6c73de4ef58116a821c8fe8f2f.1574555798.git.daniel@iogearbox.net --- kernel/bpf/arraymap.c | 12 +----------- kernel/bpf/trampoline.c | 8 ++++---- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 58bdf5fd24cc..f0d19bbb9211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -746,19 +746,9 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - enum bpf_text_poke_type type; struct prog_poke_elem *elem; struct bpf_array_aux *aux; - if (!old && new) - type = BPF_MOD_NOP_TO_JUMP; - else if (old && !new) - type = BPF_MOD_JUMP_TO_NOP; - else if (old && new) - type = BPF_MOD_JUMP_TO_JUMP; - else - return; - aux = container_of(map, struct bpf_array, map)->aux; WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); @@ -806,7 +796,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, type, + ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, old ? (u8 *)old->bpf_func + poke->adj_off : NULL, new ? (u8 *)new->bpf_func + diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 10ae59d65f13..7e89f1f49d77 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -77,7 +77,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) int err; if (fentry_cnt + fexit_cnt == 0) { - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP, + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, old_image, NULL); tr->selector = 0; goto out; @@ -105,12 +105,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (tr->selector) /* progs already running at this address */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL, + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, old_image, new_image); else /* first time registering */ - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL, - NULL, new_image); + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL, + new_image); if (err) goto out; tr->selector++; -- cgit From 2f30b36943adca070f2e1551f701bd524ed1ae5a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 11:59:01 +0000 Subject: locking/refcount: Remove unused 'refcount_error_report()' function 'refcount_error_report()' has no callers. Remove it. 
Signed-off-by: Will Deacon Reviewed-by: Ard Biesheuvel Acked-by: Kees Cook Tested-by: Hanjun Guo Cc: Ard Biesheuvel Cc: Elena Reshetova Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191121115902.2551-10-will@kernel.org Signed-off-by: Ingo Molnar --- kernel/panic.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index f470a038b05b..b69ee9e76cb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -671,17 +671,6 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err) -{ - WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", - err, (void *)instruction_pointer(regs), - current->comm, task_pid_nr(current), - from_kuid_munged(&init_user_ns, current_uid()), - from_kuid_munged(&init_user_ns, current_euid())); -} -#endif - core_param(panic, panic_timeout, int, 0644); core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); -- cgit From b111df8447acdeb4b9220f99d5d4b28f83eb56ad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Nov 2019 21:25:46 +0100 Subject: y2038: alarm: fix half-second cut-off Changing alarm_itimer accidentally broke the logic for arithmetic rounding of half seconds in the return code. Change it to a constant based on NSEC_PER_SEC, as suggested by Ben Hutchings. Fixes: bd40a175769d ("y2038: itimer: change implementation to timespec64") Reported-by: Ben Hutchings Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 5872db9bd5f7..9e59c9ea92aa 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -297,7 +297,7 @@ static unsigned int alarm_setitimer(unsigned int seconds) * better return too much than too little anyway */ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || - it_old.it_value.tv_nsec >= 500000) + it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; -- cgit From 61a47c1ad3a4dc6882f01ebdc88138ac62d0df03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Oct 2019 13:01:19 -0500 Subject: sysctl: Remove the sysctl system call This system call has been deprecated almost since it was introduced, and in a survey of the linux distributions I can no longer find any of them that enable CONFIG_SYSCTL_SYSCALL. The only indication that I can find that anyone might care is that a few of the defconfigs in the kernel enable CONFIG_SYSCTL_SYSCALL. However this appears in only 31 of 414 defconfigs in the kernel, so I suspect this symbols presence is simply because it is harmless to include rather than because it is necessary. As there appear to be no users of the sysctl system call, remove the code. As this removes one of the few uses of the internal kernel mount of proc I hope this allows for even more simplifications of the proc filesystem. 
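For anyone still carrying binary-sysctl callers, the same tunables remain reachable through procfs; a minimal userspace sketch (error handling abridged) that reads one entry from the table being removed below, kernel.ostype:

  /* Read a sysctl value via /proc/sys instead of the removed sysctl(2). */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[128];
          ssize_t n;
          int fd = open("/proc/sys/kernel/ostype", O_RDONLY);

          if (fd < 0)
                  return 1;
          n = read(fd, buf, sizeof(buf) - 1);
          close(fd);
          if (n <= 0)
                  return 1;
          buf[n] = '\0';
          printf("kernel.ostype = %s", buf);  /* value includes a trailing newline */
          return 0;
  }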
Cc: Alex Smith Cc: Anders Berg Cc: Apelete Seketeli Cc: Arnd Bergmann Cc: Chee Nouk Phoon Cc: Chris Zankel Cc: Christian Ruppert Cc: Greg Ungerer Cc: Harvey Hunt Cc: Helge Deller Cc: Hongliang Tao Cc: Hua Yan Cc: Huacai Chen Cc: John Crispin Cc: Jonas Jensen Cc: Josh Boyer Cc: Jun Nie Cc: Kevin Hilman Cc: Kevin Wells Cc: Kumar Gala Cc: Lars-Peter Clausen Cc: Ley Foon Tan Cc: Linus Walleij Cc: Markos Chandras Cc: Max Filippov Cc: Noam Camus Cc: Olof Johansson Cc: Paul Burton Cc: Paul Mundt Cc: Phil Edworthy Cc: Pierrick Hascoet Cc: Ralf Baechle Cc: Roland Stigge Cc: Santosh Shilimkar Cc: Scott Telford Cc: Stephen Boyd Cc: Steven J. Hill Cc: Tanmay Inamdar Cc: Vineet Gupta Cc: Wolfram Sang Acked-by: Andi Kleen Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/sysctl_binary.c | 1305 ------------------------------------------------ 1 file changed, 1305 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 73c132095a7b..7d550cc76a3b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -18,1317 +18,12 @@ #include #include -#ifdef CONFIG_SYSCTL_SYSCALL - -struct bin_table; -typedef ssize_t bin_convert_t(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); - -static bin_convert_t bin_dir; -static bin_convert_t bin_string; -static bin_convert_t bin_intvec; -static bin_convert_t bin_ulongvec; -static bin_convert_t bin_uuid; -static bin_convert_t bin_dn_node_address; - -#define CTL_DIR bin_dir -#define CTL_STR bin_string -#define CTL_INT bin_intvec -#define CTL_ULONG bin_ulongvec -#define CTL_UUID bin_uuid -#define CTL_DNADR bin_dn_node_address - -#define BUFSZ 256 - -struct bin_table { - bin_convert_t *convert; - int ctl_name; - const char *procname; - const struct bin_table *child; -}; - -static const struct bin_table bin_random_table[] = { - { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, - { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, - { CTL_UUID, RANDOM_UUID, "uuid" }, - {} -}; - -static const struct bin_table bin_pty_table[] = { - { CTL_INT, PTY_MAX, "max" }, - { CTL_INT, PTY_NR, "nr" }, - {} -}; - -static const struct bin_table bin_kern_table[] = { - { CTL_STR, KERN_OSTYPE, "ostype" }, - { CTL_STR, KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { CTL_STR, KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { CTL_STR, KERN_NODENAME, "hostname" }, - { CTL_STR, KERN_DOMAINNAME, "domainname" }, - - { CTL_INT, KERN_PANIC, "panic" }, - { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, - - { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, - { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, - { CTL_INT, KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { CTL_STR, KERN_MODPROBE, "modprobe" }, - { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, - { CTL_INT, KERN_ACCT, "acct" }, - /* KERN_PPC_L2CR "l2cr" no longer used */ - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { CTL_ULONG, KERN_SHMMAX, "shmmax" }, - { CTL_INT, KERN_MSGMAX, "msgmax" }, - { CTL_INT, KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { CTL_INT, KERN_SYSRQ, "sysrq" }, - { CTL_INT, KERN_MAX_THREADS, "threads-max" }, - { CTL_DIR, KERN_RANDOM, "random", 
bin_random_table }, - { CTL_ULONG, KERN_SHMALL, "shmall" }, - { CTL_INT, KERN_MSGMNI, "msgmni" }, - { CTL_INT, KERN_SEM, "sem" }, - { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, - { CTL_INT, KERN_SHMMNI, "shmmni" }, - - { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, - { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, - - { CTL_STR, KERN_HOTPLUG, "hotplug", }, - { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, - /* KERN_TAINTED "tainted" no longer used */ - { CTL_INT, KERN_CADPID, "cad_pid" }, - { CTL_INT, KERN_PIDMAX, "pid_max" }, - { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, - { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, - { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, - { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, - { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - /* KERN_HZ_TIMER "hz_timer" no longer used */ - { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, - - { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, - /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ - { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, - { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, - { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, - {} -}; - -static const struct bin_table bin_vm_table[] = { - { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, - { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, - /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ - /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ - { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ - { CTL_INT, VM_SWAPPINESS, "swappiness" }, - { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, - { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, - { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, - { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, - { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, - { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, - { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct bin_table bin_net_core_table[] = { - { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, - { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, - { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { CTL_INT, 
NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, - { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, - { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, - { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, - { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, - { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { CTL_INT, NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct bin_table bin_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct bin_table bin_net_ipv4_route_table[] = { - { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, - /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ - /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_vars_table[] = { - { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, - { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - - { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, 
"promote_secondaries" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, - {} -}; - -static const struct bin_table bin_net_neigh_vars_table[] = { - { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, - /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ - { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ - /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ - /* NET_NEIGH_LOCKTIME "locktime" no longer used */ - { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct bin_table bin_net_neigh_table[] = { - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, - { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, - {} -}; - -static const struct bin_table bin_net_ipv4_netfilter_table[] = { - { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ - - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ - /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ - /* 
NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct bin_table bin_net_ipv4_table[] = { - {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, - - { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, - { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, - - { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, - { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, - { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, - { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { CTL_INT, NET_TCP_FACK, "tcp_fack" }, - { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, - { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, - { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, - { CTL_INT, NET_TCP_MEM, "tcp_mem" }, - { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, - { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, - { 
CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, - { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, - { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, - { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ - { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - - { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - - { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - /* NET_TCP_BIC_BETA unused */ - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - /* NET_IPV4_IP_MASQ_DEBUG unused */ - /* NET_TCP_SYN_TAILDROP unused */ - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - /* NET_IPV4_ALWAYS_DEFRAG unused */ - {} -}; - -static const struct bin_table bin_net_ipx_table[] = { - { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct bin_table bin_net_atalk_table[] = { - { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct bin_table bin_net_netrom_table[] = { - { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, 
"transport_maximum_tries" }, - { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { CTL_INT, NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct bin_table bin_net_ax25_param_table[] = { - { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, - { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, - { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, - { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, - { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct bin_table bin_net_ax25_table[] = { - { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, - {} -}; - -static const struct bin_table bin_net_rose_table[] = { - { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, - { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_var_table[] = { - { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, - { CTL_INT, NET_IPV6_MTU, "mtu" }, - { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, - { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, - { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, 
"router_probe_interval" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, - { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, - {} -}; - -static const struct bin_table bin_net_ipv6_route_table[] = { - /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ - { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct bin_table bin_net_ipv6_icmp_table[] = { - { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct bin_table bin_net_ipv6_table[] = { - { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, - { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table }, - { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, - { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, - { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct bin_table bin_net_x25_table[] = { - { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { CTL_INT, NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct bin_table bin_net_tr_table[] = { - { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct bin_table bin_net_decnet_conf_vars[] = { - { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, - { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct bin_table bin_net_decnet_conf[] = { - { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, - { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, - {} -}; - -static 
const struct bin_table bin_net_decnet_table[] = { - { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, - { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, - { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, - { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, - { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, - { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, - { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, - { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, - { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, - { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, - { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct bin_table bin_net_sctp_table[] = { - { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, - { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, - { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, - { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, - { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, - { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, - { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct bin_table bin_net_llc_llc2_timeout_table[] = { - { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, - { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, - { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" }, - { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct bin_table bin_net_llc_station_table[] = { - { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static const struct bin_table bin_net_llc_llc2_table[] = { - { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, - {} -}; - -static const struct bin_table bin_net_llc_table[] = { - { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, - { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, - {} -}; - -static const struct bin_table bin_net_netfilter_table[] = { - { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ - /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ - /* 
NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ - /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ - /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ - /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct bin_table bin_net_table[] = { - { CTL_DIR, NET_CORE, "core", bin_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, - { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, - { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, - { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, - { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, - { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, - /* NET_BRIDGE "bridge" no longer used */ - { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, - { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, - { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, - { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, - { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, - /* NET_ECONET not used */ - { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, - { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, - { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, - /* NET_DCCP "dccp" no longer used */ - /* NET_IRDA "irda" no longer used */ - { CTL_INT, 2089, "nf_conntrack_max" }, - {} -}; - -static const struct bin_table bin_fs_quota_table[] = { - { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, - { CTL_INT, FS_DQ_DROPS, "drops" }, - { CTL_INT, FS_DQ_READS, "reads" }, - { CTL_INT, FS_DQ_WRITES, "writes" }, - { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, - { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, - { CTL_INT, FS_DQ_FREE, "free_dquots" }, - { CTL_INT, FS_DQ_SYNCS, "syncs" }, - { CTL_INT, FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct bin_table 
bin_fs_xfs_table[] = { - { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, - { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, - - { CTL_INT, XFS_ERRLEVEL, "error_level" }, - { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, - { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, - { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, - { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, - { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, - { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, - { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct bin_table bin_fs_ocfs2_nm_table[] = { - { CTL_STR, 1, "hb_ctl_path" }, - {} -}; - -static const struct bin_table bin_fs_ocfs2_table[] = { - { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, - {} -}; - -static const struct bin_table bin_inotify_table[] = { - { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct bin_table bin_fs_table[] = { - { CTL_INT, FS_NRINODE, "inode-nr" }, - { CTL_INT, FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - /* FS_NRFILE "file-nr" no longer used */ - { CTL_INT, FS_MAXFILE, "file-max" }, - { CTL_INT, FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, - { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, - { CTL_INT, FS_LEASES, "leases-enable" }, - { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, - { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, - { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, - { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, - { CTL_ULONG, FS_AIO_NR, "aio-nr" }, - { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, - { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, - { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, - { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" }, - {} -}; - -static const struct bin_table bin_ipmi_table[] = { - { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct bin_table bin_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct bin_table bin_raid_table[] = { - { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct bin_table bin_scsi_table[] = { - { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct bin_table bin_dev_table[] = { - /* DEV_CDROM "cdrom" no longer used */ - /* DEV_HWMON unused */ - /* DEV_PARPORT "parport" no longer used */ - { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, - { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, - { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, - { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, - {} -}; - -static const struct 
bin_table bin_bus_isa_table[] = { - { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, - { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, - { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct bin_table bin_bus_table[] = { - { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, - {} -}; - - -static const struct bin_table bin_s390dbf_table[] = { - { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct bin_table bin_sunrpc_table[] = { - /* CTL_RPCDEBUG "rpc_debug" no longer used */ - /* CTL_NFSDEBUG "nfs_debug" no longer used */ - /* CTL_NFSDDEBUG "nfsd_debug" no longer used */ - /* CTL_NLMDEBUG "nlm_debug" no longer used */ - - { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct bin_table bin_pm_table[] = { - /* frv specific */ - /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ - { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, - { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, - { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct bin_table bin_root_table[] = { - { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, - { CTL_DIR, CTL_VM, "vm", bin_vm_table }, - { CTL_DIR, CTL_NET, "net", bin_net_table }, - /* CTL_PROC not used */ - { CTL_DIR, CTL_FS, "fs", bin_fs_table }, - /* CTL_DEBUG "debug" no longer used */ - { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, - { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, - { CTL_DIR, CTL_ABI, "abi" }, - /* CTL_CPU not used */ - /* CTL_ARLAN "arlan" no longer used */ - { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, - { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, - { CTL_DIR, CTL_PM, "pm", bin_pm_table }, - {} -}; - -static ssize_t bin_dir(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOTDIR; -} - - -static ssize_t bin_string(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - if (oldval && oldlen) { - char __user *lastp; - loff_t pos = 0; - int ch; - - result = vfs_read(file, oldval, oldlen, &pos); - if (result < 0) - goto out; - - copied = result; - lastp = oldval + copied - 1; - - result = -EFAULT; - if (get_user(ch, lastp)) - goto out; - - /* Trim off the trailing newline */ - if (ch == '\n') { - result = -EFAULT; - if (put_user('\0', lastp)) - goto out; - copied -= 1; - } - } - - if (newval && newlen) { - loff_t pos = 0; - - result = vfs_write(file, newval, newlen, &pos); - if (result < 0) - goto out; - } - - result = copied; -out: - return result; -} - -static ssize_t bin_intvec(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t copied = 0; - char *buffer; - ssize_t result; - - result = -ENOMEM; - buffer = kmalloc(BUFSZ, GFP_KERNEL); - if (!buffer) - goto out; - - if (oldval && oldlen) { - unsigned __user *vec = oldval; - size_t length = oldlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - result = kernel_read(file, buffer, BUFSZ - 1, &pos); - if (result < 0) - goto out_kfree; - - str = buffer; - end = str + result; - *end++ = '\0'; - for (i = 0; i < length; i++) { - unsigned long value; - - value = simple_strtoul(str, &str, 10); - while (isspace(*str)) - str++; - - result = -EFAULT; - if (put_user(value, vec + i)) - goto 
out_kfree; - - copied += sizeof(*vec); - if (!isdigit(*str)) - break; - } - } - - if (newval && newlen) { - unsigned __user *vec = newval; - size_t length = newlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - str = buffer; - end = str + BUFSZ; - for (i = 0; i < length; i++) { - unsigned long value; - - result = -EFAULT; - if (get_user(value, vec + i)) - goto out_kfree; - - str += scnprintf(str, end - str, "%lu\t", value); - } - - result = kernel_write(file, buffer, str - buffer, &pos); - if (result < 0) - goto out_kfree; - } - result = copied; -out_kfree: - kfree(buffer); -out: - return result; -} - -static ssize_t bin_ulongvec(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t copied = 0; - char *buffer; - ssize_t result; - - result = -ENOMEM; - buffer = kmalloc(BUFSZ, GFP_KERNEL); - if (!buffer) - goto out; - - if (oldval && oldlen) { - unsigned long __user *vec = oldval; - size_t length = oldlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - result = kernel_read(file, buffer, BUFSZ - 1, &pos); - if (result < 0) - goto out_kfree; - - str = buffer; - end = str + result; - *end++ = '\0'; - for (i = 0; i < length; i++) { - unsigned long value; - - value = simple_strtoul(str, &str, 10); - while (isspace(*str)) - str++; - - result = -EFAULT; - if (put_user(value, vec + i)) - goto out_kfree; - - copied += sizeof(*vec); - if (!isdigit(*str)) - break; - } - } - - if (newval && newlen) { - unsigned long __user *vec = newval; - size_t length = newlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - str = buffer; - end = str + BUFSZ; - for (i = 0; i < length; i++) { - unsigned long value; - - result = -EFAULT; - if (get_user(value, vec + i)) - goto out_kfree; - - str += scnprintf(str, end - str, "%lu\t", value); - } - - result = kernel_write(file, buffer, str - buffer, &pos); - if (result < 0) - goto out_kfree; - } - result = copied; -out_kfree: - kfree(buffer); -out: - return result; -} - -static ssize_t bin_uuid(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - /* Only supports reads */ - if (oldval && oldlen) { - char buf[UUID_STRING_LEN + 1]; - uuid_t uuid; - loff_t pos = 0; - - result = kernel_read(file, buf, sizeof(buf) - 1, &pos); - if (result < 0) - goto out; - - buf[result] = '\0'; - - result = -EIO; - if (uuid_parse(buf, &uuid)) - goto out; - - if (oldlen > 16) - oldlen = 16; - - result = -EFAULT; - if (copy_to_user(oldval, &uuid, oldlen)) - goto out; - - copied = oldlen; - } - result = copied; -out: - return result; -} - -static ssize_t bin_dn_node_address(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - if (oldval && oldlen) { - char buf[15], *nodep; - unsigned long area, node; - __le16 dnaddr; - loff_t pos = 0; - - result = kernel_read(file, buf, sizeof(buf) - 1, &pos); - if (result < 0) - goto out; - - buf[result] = '\0'; - - /* Convert the decnet address to binary */ - result = -EIO; - nodep = strchr(buf, '.'); - if (!nodep) - goto out; - ++nodep; - - area = simple_strtoul(buf, NULL, 10); - node = simple_strtoul(nodep, NULL, 10); - - result = -EIO; - if ((area > 63)||(node > 1023)) - goto out; - - dnaddr = cpu_to_le16((area << 10) | node); - - result = -EFAULT; - if (put_user(dnaddr, (__le16 __user *)oldval)) - goto out; - - copied = sizeof(dnaddr); - } - - if (newval && newlen) { - __le16 dnaddr; - char buf[15]; - 
int len; - loff_t pos = 0; - - result = -EINVAL; - if (newlen != sizeof(dnaddr)) - goto out; - - result = -EFAULT; - if (get_user(dnaddr, (__le16 __user *)newval)) - goto out; - - len = scnprintf(buf, sizeof(buf), "%hu.%hu", - le16_to_cpu(dnaddr) >> 10, - le16_to_cpu(dnaddr) & 0x3ff); - - result = kernel_write(file, buf, len, &pos); - if (result < 0) - goto out; - } - - result = copied; -out: - return result; -} - -static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) -{ - const struct bin_table *table = &bin_root_table[0]; - int ctl_name; - - /* The binary sysctl tables have a small maximum depth so - * there is no danger of overflowing our path as it PATH_MAX - * bytes long. - */ - memcpy(path, "sys/", 4); - path += 4; - -repeat: - if (!nlen) - return ERR_PTR(-ENOTDIR); - ctl_name = *name; - name++; - nlen--; - for ( ; table->convert; table++) { - int len = 0; - - /* - * For a wild card entry map from ifindex to network - * device name. - */ - if (!table->ctl_name) { -#ifdef CONFIG_NET - struct net *net = current->nsproxy->net_ns; - struct net_device *dev; - dev = dev_get_by_index(net, ctl_name); - if (dev) { - len = strlen(dev->name); - memcpy(path, dev->name, len); - dev_put(dev); - } -#endif - /* Use the well known sysctl number to proc name mapping */ - } else if (ctl_name == table->ctl_name) { - len = strlen(table->procname); - memcpy(path, table->procname, len); - } - if (len) { - path += len; - if (table->child) { - *path++ = '/'; - table = table->child; - goto repeat; - } - *path = '\0'; - return table; - } - } - return ERR_PTR(-ENOTDIR); -} - -static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) -{ - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - const struct bin_table *table = get_sysctl(name, nlen, tmp); - result = tmp; - *tablep = table; - if (IS_ERR(table)) { - __putname(tmp); - result = ERR_CAST(table); - } - } - return result; -} - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - const struct bin_table *table = NULL; - struct vfsmount *mnt; - struct file *file; - ssize_t result; - char *pathname; - int flags; - - pathname = sysctl_getname(name, nlen, &table); - result = PTR_ERR(pathname); - if (IS_ERR(pathname)) - goto out; - - /* How should the sysctl be accessed? 
*/ - if (oldval && oldlen && newval && newlen) { - flags = O_RDWR; - } else if (newval && newlen) { - flags = O_WRONLY; - } else if (oldval && oldlen) { - flags = O_RDONLY; - } else { - result = 0; - goto out_putname; - } - - mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); - result = PTR_ERR(file); - if (IS_ERR(file)) - goto out_putname; - - result = table->convert(file, oldval, oldlen, newval, newlen); - - fput(file); -out_putname: - __putname(pathname); -out: - return result; -} - - -#else /* CONFIG_SYSCTL_SYSCALL */ - static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { return -ENOSYS; } -#endif /* CONFIG_SYSCTL_SYSCALL */ - - static void deprecated_sysctl_warning(const int *name, int nlen) { int i; -- cgit From 36a8015f89e40f7c9c91cc7e6d028fa288dad27b Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Tue, 26 Nov 2019 17:17:13 +0200 Subject: PM / QoS: Restore DEV_PM_QOS_MIN/MAX_FREQUENCY Support for adding per-device frequency limits was removed in commit 2aac8bdf7a0f ("PM: QoS: Drop frequency QoS types from device PM QoS") after cpufreq switched to use a new "freq_constraints" construct. Restore support for per-device freq limits but base this upon freq_constraints. This is primarily meant to be used by the devfreq subsystem. This removes the "static" marking on freq_qos_apply but does not export it for modules. Signed-off-by: Leonard Crestez Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a45cba7df0ae..83edf8698118 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -714,8 +714,10 @@ s32 freq_qos_read_value(struct freq_constraints *qos, * @req: Constraint request to apply. * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. + * + * This is only meant to be called from inside pm_qos, not drivers. */ -static int freq_qos_apply(struct freq_qos_request *req, +int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret; -- cgit From ff68dac6d65cd1347dad5d780dd8c90f29dc1b0b Mon Sep 17 00:00:00 2001 From: Gaowei Pu Date: Sat, 30 Nov 2019 17:51:03 -0800 Subject: mm/mmap.c: use IS_ERR_VALUE to check return value of get_unmapped_area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_unmapped_area() returns an address or -errno on failure. Historically we have checked for the failure by offset_in_page() which is correct but quite hard to read. Newer code started using IS_ERR_VALUE which is much easier to read. Convert remaining users of offset_in_page as well. [mhocko@suse.com: rewrite changelog] [mhocko@kernel.org: fix mremap.c and uprobes.c sites also] Link: http://lkml.kernel.org/r/20191012102512.28051-1-pugaowei@gmail.com Signed-off-by: Gaowei Pu Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Wei Yang Cc: Konstantin Khlebnikov Cc: Kirill A. 
Shutemov Cc: "Jérôme Glisse" Cc: Mike Kravetz Cc: Rik van Riel Cc: Qian Cai Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c74761004ee5..ece7e13f6e4a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { + if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } -- cgit From eafb149ed73a8bb8359c0ce027b98acd4e95b070 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:57 -0800 Subject: fork: support VMAP_STACK with KASAN_VMALLOC Supporting VMAP_STACK with KASAN_VMALLOC is straightforward: - clear the shadow region of vmapped stacks when swapping them in - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN Link: http://lkml.kernel.org/r/20191031093909.9228-4-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Christophe Leroy Cc: Mark Rutland Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0f0bac8318dd..21c6c1e29b98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; + /* Clear the KASAN shadow of the stack. */ + kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -- cgit From 204cb79ad42f015312a5bbd7012d09c93d9b46fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:56:08 -0800 Subject: kernel: sysctl: make drop_caches write-only Currently, the drop_caches proc file and sysctl read back the last value written, suggesting this is somehow a stateful setting instead of a one-time command. Make it write-only, like e.g. compact_memory. While mitigating a VM problem at scale in our fleet, there was confusion about whether writing to this file will permanently switch the kernel into a non-caching mode. This influences the decision making in a tense situation, where tens of people are trying to fix tens of thousands of affected machines: Do we need a rollback strategy? What are the performance implications of operating in a non-caching state for several days? It also caused confusion when the kernel team said we may need to write the file several times to make sure it's effective ("But it already reads back 3?"). 
Link: http://lkml.kernel.org/r/20191031221602.9375-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Chris Down Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b6f2f35d0bcf..70665934d53e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, .extra2 = &four, -- cgit From 6c3edaf9fd6a3be7fb5bc6931897c24cd3848f84 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Nov 2019 20:52:18 -0800 Subject: tracing: Introduce trace event injection We have been trying to use rasdaemon to monitor hardware errors like correctable memory errors. rasdaemon uses trace events to monitor various hardware errors. In order to test it, we have to inject some hardware errors, unfortunately not all of them provide error injections. MCE does provide a way to inject MCE errors, but errors like PCI error and devlink error don't, it is not easy to add error injection to each of them. Instead, it is relatively easier to just allow users to inject trace events in a generic way so that all trace events can be injected. This patch introduces trace event injection, where a new 'inject' is added to each tracepoint directory. Users could write into this file with key=value pairs to specify the value of each fields of the trace event, all unspecified fields are set to zero values by default. For example, for the net/net_dev_queue tracepoint, we can inject: INJECT=/sys/kernel/debug/tracing/events/net/net_dev_queue/inject echo "" > $INJECT echo "name='test'" > $INJECT echo "name='test' len=1024" > $INJECT cat /sys/kernel/debug/tracing/trace ... <...>-614 [000] .... 36.571483: net_dev_queue: dev= skbaddr=00000000fbf338c2 len=0 <...>-614 [001] .... 136.588252: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=0 <...>-614 [001] .N.. 208.431878: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=1024 Triggers could be triggered as usual too: echo "stacktrace if len == 1025" > /sys/kernel/debug/tracing/events/net/net_dev_queue/trigger echo "len=1025" > $INJECT cat /sys/kernel/debug/tracing/trace ... bash-614 [000] .... 36.571483: net_dev_queue: dev= skbaddr=00000000fbf338c2 len=0 bash-614 [001] .... 136.588252: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=0 bash-614 [001] .N.. 208.431878: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=1024 bash-614 [001] .N.1 284.236349: => event_inject_write => vfs_write => ksys_write => do_syscall_64 => entry_SYSCALL_64_after_hwframe The only thing that can't be injected is string pointers as they require constant string pointers, this can't be done at run time. 
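As an illustration (not part of this patch), the echo-based injection above can also be driven from a small program; the event path and field names follow the net_dev_queue example and assume tracefs is reachable under /sys/kernel/debug/tracing and the caller is root:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define INJECT "/sys/kernel/debug/tracing/events/net/net_dev_queue/inject"

int main(void)
{
	const char *fields = "name='test' len=1024";
	int fd = open(INJECT, O_WRONLY);

	if (fd < 0) {
		perror("open " INJECT);
		return 1;
	}
	/* Unspecified fields are zero-filled by the kernel. */
	if (write(fd, fields, strlen(fields)) < 0)
		perror("inject");
	close(fd);
	return 0;
}
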
Link: http://lkml.kernel.org/r/20191130045218.18979-1-xiyou.wangcong@gmail.com Cc: Ingo Molnar Signed-off-by: Cong Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 6 + kernel/trace/trace_events_inject.c | 331 +++++++++++++++++++++++++++++++++++++ 5 files changed, 348 insertions(+) create mode 100644 kernel/trace/trace_events_inject.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f67620499faa..29a9c5058b62 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -672,6 +672,15 @@ config HIST_TRIGGERS See Documentation/trace/histogram.rst. If in doubt, say N. +config TRACE_EVENT_INJECT + bool "Trace event injection" + depends on TRACING + help + Allow user-space to inject a specific trace event into the ring + buffer. This is mainly used for testing purpose. + + If unsure, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c2b2148bb1d2..0e63db62225f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ca7fccafbcbb..63bf60f79398 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1601,6 +1601,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6b3a69e9aa6a..c6de3cebc127 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2044,6 +2044,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg) + trace_create_file("inject", 0200, file->dir, file, + &event_inject_fops); +#endif + return 0; } diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c new file mode 100644 index 000000000000..d43710718ee5 --- /dev/null +++ b/kernel/trace/trace_events_inject.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_inject - trace event injection + * + * Copyright (C) 2019 Cong Wang + */ + +#include +#include +#include +#include +#include + +#include "trace.h" + +static int +trace_inject_entry(struct trace_event_file *file, void *rec, int len) +{ + struct trace_event_buffer fbuffer; + struct ring_buffer *buffer; + int written = 0; + void *entry; + + rcu_read_lock_sched(); + buffer = file->tr->trace_buffer.buffer; + entry = trace_event_buffer_reserve(&fbuffer, file, len); + if (entry) { + memcpy(entry, rec, len); + written = len; + trace_event_buffer_commit(&fbuffer); + } + rcu_read_unlock_sched(); + + return written; +} + +static int +parse_field(char *str, struct trace_event_call *call, + struct ftrace_event_field **pf, u64 *pv) +{ + struct 
ftrace_event_field *field; + char *field_name; + int s, i = 0; + int len; + u64 val; + + if (!str[i]) + return 0; + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; + while (isalnum(str[i]) || str[i] == '_') + i++; + len = i - s; + if (!len) + return -EINVAL; + + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) + return -ENOENT; + + *pf = field; + while (isspace(str[i])) + i++; + if (str[i] != '=') + return -EINVAL; + i++; + while (isspace(str[i])) + i++; + s = i; + if (isdigit(str[i]) || str[i] == '-') { + char *num, c; + int ret; + + /* Make sure the field is not a string */ + if (is_string_field(field)) + return -EINVAL; + + if (str[i] == '-') + i++; + + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + num = str + s; + c = str[i]; + if (c != '\0' && !isspace(c)) + return -EINVAL; + str[i] = '\0'; + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num, 0, &val); + else + ret = kstrtoull(num, 0, &val); + str[i] = c; + if (ret) + return ret; + + *pv = val; + return i; + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) + return -EINVAL; + + for (i++; str[i]; i++) { + if (str[i] == '\\' && str[i + 1]) { + i++; + continue; + } + if (str[i] == q) + break; + } + if (!str[i]) + return -EINVAL; + + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) + return -EINVAL; + + *pv = (unsigned long)(str + s); + str[i] = 0; + /* go past the last quote */ + i++; + return i; + } + + return -EINVAL; +} + +static int trace_get_entry_size(struct trace_event_call *call) +{ + struct ftrace_event_field *field; + struct list_head *head; + int size = 0; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (field->size + field->offset > size) + size = field->size + field->offset; + } + + return size; +} + +static void *trace_alloc_entry(struct trace_event_call *call, int *size) +{ + int entry_size = trace_get_entry_size(call); + struct ftrace_event_field *field; + struct list_head *head; + void *entry = NULL; + + /* We need an extra '\0' at the end. */ + entry = kzalloc(entry_size + 1, GFP_KERNEL); + if (!entry) + return NULL; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (!is_string_field(field)) + continue; + if (field->filter_type == FILTER_STATIC_STRING) + continue; + if (field->filter_type == FILTER_DYN_STRING) { + u32 *str_item; + int str_loc = entry_size & 0xffff; + + str_item = (u32 *)(entry + field->offset); + *str_item = str_loc; /* string length is 0. */ + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = ""; + } + } + + *size = entry_size + 1; + return entry; +} + +#define INJECT_STRING "STATIC STRING CAN NOT BE INJECTED" + +/* Caller is responsible to free the *pentry. 
*/ +static int parse_entry(char *str, struct trace_event_call *call, void **pentry) +{ + struct ftrace_event_field *field; + unsigned long irq_flags; + void *entry = NULL; + int entry_size; + u64 val; + int len; + + entry = trace_alloc_entry(call, &entry_size); + *pentry = entry; + if (!entry) + return -ENOMEM; + + local_save_flags(irq_flags); + tracing_generic_entry_update(entry, call->event.type, irq_flags, + preempt_count()); + + while ((len = parse_field(str, call, &field, &val)) > 0) { + if (is_function_field(field)) + return -EINVAL; + + if (is_string_field(field)) { + char *addr = (char *)(unsigned long) val; + + if (field->filter_type == FILTER_STATIC_STRING) { + strlcpy(entry + field->offset, addr, field->size); + } else if (field->filter_type == FILTER_DYN_STRING) { + int str_len = strlen(addr) + 1; + int str_loc = entry_size & 0xffff; + u32 *str_item; + + entry_size += str_len; + *pentry = krealloc(entry, entry_size, GFP_KERNEL); + if (!*pentry) { + kfree(entry); + return -ENOMEM; + } + entry = *pentry; + + strlcpy(entry + (entry_size - str_len), addr, str_len); + str_item = (u32 *)(entry + field->offset); + *str_item = (str_len << 16) | str_loc; + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = INJECT_STRING; + } + } else { + switch (field->size) { + case 1: { + u8 tmp = (u8) val; + + memcpy(entry + field->offset, &tmp, 1); + break; + } + case 2: { + u16 tmp = (u16) val; + + memcpy(entry + field->offset, &tmp, 2); + break; + } + case 4: { + u32 tmp = (u32) val; + + memcpy(entry + field->offset, &tmp, 4); + break; + } + case 8: + memcpy(entry + field->offset, &val, 8); + break; + default: + return -EINVAL; + } + } + + str += len; + } + + if (len < 0) + return len; + + return entry_size; +} + +static ssize_t +event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_call *call; + struct trace_event_file *file; + int err = -ENODEV, size; + void *entry = NULL; + char *buf; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + strim(buf); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) { + call = file->event_call; + size = parse_entry(buf, call, &entry); + if (size < 0) + err = size; + else + err = trace_inject_entry(file, entry, size); + } + mutex_unlock(&event_mutex); + + kfree(entry); + kfree(buf); + + if (err < 0) + return err; + + *ppos += err; + return cnt; +} + +static ssize_t +event_inject_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + return -EPERM; +} + +const struct file_operations event_inject_fops = { + .open = tracing_open_generic, + .read = event_inject_read, + .write = event_inject_write, +}; -- cgit From a356646a56857c2e5ad875beec734d7145ecd49a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Dec 2019 16:25:27 -0500 Subject: tracing: Do not create directories if lockdown is in affect If lockdown is disabling tracing on boot up, it prevents the tracing files from even bering created. But when that happens, there's several places that will give a warning that the files were not created as that is usually a sign of a bug. Add in strategic locations where a check is made to see if tracing is disabled by lockdown, and if it is, do not go further, and fail silently (but print that tracing is disabled by lockdown, without doing a WARN_ON()). 
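For reference (not part of this patch): each hunk below open-codes the same guard. Condensed into a single sketch, with a helper name invented purely for illustration, the pattern is an early, quiet bail-out using the existing security_locked_down()/LOCKDOWN_TRACEFS API:

#include <linux/printk.h>
#include <linux/security.h>

/* Sketch only; the patch repeats this check inline at each site. */
static int tracing_check_lockdown(const char *what)
{
	if (security_locked_down(LOCKDOWN_TRACEFS)) {
		pr_warn("%s disabled due to lockdown\n", what);
		return -EPERM;
	}
	return 0;
}
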
Cc: Matthew Garrett Fixes: 17911ff38aa5 ("tracing: Add locked_down checks to the open calls of files created for tracefs") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ++++++ kernel/trace/trace.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 66358d66c933..4bf050fcfe3b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include /* for self test */ @@ -5068,6 +5069,11 @@ static __init int test_ringbuffer(void) int cpu; int ret = 0; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Lockdown is enabled, skipping ring buffer tests\n"); + return 0; + } + pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02a23a6e5e00..23459d53d576 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1888,6 +1888,12 @@ int __init register_tracer(struct tracer *type) return -1; } + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Can not register tracer %s due to lockdown\n", + type->name); + return -EPERM; + } + mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -8789,6 +8795,11 @@ struct dentry *tracing_init_dentry(void) { struct trace_array *tr = &global_trace; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Tracing disabled due to lockdown\n"); + return ERR_PTR(-EPERM); + } + /* The top level trace array uses NULL as parent */ if (tr->dir) return NULL; @@ -9231,6 +9242,12 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; + + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Tracing disabled due to lockdown\n"); + return -EPERM; + } + /* * Make sure we don't accidently add more trace options * than we have bits for. -- cgit From 1a50cb80f219c44adb6265f5071b81fc3c1deced Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 4 Dec 2019 16:50:39 -0800 Subject: kernel/notifier.c: intercept duplicate registrations to avoid infinite loops Registering the same notifier to a hook repeatedly can cause the hook list to form a ring or lose other members of the list. case1: An infinite loop in notifier_chain_register() can cause soft lockup atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test2); case2: An infinite loop in notifier_chain_register() can cause soft lockup atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_call_chain(&test_notifier_list, 0, NULL); case3: lose other hook test2 atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test2); atomic_notifier_chain_register(&test_notifier_list, &test1); case4: Unregister returns 0, but the hook is still in the linked list, and it is not really registered. If you call notifier_call_chain after ko is unloaded, it will trigger oops. If the system is configured with softlockup_panic and the same hook is repeatedly registered on the panic_notifier_list, it will cause a loop panic. 
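The corruption is easy to reproduce outside the kernel. A standalone userspace simulation (simplified stand-ins for the kernel structures, not kernel code) of how one duplicate registration turns the chain into a ring:

    /* cc -o notifier_ring notifier_ring.c && ./notifier_ring */
    #include <stdio.h>
    #include <stddef.h>

    struct notifier_block {
        int priority;
        struct notifier_block *next;
        const char *name;
    };

    /* Mirrors notifier_chain_register() before this patch: no duplicate check. */
    static void chain_register(struct notifier_block **nl, struct notifier_block *n)
    {
        while (*nl != NULL) {
            if (n->priority > (*nl)->priority)
                break;
            nl = &(*nl)->next;
        }
        n->next = *nl;
        *nl = n;
    }

    int main(void)
    {
        struct notifier_block *head = NULL;
        struct notifier_block test1 = { .priority = 0, .name = "test1" };
        int hops = 0;

        chain_register(&head, &test1);
        chain_register(&head, &test1);  /* duplicate: test1.next now points at test1 */

        printf("ring formed: %s\n", test1.next == &test1 ? "yes" : "no");

        /* Walking the chain, as notifier_call_chain() would, never reaches NULL;
         * the walk is capped at 5 hops only so the demo terminates. */
        for (struct notifier_block *p = head; p && hops < 5; p = p->next, hops++)
            printf("hop %d: %s\n", hops, p->name);
        return 0;
    }
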
Add a check in notifier_chain_register(), intercepting duplicate registrations to avoid infinite loops Link: http://lkml.kernel.org/r/1568861888-34045-2-git-send-email-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Vasily Averin Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Cc: Anna Schumaker Cc: Arjan van de Ven Cc: J. Bruce Fields Cc: Chuck Lever Cc: David S. Miller Cc: Jeff Layton Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Nadia Derbey Cc: "Paul E. McKenney" Cc: Sam Protsenko Cc: Alan Stern Cc: Thomas Gleixner Cc: Trond Myklebust Cc: Viresh Kumar Cc: Xiaoming Ni Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index d9f5081d578d..30bedb8be6dd 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -23,7 +23,10 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { - WARN_ONCE(((*nl) == n), "double register detected"); + if (unlikely((*nl) == n)) { + WARN(1, "double register detected"); + return 0; + } if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); -- cgit From 5adaabb65a267d890b29193af2dbc38a3b85bbf2 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 4 Dec 2019 16:50:43 -0800 Subject: kernel/notifier.c: remove notifier_chain_cond_register() The only difference between notifier_chain_cond_register() and notifier_chain_register() is the lack of warning hints for duplicate registrations. Use notifier_chain_register() instead of notifier_chain_cond_register() to avoid duplicate code Link: http://lkml.kernel.org/r/1568861888-34045-3-git-send-email-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Andrew Morton Cc: Alan Stern Cc: Alexey Dobriyan Cc: Andy Lutomirski Cc: Anna Schumaker Cc: Arjan van de Ven Cc: Chuck Lever Cc: David S. Miller Cc: Ingo Molnar Cc: J. Bruce Fields Cc: Jeff Layton Cc: Nadia Derbey Cc: "Paul E. 
McKenney" Cc: Sam Protsenko Cc: Thomas Gleixner Cc: Trond Myklebust Cc: Vasily Averin Cc: Viresh Kumar Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 30bedb8be6dd..e3d221f092fe 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -36,21 +36,6 @@ static int notifier_chain_register(struct notifier_block **nl, return 0; } -static int notifier_chain_cond_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) - return 0; - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { @@ -252,7 +237,7 @@ int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, int ret; down_write(&nh->rwsem); - ret = notifier_chain_cond_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem); return ret; } -- cgit From 260a2679e5cbfb3d8a4cf6cd1cb6f57e89c7e543 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 4 Dec 2019 16:50:47 -0800 Subject: kernel/notifier.c: remove blocking_notifier_chain_cond_register() blocking_notifier_chain_cond_register() does not consider system_booting state, which is the only difference between this function and blocking_notifier_cain_register(). This can be a bug and is a piece of duplicate code. Delete blocking_notifier_chain_cond_register() Link: http://lkml.kernel.org/r/1568861888-34045-4-git-send-email-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Andrew Morton Cc: Alan Stern Cc: Alexey Dobriyan Cc: Andy Lutomirski Cc: Anna Schumaker Cc: Arjan van de Ven Cc: Chuck Lever Cc: David S. Miller Cc: Ingo Molnar Cc: J. Bruce Fields Cc: Jeff Layton Cc: Nadia Derbey Cc: "Paul E. McKenney" Cc: Sam Protsenko Cc: Thomas Gleixner Cc: Trond Myklebust Cc: Vasily Averin Cc: Viresh Kumar Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index e3d221f092fe..63d7501ac638 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -220,29 +220,6 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); -/** - * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain, only if not already - * present in the chain. - * Must be called in process context. - * - * Currently always returns zero. 
- */ -int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); - /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain -- cgit From ef70eff9dea66f38f8c2c2dcc7fe4b7a2bbb4921 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 4 Dec 2019 16:50:50 -0800 Subject: kernel/profile.c: use cpumask_available to check for NULL cpumask When building with clang + -Wtautological-pointer-compare, these instances pop up: kernel/profile.c:339:6: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare] if (prof_cpu_mask != NULL) ^~~~~~~~~~~~~ ~~~~ kernel/profile.c:376:6: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare] if (prof_cpu_mask != NULL) ^~~~~~~~~~~~~ ~~~~ kernel/profile.c:406:26: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare] if (!user_mode(regs) && prof_cpu_mask != NULL && ^~~~~~~~~~~~~ ~~~~ 3 warnings generated. This can be addressed with the cpumask_available helper, introduced in commit f7e30f01a9e2 ("cpumask: Add helper cpumask_available()") to fix warnings like this while keeping the code the same. Link: https://github.com/ClangBuiltLinux/linux/issues/747 Link: http://lkml.kernel.org/r/20191022191957.9554-1-natechancellor@gmail.com Signed-off-by: Nathan Chancellor Reviewed-by: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index af7c94bf5fa1..4b144b02ca5d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -336,7 +336,7 @@ static int profile_dead_cpu(unsigned int cpu) struct page *page; int i; - if (prof_cpu_mask != NULL) + if (cpumask_available(prof_cpu_mask)) cpumask_clear_cpu(cpu, prof_cpu_mask); for (i = 0; i < 2; i++) { @@ -373,7 +373,7 @@ static int profile_prepare_cpu(unsigned int cpu) static int profile_online_cpu(unsigned int cpu) { - if (prof_cpu_mask != NULL) + if (cpumask_available(prof_cpu_mask)) cpumask_set_cpu(cpu, prof_cpu_mask); return 0; @@ -403,7 +403,7 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && prof_cpu_mask != NULL && + if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } -- cgit From 5e1aada08cd19ea652b2d32a250501d09b02ff2e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Dec 2019 16:50:53 -0800 Subject: kernel/sys.c: avoid copying possible padding bytes in copy_to_user Initialization is not guaranteed to zero padding bytes so use an explicit memset instead to avoid leaking any kernel content in any possible padding bytes. 
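A userspace sketch of the same idea (the struct layout below is hypothetical; whether "= { }" happens to zero the padding is compiler-dependent, the point is only that the standard does not guarantee it, while an explicit memset() does):

    #include <stdio.h>
    #include <string.h>

    struct reply {       /* hypothetical layout with padding after 'c' */
        char c;          /* 1 byte, typically followed by 7 padding bytes */
        long value;
    };

    static void dump(const char *tag, const void *p, size_t n)
    {
        const unsigned char *b = p;
        printf("%-8s", tag);
        for (size_t i = 0; i < n; i++)
            printf(" %02x", b[i]);
        printf("\n");
    }

    int main(void)
    {
        struct reply zeroed, inited = { .c = 'x', .value = 42 };

        /* The patch's approach: clear everything, padding included, then fill. */
        memset(&zeroed, 0, sizeof(zeroed));
        zeroed.c = 'x';
        zeroed.value = 42;

        dump("memset:", &zeroed, sizeof(zeroed));  /* padding guaranteed zero */
        dump("init:",  &inited, sizeof(inited));   /* padding contents unspecified */
        return 0;
    }
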
Link: http://lkml.kernel.org/r/dfa331c00881d61c8ee51577a082d8bebd61805c.camel@perches.com Signed-off-by: Joe Perches Cc: Dan Carpenter Cc: Julia Lawall Cc: Thomas Gleixner Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d3aef31e24dc..a9331f101883 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1279,11 +1279,13 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - struct oldold_utsname tmp = {}; + struct oldold_utsname tmp; if (!name) return -EFAULT; + memset(&tmp, 0, sizeof(tmp)); + down_read(&uts_sem); memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); -- cgit From 964975ac6677c97ae61ec9d6969dd5d03f18d1c3 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 4 Dec 2019 16:52:03 -0800 Subject: lib/genalloc.c: rename addr_in_gen_pool to gen_pool_has_addr Follow the kernel conventions, rename addr_in_gen_pool to gen_pool_has_addr. [sjhuang@iluvatar.ai: fix Documentation/ too] Link: http://lkml.kernel.org/r/20181229015914.5573-1-sjhuang@iluvatar.ai Link: http://lkml.kernel.org/r/20181228083950.20398-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Reviewed-by: Andrew Morton Cc: Russell King Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma/remap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d47bd40fc0f5..d14cbc83986a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -178,7 +178,7 @@ bool dma_in_atomic_pool(void *start, size_t size) if (unlikely(!atomic_pool)) return false; - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags) -- cgit From eec028c9386ed1a692aa01a85b55952202b41619 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 4 Dec 2019 16:52:43 -0800 Subject: kcov: remote coverage support Patch series " kcov: collect coverage from usb and vhost", v3. This patchset extends kcov to allow collecting coverage from backgound kernel threads. This extension requires custom annotations for each of the places where coverage collection is desired. This patchset implements this for hub events in the USB subsystem and for vhost workers. See the first patch description for details about the kcov extension. The other two patches apply this kcov extension to USB and vhost. Examples of other subsystems that might potentially benefit from this when custom annotations are added (the list is based on process_one_work() callers for bugs recently reported by syzbot): 1. fs: writeback wb_workfn() worker, 2. net: addrconf_dad_work()/addrconf_verify_work() workers, 3. net: neigh_periodic_work() worker, 4. net/p9: p9_write_work()/p9_read_work() workers, 5. block: blk_mq_run_work_fn() worker. 
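As a rough sketch of what such an annotation site looks like (hedged: only kcov_remote_start(), kcov_remote_stop(), kcov_common_handle() and kcov_remote_handle() come from this patchset; the worker functions and the way the handle reaches them are made up for illustration):

    /* Global background thread: a fixed subsystem id plus a per-instance id. */
    static void example_global_worker(int instance_id)
    {
        kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, instance_id));
        /* ... one unit of background work ... */
        kcov_remote_stop();
    }

    /* Local background thread: the code that spawns it saves the creator's
     * kcov_common_handle() and passes it in, e.g. via the worker's data. */
    static int example_local_worker(void *data)
    {
        u64 handle = *(u64 *)data;

        while (!kthread_should_stop()) {
            kcov_remote_start(handle);
            /* ... process one work item ... */
            kcov_remote_stop();
        }
        return 0;
    }
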
These patches have been used to enable coverage-guided USB fuzzing with syzkaller for the last few years, see the details here: https://github.com/google/syzkaller/blob/master/docs/linux/external_fuzzing_usb.md This patchset has been pushed to the public Linux kernel Gerrit instance: https://linux-review.googlesource.com/c/linux/kernel/git/torvalds/linux/+/1524 This patch (of 3): Add background thread coverage collection ability to kcov. With KCOV_ENABLE coverage is collected only for syscalls that are issued from the current process. With KCOV_REMOTE_ENABLE it's possible to collect coverage for arbitrary parts of the kernel code, provided that those parts are annotated with kcov_remote_start()/kcov_remote_stop(). This makes it possible to collect coverage from two types of kernel background threads: the global ones, which are spawned during kernel boot in a limited number of instances (e.g. one USB hub_event() worker thread is spawned per USB HCD); and the local ones, which are spawned when a user interacts with some kernel interface (e.g. vhost workers). To enable collecting coverage from a global background thread, a unique global handle must be assigned and passed to the corresponding kcov_remote_start() call. Then a userspace process can pass a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the kcov_remote_arg struct. This will attach the used kcov device to the code sections that are referenced by those handles. Since there might be many local background threads spawned from different userspace processes, we can't use a single global handle per annotation. Instead, the userspace process passes a non-zero handle through the common_handle field of the kcov_remote_arg struct. This common handle gets saved to the kcov_handle field in the current task_struct and needs to be passed to the newly spawned threads via custom annotations. Those threads should in turn be annotated with kcov_remote_start()/kcov_remote_stop(). Internally kcov stores handles as u64 integers. The top byte of a handle is used to denote the id of a subsystem that this handle belongs to, and the lower 4 bytes are used to denote the id of a thread instance within that subsystem. A reserved value 0 is used as a subsystem id for common handles as they don't belong to a particular subsystem. The bytes 4-7 are currently reserved and must be zero. In the future the number of bytes used for the subsystem or handle ids might be increased. When a particular userspace process collects coverage via a common handle, kcov will collect coverage for each code section that is annotated to use the common handle obtained as kcov_handle from the current task_struct. However, non-common handles make it possible to collect coverage selectively from different subsystems. Link: http://lkml.kernel.org/r/e90e315426a384207edbec1d6aa89e43008e4caf.1572366574.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: "Michael S.
Tsirkin" Cc: Jason Wang Cc: Arnd Bergmann Cc: Steven Rostedt Cc: David Windsor Cc: Elena Reshetova Cc: Anders Roxell Cc: Alexander Potapenko Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 547 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 512 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 2ee38727844a..f50354202dbe 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -21,8 +22,11 @@ #include #include #include +#include #include +#define kcov_debug(fmt, ...) pr_debug("%s: " fmt, __func__, ##__VA_ARGS__) + /* Number of 64-bit words written per one comparison: */ #define KCOV_WORDS_PER_CMP 4 @@ -44,19 +48,100 @@ struct kcov { * Reference counter. We keep one for: * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) + * - each code section for remote coverage collection */ refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; enum kcov_mode mode; - /* Size of arena (in long's for KCOV_MODE_TRACE). */ - unsigned size; + /* Size of arena (in long's). */ + unsigned int size; /* Coverage buffer shared with user space. */ void *area; /* Task for which we collect coverage, or NULL. */ struct task_struct *t; + /* Collecting coverage from remote (background) threads. */ + bool remote; + /* Size of remote area (in long's). */ + unsigned int remote_size; + /* + * Sequence is incremented each time kcov is reenabled, used by + * kcov_remote_stop(), see the comment there. + */ + int sequence; }; +struct kcov_remote_area { + struct list_head list; + unsigned int size; +}; + +struct kcov_remote { + u64 handle; + struct kcov *kcov; + struct hlist_node hnode; +}; + +static DEFINE_SPINLOCK(kcov_remote_lock); +static DEFINE_HASHTABLE(kcov_remote_map, 4); +static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); + +/* Must be called with kcov_remote_lock locked. */ +static struct kcov_remote *kcov_remote_find(u64 handle) +{ + struct kcov_remote *remote; + + hash_for_each_possible(kcov_remote_map, remote, hnode, handle) { + if (remote->handle == handle) + return remote; + } + return NULL; +} + +static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) +{ + struct kcov_remote *remote; + + if (kcov_remote_find(handle)) + return ERR_PTR(-EEXIST); + remote = kmalloc(sizeof(*remote), GFP_ATOMIC); + if (!remote) + return ERR_PTR(-ENOMEM); + remote->handle = handle; + remote->kcov = kcov; + hash_add(kcov_remote_map, &remote->hnode, handle); + return remote; +} + +/* Must be called with kcov_remote_lock locked. */ +static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) +{ + struct kcov_remote_area *area; + struct list_head *pos; + + kcov_debug("size = %u\n", size); + list_for_each(pos, &kcov_remote_areas) { + area = list_entry(pos, struct kcov_remote_area, list); + if (area->size == size) { + list_del(&area->list); + kcov_debug("rv = %px\n", area); + return area; + } + } + kcov_debug("rv = NULL\n"); + return NULL; +} + +/* Must be called with kcov_remote_lock locked. 
*/ +static void kcov_remote_area_put(struct kcov_remote_area *area, + unsigned int size) +{ + kcov_debug("area = %px, size = %u\n", area, size); + INIT_LIST_HEAD(&area->list); + area->size = size; + list_add(&area->list, &kcov_remote_areas); +} + static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -73,7 +158,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru * in_interrupt() returns false (e.g. preempt_schedule_irq()). * READ_ONCE()/barrier() effectively provides load-acquire wrt * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). + * kcov_start(). */ barrier(); return mode == needed_mode; @@ -227,6 +312,78 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ +static void kcov_start(struct task_struct *t, unsigned int size, + void *area, enum kcov_mode mode, int sequence) +{ + kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + /* Cache in task struct for performance. */ + t->kcov_size = size; + t->kcov_area = area; + /* See comment in check_kcov_mode(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, mode); + t->kcov_sequence = sequence; +} + +static void kcov_stop(struct task_struct *t) +{ + WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); + barrier(); + t->kcov_size = 0; + t->kcov_area = NULL; +} + +static void kcov_task_reset(struct task_struct *t) +{ + kcov_stop(t); + t->kcov = NULL; + t->kcov_sequence = 0; + t->kcov_handle = 0; +} + +void kcov_task_init(struct task_struct *t) +{ + kcov_task_reset(t); + t->kcov_handle = current->kcov_handle; +} + +static void kcov_reset(struct kcov *kcov) +{ + kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; + kcov->remote = false; + kcov->remote_size = 0; + kcov->sequence++; +} + +static void kcov_remote_reset(struct kcov *kcov) +{ + int bkt; + struct kcov_remote *remote; + struct hlist_node *tmp; + + spin_lock(&kcov_remote_lock); + hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { + if (remote->kcov != kcov) + continue; + kcov_debug("removing handle %llx\n", remote->handle); + hash_del(&remote->hnode); + kfree(remote); + } + /* Do reset before unlock to prevent races with kcov_remote_start(). */ + kcov_reset(kcov); + spin_unlock(&kcov_remote_lock); +} + +static void kcov_disable(struct task_struct *t, struct kcov *kcov) +{ + kcov_task_reset(t); + if (kcov->remote) + kcov_remote_reset(kcov); + else + kcov_reset(kcov); +} + static void kcov_get(struct kcov *kcov) { refcount_inc(&kcov->refcount); @@ -235,20 +392,12 @@ static void kcov_get(struct kcov *kcov) static void kcov_put(struct kcov *kcov) { if (refcount_dec_and_test(&kcov->refcount)) { + kcov_remote_reset(kcov); vfree(kcov->area); kfree(kcov); } } -void kcov_task_init(struct task_struct *t) -{ - WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); - barrier(); - t->kcov_size = 0; - t->kcov_area = NULL; - t->kcov = NULL; -} - void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; @@ -256,15 +405,36 @@ void kcov_task_exit(struct task_struct *t) kcov = t->kcov; if (kcov == NULL) return; + spin_lock(&kcov->lock); + kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); + /* + * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, + * which comes down to: + * WARN_ON(!kcov->remote && kcov->t != t); + * + * For KCOV_REMOTE_ENABLE devices, the exiting task is either: + * 2. A remote task between kcov_remote_start() and kcov_remote_stop(). 
+ * In this case we should print a warning right away, since a task + * shouldn't be exiting when it's in a kcov coverage collection + * section. Here t points to the task that is collecting remote + * coverage, and t->kcov->t points to the thread that created the + * kcov device. Which means that to detect this case we need to + * check that t != t->kcov->t, and this gives us the following: + * WARN_ON(kcov->remote && kcov->t != t); + * + * 2. The task that created kcov exiting without calling KCOV_DISABLE, + * and then again we can make sure that t->kcov->t == t: + * WARN_ON(kcov->remote && kcov->t != t); + * + * By combining all three checks into one we get: + */ if (WARN_ON(kcov->t != t)) { spin_unlock(&kcov->lock); return; } /* Just to not leave dangling references behind. */ - kcov_task_init(t); - kcov->t = NULL; - kcov->mode = KCOV_MODE_INIT; + kcov_disable(t, kcov); spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -313,6 +483,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; + kcov->sequence = 1; refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -325,6 +496,20 @@ static int kcov_close(struct inode *inode, struct file *filep) return 0; } +static int kcov_get_mode(unsigned long arg) +{ + if (arg == KCOV_TRACE_PC) + return KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + return KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; +} + /* * Fault in a lazily-faulted vmalloc area before it can be used by * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the @@ -340,14 +525,35 @@ static void kcov_fault_in_area(struct kcov *kcov) READ_ONCE(area[offset]); } +static inline bool kcov_check_handle(u64 handle, bool common_valid, + bool uncommon_valid, bool zero_valid) +{ + if (handle & ~(KCOV_SUBSYSTEM_MASK | KCOV_INSTANCE_MASK)) + return false; + switch (handle & KCOV_SUBSYSTEM_MASK) { + case KCOV_SUBSYSTEM_COMMON: + return (handle & KCOV_INSTANCE_MASK) ? + common_valid : zero_valid; + case KCOV_SUBSYSTEM_USB: + return uncommon_valid; + default: + return false; + } + return false; +} + static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; unsigned long size, unused; + int mode, i; + struct kcov_remote_arg *remote_arg; + struct kcov_remote *remote; switch (cmd) { case KCOV_INIT_TRACE: + kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -366,6 +572,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: + kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point user must have been enabled trace mode, @@ -378,29 +585,20 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, t = current; if (kcov->t != NULL || t->kcov != NULL) return -EBUSY; - if (arg == KCOV_TRACE_PC) - kcov->mode = KCOV_MODE_TRACE_PC; - else if (arg == KCOV_TRACE_CMP) -#ifdef CONFIG_KCOV_ENABLE_COMPARISONS - kcov->mode = KCOV_MODE_TRACE_CMP; -#else - return -ENOTSUPP; -#endif - else - return -EINVAL; + mode = kcov_get_mode(arg); + if (mode < 0) + return mode; kcov_fault_in_area(kcov); - /* Cache in task struct for performance. */ - t->kcov_size = kcov->size; - t->kcov_area = kcov->area; - /* See comment in check_kcov_mode(). 
*/ - barrier(); - WRITE_ONCE(t->kcov_mode, kcov->mode); + kcov->mode = mode; + kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov->sequence); t->kcov = kcov; kcov->t = t; - /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; case KCOV_DISABLE: + kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. */ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -408,11 +606,65 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, t = current; if (WARN_ON(kcov->t != t)) return -EINVAL; - kcov_task_init(t); - kcov->t = NULL; - kcov->mode = KCOV_MODE_INIT; + kcov_disable(t, kcov); kcov_put(kcov); return 0; + case KCOV_REMOTE_ENABLE: + kcov_debug("KCOV_REMOTE_ENABLE\n"); + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) + return -EINVAL; + t = current; + if (kcov->t != NULL || t->kcov != NULL) + return -EBUSY; + remote_arg = (struct kcov_remote_arg *)arg; + mode = kcov_get_mode(remote_arg->trace_mode); + if (mode < 0) + return mode; + if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->mode = mode; + t->kcov = kcov; + kcov->t = t; + kcov->remote = true; + kcov->remote_size = remote_arg->area_size; + spin_lock(&kcov_remote_lock); + for (i = 0; i < remote_arg->num_handles; i++) { + kcov_debug("handle %llx\n", remote_arg->handles[i]); + if (!kcov_check_handle(remote_arg->handles[i], + false, true, false)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return -EINVAL; + } + remote = kcov_remote_add(kcov, remote_arg->handles[i]); + if (IS_ERR(remote)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return PTR_ERR(remote); + } + } + if (remote_arg->common_handle) { + kcov_debug("common handle %llx\n", + remote_arg->common_handle); + if (!kcov_check_handle(remote_arg->common_handle, + true, false, false)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return -EINVAL; + } + remote = kcov_remote_add(kcov, + remote_arg->common_handle); + if (IS_ERR(remote)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return PTR_ERR(remote); + } + t->kcov_handle = remote_arg->common_handle; + } + spin_unlock(&kcov_remote_lock); + /* Put either in kcov_task_exit() or in KCOV_DISABLE. 
*/ + kcov_get(kcov); + return 0; default: return -ENOTTY; } @@ -422,11 +674,35 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct kcov *kcov; int res; + struct kcov_remote_arg *remote_arg = NULL; + unsigned int remote_num_handles; + unsigned long remote_arg_size; + + if (cmd == KCOV_REMOTE_ENABLE) { + if (get_user(remote_num_handles, (unsigned __user *)(arg + + offsetof(struct kcov_remote_arg, num_handles)))) + return -EFAULT; + if (remote_num_handles > KCOV_REMOTE_MAX_HANDLES) + return -EINVAL; + remote_arg_size = struct_size(remote_arg, handles, + remote_num_handles); + remote_arg = memdup_user((void __user *)arg, remote_arg_size); + if (IS_ERR(remote_arg)) + return PTR_ERR(remote_arg); + if (remote_arg->num_handles != remote_num_handles) { + kfree(remote_arg); + return -EINVAL; + } + arg = (unsigned long)remote_arg; + } kcov = filep->private_data; spin_lock(&kcov->lock); res = kcov_ioctl_locked(kcov, cmd, arg); spin_unlock(&kcov->lock); + + kfree(remote_arg); + return res; } @@ -438,6 +714,207 @@ static const struct file_operations kcov_fops = { .release = kcov_close, }; +/* + * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section + * of code in a kernel background thread to allow kcov to be used to collect + * coverage from that part of code. + * + * The handle argument of kcov_remote_start() identifies a code section that is + * used for coverage collection. A userspace process passes this handle to + * KCOV_REMOTE_ENABLE ioctl to make the used kcov device start collecting + * coverage for the code section identified by this handle. + * + * The usage of these annotations in the kernel code is different depending on + * the type of the kernel thread whose code is being annotated. + * + * For global kernel threads that are spawned in a limited number of instances + * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each + * instance must be assigned a unique 4-byte instance id. The instance id is + * then combined with a 1-byte subsystem id to get a handle via + * kcov_remote_handle(subsystem_id, instance_id). + * + * For local kernel threads that are spawned from system calls handler when a + * user interacts with some kernel interface (e.g. vhost workers), a handle is + * passed from a userspace process as the common_handle field of the + * kcov_remote_arg struct (note, that the user must generate a handle by using + * kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an + * arbitrary 4-byte non-zero number as the instance id). This common handle + * then gets saved into the task_struct of the process that issued the + * KCOV_REMOTE_ENABLE ioctl. When this proccess issues system calls that spawn + * kernel threads, the common handle must be retrived via kcov_common_handle() + * and passed to the spawned threads via custom annotations. Those kernel + * threads must in turn be annotated with kcov_remote_start(common_handle) and + * kcov_remote_stop(). All of the threads that are spawned by the same process + * obtain the same handle, hence the name "common". + * + * See Documentation/dev-tools/kcov.rst for more details. 
+ * + * Internally, this function looks up the kcov device associated with the + * provided handle, allocates an area for coverage collection, and saves the + * pointers to kcov and area into the current task_struct to allow coverage to + * be collected via __sanitizer_cov_trace_pc() + * In turns kcov_remote_stop() clears those pointers from task_struct to stop + * collecting coverage and copies all collected coverage into the kcov area. + */ +void kcov_remote_start(u64 handle) +{ + struct kcov_remote *remote; + void *area; + struct task_struct *t; + unsigned int size; + enum kcov_mode mode; + int sequence; + + if (WARN_ON(!kcov_check_handle(handle, true, true, true))) + return; + if (WARN_ON(!in_task())) + return; + t = current; + /* + * Check that kcov_remote_start is not called twice + * nor called by user tasks (with enabled kcov). + */ + if (WARN_ON(t->kcov)) + return; + + kcov_debug("handle = %llx\n", handle); + + spin_lock(&kcov_remote_lock); + remote = kcov_remote_find(handle); + if (!remote) { + kcov_debug("no remote found"); + spin_unlock(&kcov_remote_lock); + return; + } + /* Put in kcov_remote_stop(). */ + kcov_get(remote->kcov); + t->kcov = remote->kcov; + /* + * Read kcov fields before unlock to prevent races with + * KCOV_DISABLE / kcov_remote_reset(). + */ + size = remote->kcov->remote_size; + mode = remote->kcov->mode; + sequence = remote->kcov->sequence; + area = kcov_remote_area_get(size); + spin_unlock(&kcov_remote_lock); + + if (!area) { + area = vmalloc(size * sizeof(unsigned long)); + if (!area) { + t->kcov = NULL; + kcov_put(remote->kcov); + return; + } + } + /* Reset coverage size. */ + *(u64 *)area = 0; + + kcov_debug("area = %px, size = %u", area, size); + + kcov_start(t, size, area, mode, sequence); + +} +EXPORT_SYMBOL(kcov_remote_start); + +static void kcov_move_area(enum kcov_mode mode, void *dst_area, + unsigned int dst_area_size, void *src_area) +{ + u64 word_size = sizeof(unsigned long); + u64 count_size, entry_size_log; + u64 dst_len, src_len; + void *dst_entries, *src_entries; + u64 dst_occupied, dst_free, bytes_to_move, entries_moved; + + kcov_debug("%px %u <= %px %lu\n", + dst_area, dst_area_size, src_area, *(unsigned long *)src_area); + + switch (mode) { + case KCOV_MODE_TRACE_PC: + dst_len = READ_ONCE(*(unsigned long *)dst_area); + src_len = *(unsigned long *)src_area; + count_size = sizeof(unsigned long); + entry_size_log = __ilog2_u64(sizeof(unsigned long)); + break; + case KCOV_MODE_TRACE_CMP: + dst_len = READ_ONCE(*(u64 *)dst_area); + src_len = *(u64 *)src_area; + count_size = sizeof(u64); + BUILD_BUG_ON(!is_power_of_2(KCOV_WORDS_PER_CMP)); + entry_size_log = __ilog2_u64(sizeof(u64) * KCOV_WORDS_PER_CMP); + break; + default: + WARN_ON(1); + return; + } + + /* As arm can't divide u64 integers use log of entry size. 
*/ + if (dst_len > ((dst_area_size * word_size - count_size) >> + entry_size_log)) + return; + dst_occupied = count_size + (dst_len << entry_size_log); + dst_free = dst_area_size * word_size - dst_occupied; + bytes_to_move = min(dst_free, src_len << entry_size_log); + dst_entries = dst_area + dst_occupied; + src_entries = src_area + count_size; + memcpy(dst_entries, src_entries, bytes_to_move); + entries_moved = bytes_to_move >> entry_size_log; + + switch (mode) { + case KCOV_MODE_TRACE_PC: + WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); + break; + case KCOV_MODE_TRACE_CMP: + WRITE_ONCE(*(u64 *)dst_area, dst_len + entries_moved); + break; + default: + break; + } +} + +/* See the comment before kcov_remote_start() for usage details. */ +void kcov_remote_stop(void) +{ + struct task_struct *t = current; + struct kcov *kcov = t->kcov; + void *area = t->kcov_area; + unsigned int size = t->kcov_size; + int sequence = t->kcov_sequence; + + if (!kcov) { + kcov_debug("no kcov found\n"); + return; + } + + kcov_stop(t); + t->kcov = NULL; + + spin_lock(&kcov->lock); + /* + * KCOV_DISABLE could have been called between kcov_remote_start() + * and kcov_remote_stop(), hence the check. + */ + kcov_debug("move if: %d == %d && %d\n", + sequence, kcov->sequence, (int)kcov->remote); + if (sequence == kcov->sequence && kcov->remote) + kcov_move_area(kcov->mode, kcov->area, kcov->size, area); + spin_unlock(&kcov->lock); + + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + + kcov_put(kcov); +} +EXPORT_SYMBOL(kcov_remote_stop); + +/* See the comment before kcov_remote_start() for usage details. */ +u64 kcov_common_handle(void) +{ + return current->kcov_handle; +} +EXPORT_SYMBOL(kcov_common_handle); + static int __init kcov_init(void) { /* -- cgit
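To round this out, a userspace sketch of driving the new interface (assumes <linux/kcov.h> from a kernel that carries this patch; kcov_remote_handle() is the uapi helper the commit message describes, and the buffer sizes and instance ids are arbitrary example values):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kcov.h>

    #define COVER_SIZE  (64 << 10)   /* words mmapped for this kcov fd */
    #define REMOTE_SIZE (64 << 10)   /* words per remote area (kcov->remote_size) */

    int main(void)
    {
        int fd = open("/sys/kernel/debug/kcov", O_RDWR);
        if (fd == -1) { perror("open"); return 1; }
        if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) { perror("init"); return 1; }

        unsigned long *cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
                                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (cover == MAP_FAILED) { perror("mmap"); return 1; }

        /* One global handle (USB, instance 1) plus a common handle that kernel
         * threads spawned on behalf of this process will inherit. */
        struct kcov_remote_arg *arg;
        arg = calloc(1, sizeof(*arg) + sizeof(arg->handles[0]));
        if (!arg) return 1;
        arg->trace_mode = KCOV_TRACE_PC;
        arg->area_size = REMOTE_SIZE;
        arg->num_handles = 1;
        arg->handles[0] = kcov_remote_handle(KCOV_SUBSYSTEM_USB, 0x1);
        arg->common_handle = kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, 0x42);
        if (ioctl(fd, KCOV_REMOTE_ENABLE, arg)) { perror("remote enable"); return 1; }

        /* ... trigger USB hub events / spawn vhost workers here ... */
        sleep(2);

        /* Coverage from annotated sections is copied in by kcov_remote_stop();
         * word 0 holds the number of collected PCs. */
        unsigned long n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
        for (unsigned long i = 0; i < n; i++)
            printf("0x%lx\n", cover[i + 1]);

        if (ioctl(fd, KCOV_DISABLE, 0)) perror("disable");
        return 0;
    }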