From f9c82a4ea89c384d49ce03768ba88d049ed3f1f0 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:08 +0200 Subject: Increase size of ucounts to atomic_long_t RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK use unsigned long to store their counters. As a preparation for moving rlimits based on ucounts, we need to increase the size of the variable to long. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/257aa5fb1a7d81cf0f4c34f39ada2320c4284771.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 11b1596e2542..04c561751af1 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts) kfree(ucounts); } -static inline bool atomic_inc_below(atomic_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { - int c, old; - c = atomic_read(v); + long c, old; + c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; - old = atomic_cmpxchg(v, c, c+1); + old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; @@ -196,17 +196,17 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, struct user_namespace *tns; ucounts = get_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { - int max; + long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); - if (!atomic_inc_below(&iter->ucount[type], max)) + if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) - atomic_dec(&iter->ucount[type]); + atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; @@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - int dec = atomic_dec_if_positive(&iter->ucount[type]); + long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); -- cgit From 905ae01c4ae2ae3df05bb141801b1db4b7d83c61 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:09 +0200 Subject: Add a reference to ucounts for each cred For RLIMIT_NPROC and some other rlimits the user_struct that holds the global limit is kept alive for the lifetime of a process by keeping it in struct cred. Adding a pointer to ucounts in the struct cred will allow to track RLIMIT_NPROC not only for user in the system, but for user in the user_namespace. Updating ucounts may require memory allocation which may fail. So, we cannot change cred.ucounts in the commit_creds() because this function cannot fail and it should always return 0. For this reason, we modify cred.ucounts before calling the commit_creds(). Changelog v6: * Fix null-ptr-deref in is_ucounts_overlimit() detected by trinity. This error was caused by the fact that cred_alloc_blank() left the ucounts pointer empty. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b37aaef28d8b9b0d757e07ba6dd27281bbe39259.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 04c561751af1..50cc1dfb7d28 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,12 @@ #include #include +struct ucounts init_ucounts = { + .ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, + .count = 1, +}; + #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); @@ -125,7 +131,15 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc return NULL; } -static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +static void hlist_add_ucounts(struct ucounts *ucounts) +{ + struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); + hlist_add_head(&ucounts->node, hashent); + spin_unlock_irq(&ucounts_lock); +} + +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; @@ -160,7 +174,26 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) return ucounts; } -static void put_ucounts(struct ucounts *ucounts) +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + if (!ucounts) + return NULL; + + spin_lock_irqsave(&ucounts_lock, flags); + if (ucounts->count == INT_MAX) { + WARN_ONCE(1, "ucounts: counter has reached its maximum value"); + ucounts = NULL; + } else { + ucounts->count += 1; + } + spin_unlock_irqrestore(&ucounts_lock, flags); + + return ucounts; +} + +void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -194,7 +227,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; - ucounts = get_ucounts(ns, uid); + ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; @@ -237,6 +270,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif + hlist_add_ucounts(&init_ucounts); return 0; } subsys_initcall(user_namespace_sysctl_init); -- cgit From b6c336528926ef73b0f70260f2636de2c3b94c14 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:10 +0200 Subject: Use atomic_t for ucounts reference counting The current implementation of the ucounts reference counter requires the use of spin_lock. We're going to use get_ucounts() in more performance critical areas like a handling of RLIMIT_SIGPENDING. Now we need to use spin_lock only if we want to change the hashtable. v10: * Always try to put ucounts in case we cannot increase ucounts->count. This will allow to cover the case when all consumers will return ucounts at once. v9: * Use a negative value to check that the ucounts->count is close to overflow. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/94d1dbecab060a6b116b0a2d1accd8ca1bbb4f5f.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 53 +++++++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 50cc1dfb7d28..365865f368ec 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,7 +11,7 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = 1, + .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 @@ -139,6 +139,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts) spin_unlock_irq(&ucounts_lock); } +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + if (ucounts && atomic_add_negative(1, &ucounts->count)) { + put_ucounts(ucounts); + ucounts = NULL; + } + return ucounts; +} + struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); @@ -155,7 +164,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - new->count = 0; + atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -163,33 +172,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); - ucounts = new; + spin_unlock_irq(&ucounts_lock); + return new; } } - if (ucounts->count == INT_MAX) - ucounts = NULL; - else - ucounts->count += 1; spin_unlock_irq(&ucounts_lock); - return ucounts; -} - -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - unsigned long flags; - - if (!ucounts) - return NULL; - - spin_lock_irqsave(&ucounts_lock, flags); - if (ucounts->count == INT_MAX) { - WARN_ONCE(1, "ucounts: counter has reached its maximum value"); - ucounts = NULL; - } else { - ucounts->count += 1; - } - spin_unlock_irqrestore(&ucounts_lock, flags); - + ucounts = get_ucounts(ucounts); return ucounts; } @@ -197,15 +185,12 @@ void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - spin_lock_irqsave(&ucounts_lock, flags); - ucounts->count -= 1; - if (!ucounts->count) + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); - else - ucounts = NULL; - spin_unlock_irqrestore(&ucounts_lock, flags); - - kfree(ucounts); + spin_unlock_irqrestore(&ucounts_lock, flags); + kfree(ucounts); + } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) -- cgit From 21d1c5e386bc751f1953b371d72cd5b7d9c9e270 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:11 +0200 Subject: Reimplement RLIMIT_NPROC on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never fork the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2 it fails since user X already has one running process. We cannot use existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be overlimited by root or if the user has the appropriate capability. Changelog v11: * Change inc_rlimit_ucounts() which now returns top value of ucounts. * Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 365865f368ec..6caa56f7dec8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(v, &iter->ucount[type]); + if (new < 0 || new > max) + ret = LONG_MAX; + else if (iter == ucounts) + ret = new; + } + return ret; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long new; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long dec = atomic_long_add_return(-v, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + if (iter == ucounts) + new = dec; + } + return (new == 0); +} + +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +{ + struct ucounts *iter; + if (get_ucounts_value(ucounts, type) > max) + return true; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = READ_ONCE(iter->ns->ucount_max[type]); + if (get_ucounts_value(iter, type) > max) + return true; + } + return false; +} + static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL @@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); + inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init); -- cgit From 6e52a9f0532f912af37bab4caf18b57d1b9845f4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:12 +0200 Subject: Reimplement RLIMIT_MSGQUEUE on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/2531f42f7884bbfee56a978040b3e0d25cdf6cde.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 6caa56f7dec8..6e6f936a5963 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { } }; -- cgit From d64696905554e919321e31afc210606653b8f6a4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:13 +0200 Subject: Reimplement RLIMIT_SIGPENDING on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Revert most of changes to fix performance issues. v10: * Fix memory leak on get_ucounts failure. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/df9d7764dddd50f28616b7840de74ec0f81711a8.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 6e6f936a5963..8ce62da6a62c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { }, { } -- cgit From d7c9e99aee48e6bc0b427f3e3c658a6aba15001e Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:14 +0200 Subject: Reimplement RLIMIT_MEMLOCK on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Fix issue found by lkp robot. v8: * Fix issues found by lkp-tests project. v7: * Keep only ucounts for RLIMIT_MEMLOCK checks instead of struct cred. v6: * Fix bug in hugetlb_file_setup() detected by trinity. Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/970d50c70c71bfd4496e0e8d2a0a32feebebb350.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/ucount.c') diff --git a/kernel/ucount.c b/kernel/ucount.c index 8ce62da6a62c..d316bac3e520 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -83,6 +83,7 @@ static struct ctl_table user_table[] = { { }, { }, { }, + { }, { } }; #endif /* CONFIG_SYSCTL */ -- cgit