From 4e264ffd953463cd14c0720eaa9315ac052f5973 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 16 Jun 2020 19:14:08 +0900 Subject: proc/bootconfig: Fix to use correct quotes for value Fix /proc/bootconfig to select double or single quotes corrctly according to the value. If a bootconfig value includes a double quote character, we must use single-quotes to quote that value. This modifies if() condition and blocks for avoiding double-quote in value check in 2 places. Anyway, since xbc_array_for_each_value() can handle the array which has a single node correctly. Thus, if (vnode && xbc_node_is_array(vnode)) { xbc_array_for_each_value(vnode) /* vnode->next != NULL */ ... } else { snprintf(val); /* val is an empty string if !vnode */ } is equivalent to if (vnode) { xbc_array_for_each_value(vnode) /* vnode->next can be NULL */ ... } else { snprintf(""); /* value is always empty */ } Link: http://lkml.kernel.org/r/159230244786.65555.3763894451251622488.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: c1a3c36017d4 ("proc: bootconfig: Add /proc/bootconfig to show boot config list") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- fs/proc/bootconfig.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 9955d75c0585..ad31ec4ad627 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v) static int __init copy_xbc_key_value_list(char *dst, size_t size) { struct xbc_node *leaf, *vnode; - const char *val; char *key, *end = dst + size; + const char *val; + char q; int ret = 0; key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); @@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; vnode = xbc_node_get_child(leaf); - if (vnode && xbc_node_is_array(vnode)) { + if (vnode) { xbc_array_for_each_value(vnode, val) { - ret = snprintf(dst, rest(dst, end), "\"%s\"%s", - val, vnode->next ? ", " : "\n"); + if (strchr(val, '"')) + q = '\''; + else + q = '"'; + ret = snprintf(dst, rest(dst, end), "%c%s%c%s", + q, val, q, vnode->next ? ", " : "\n"); if (ret < 0) goto out; dst += ret; } } else { - ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val); + ret = snprintf(dst, rest(dst, end), "\"\"\n"); if (ret < 0) break; dst += ret; -- cgit From fe557319aa06c23cffc9346000f119547e0f289a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2020 09:37:53 +0200 Subject: maccess: rename probe_kernel_{read,write} to copy_{from,to}_kernel_nofault Better describe what these functions do. Suggested-by: Linus Torvalds Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8ba492d44e68..e502414b3556 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - if (probe_kernel_read(buf, (void *) start, tsz)) { + if (copy_from_kernel_nofault(buf, (void *)start, + tsz)) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; -- cgit From 3f1266f1f82d7b8c72472a8921e80aa3e611fb62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 Jun 2020 09:16:41 +0200 Subject: block: move block-related definitions out of fs.h Move most of the block related definition out of fs.h into more suitable headers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/proc/devices.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc') diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 37d38697eaf8..837971e74109 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -3,6 +3,7 @@ #include #include #include +#include static int devinfo_show(struct seq_file *f, void *v) { -- cgit From d4d80e69927ab5da67026c1c94e23c305dbc799e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 3 Jul 2020 14:10:14 -0400 Subject: Call sysctl_head_finish on error This error path returned directly instead of calling sysctl_head_finish(). Fixes: ef9d965bc8b6 ("sysctl: reject gigantic reads/write to sysctl files") Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Al Viro --- fs/proc/proc_sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 42c5128c7d1c..6c1166ccdaea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -566,8 +566,9 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, goto out; /* don't even try if the size is too large */ - if (count > KMALLOC_MAX_SIZE) - return -ENOMEM; + error = -ENOMEM; + if (count >= KMALLOC_MAX_SIZE) + goto out; if (write) { kbuf = memdup_user_nul(ubuf, count); @@ -576,7 +577,6 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, goto out; } } else { - error = -ENOMEM; kbuf = kzalloc(count, GFP_KERNEL); if (!kbuf) goto out; -- cgit From c818c03b661cd769e035e41673d5543ba2ebda64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 May 2020 14:11:26 -0700 Subject: seccomp: Report number of loaded filters in /proc/$pid/status A common question asked when debugging seccomp filters is "how many filters are attached to your process?" Provide a way to easily answer this question through /proc/$pid/status with a "Seccomp_filters" line. Signed-off-by: Kees Cook --- fs/proc/array.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ecbeb3a721..65ec2029fa80 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -341,6 +341,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); + seq_put_decimal_ull(m, "\nSeccomp_filters:\t", + atomic_read(&p->seccomp.filter_count)); #endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { -- cgit From 12886f8ab10ce6a09af1d92535d49c81aaa215a8 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:14 +0200 Subject: proc: allow access in init userns for map_files with CAP_CHECKPOINT_RESTORE Opening files in /proc/pid/map_files when the current user is CAP_CHECKPOINT_RESTORE capable in the root namespace is useful for checkpointing and restoring to recover files that are unreachable via the file system such as deleted files, or memfd files. Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Cyrill Gorcunov Reviewed-by: Serge Hallyn Link: https://lore.kernel.org/r/20200719100418.2112740-5-areber@redhat.com Signed-off-by: Christian Brauner --- fs/proc/base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d86c0afc8a85..a333caeca291 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2189,16 +2189,16 @@ struct map_files_info { }; /* - * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the - * symlinks may be used to bypass permissions on ancestor directories in the - * path to the file in question. + * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due + * to concerns about how the symlinks may be used to bypass permissions on + * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - if (!capable(CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); -- cgit From f9c792729581bd8b8473af163e8ab426c2c61d89 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:10 -0700 Subject: bpf: Refactor to provide aux info to bpf_iter_init_seq_priv_t This patch refactored target bpf_iter_init_seq_priv_t callback function to accept additional information. This will be needed in later patches for map element targets since a particular map should be passed to traverse elements for that particular map. In the future, other information may be passed to target as well, e.g., pid, cgroup id, etc. to customize the iterator. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184110.590156-1-yhs@fb.com --- fs/proc/proc_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index dba63b2429f0..ed8a6306990c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,7 +98,7 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; -int bpf_iter_init_seq_net(void *priv_data) +int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; -- cgit From d42f3245c7e299e017213fa028c319316bcdb7f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Aug 2020 23:20:39 -0700 Subject: mm: memcg: convert vmstat slab counters to bytes In order to prepare for per-object slab memory accounting, convert NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE vmstat items to bytes. To make it obvious, rename them to NR_SLAB_RECLAIMABLE_B and NR_SLAB_UNRECLAIMABLE_B (similar to NR_KERNEL_STACK_KB). Internally global and per-node counters are stored in pages, however memcg and lruvec counters are stored in bytes. This scheme may look weird, but only for now. As soon as slab pages will be shared between multiple cgroups, global and node counters will reflect the total number of slab pages. However memcg and lruvec counters will be used for per-memcg slab memory tracking, which will take separate kernel objects in the account. Keeping global and node counters in pages helps to avoid additional overhead. The size of slab memory shouldn't exceed 4Gb on 32-bit machines, so it will fit into atomic_long_t we use for vmstats. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200623174037.3951353-4-guro@fb.com Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index e9a6841fc25b..38ea95fd919a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); - sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); - sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); + sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); -- cgit From 991e7673859ed41e7ba83c8c4e57afe8cfebe314 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 6 Aug 2020 23:21:37 -0700 Subject: mm: memcontrol: account kernel stack per node Currently the kernel stack is being accounted per-zone. There is no need to do that. In addition due to being per-zone, memcg has to keep a separate MEMCG_KERNEL_STACK_KB. Make the stat per-node and deprecate MEMCG_KERNEL_STACK_KB as memcg_stat_item is an extension of node_stat_item. In addition localize the kernel stack stats updates to account_kernel_stack(). Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200630161539.1759185-1-shakeelb@google.com Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 38ea95fd919a..2a4c58f70fb9 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", - global_zone_page_state(NR_KERNEL_STACK_KB)); + global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", - global_zone_page_state(NR_KERNEL_SCS_KB)); + global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); -- cgit From 1455083c1d70f69028df0eb92b69fd24db277e4b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 6 Aug 2020 23:23:03 -0700 Subject: proc/meminfo: avoid open coded reading of vm_committed_as Patch series "make vm_committed_as_batch aware of vm overcommit policy", v6. When checking a performance change for will-it-scale scalability mmap test [1], we found very high lock contention for spinlock of percpu counter 'vm_committed_as': 94.14% 0.35% [kernel.kallsyms] [k] _raw_spin_lock_irqsave 48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap; 45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap; Actually this heavy lock contention is not always necessary. The 'vm_committed_as' needs to be very precise when the strict OVERCOMMIT_NEVER policy is set, which requires a rather small batch number for the percpu counter. So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and enlarge it for not-so-strict OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies. Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T desktop, and 2097%(20X) on a 4S/72C/144T server. And for that case, whether it shows improvements depends on if the test mmap size is bigger than the batch number computed. We tested 10+ platforms in 0day (server, desktop and laptop). If we lift it to 64X, 80%+ platforms show improvements, and for 16X lift, 1/3 of the platforms will show improvements. And generally it should help the mmap/unmap usage,as Michal Hocko mentioned: : I believe that there are non-synthetic worklaods which would benefit : from a larger batch. E.g. large in memory databases which do large : mmaps during startups from multiple threads. Note: There are some style complain from checkpatch for patch 4, as sysctl handler declaration follows the similar format of sibling functions [1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/ This patch (of 4): Use the existing vm_memory_committed() instead, which is also convenient for future change. Signed-off-by: Feng Tang Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Mel Gorman Cc: Qian Cai Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Dave Hansen Cc: Huang Ying Cc: Christoph Lameter Cc: Dennis Zhou Cc: Haiyang Zhang Cc: kernel test robot Cc: "K. Y. Srinivasan" Cc: Tejun Heo Link: http://lkml.kernel.org/r/1594389708-60781-1-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1594389708-60781-2-git-send-email-feng.tang@intel.com Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 2a4c58f70fb9..887a5532e449 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -41,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); - committed = percpu_counter_read_positive(&vm_committed_as); + committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; -- cgit From 471e78cc7687337abd10626196f5addb30b01824 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 11 Aug 2020 18:30:57 -0700 Subject: /proc/PID/smaps: consistent whitespace output format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The keys in smaps output are padded to fixed width with spaces. All except for THPeligible that uses tabs (only since commit c06306696f83 ("mm: thp: fix false negative of shmem vma's THP eligibility")). Unify the output formatting to save time debugging some naïve parsers. (Part of the unification is also aligning FilePmdMapped with others.) Signed-off-by: Michal Koutný Signed-off-by: Andrew Morton Acked-by: Yang Shi Cc: Alexey Dobriyan Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200728083207.17531-1-mkoutny@suse.com Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbda4499a859..5066b0251ed8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -786,7 +786,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); - SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); @@ -816,7 +816,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", + seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); if (arch_pkeys_enabled()) -- cgit From 9066e5cfb73cdbcdbb49e87999482ab615e9fc76 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 11 Aug 2020 18:31:22 -0700 Subject: mm, oom: make the calculation of oom badness more accurate Recently we found an issue on our production environment that when memcg oom is triggered the oom killer doesn't chose the process with largest resident memory but chose the first scanned process. Note that all processes in this memcg have the same oom_score_adj, so the oom killer should chose the process with largest resident memory. Bellow is part of the oom info, which is enough to analyze this issue. [7516987.983223] memory: usage 16777216kB, limit 16777216kB, failcnt 52843037 [7516987.983224] memory+swap: usage 16777216kB, limit 9007199254740988kB, failcnt 0 [7516987.983225] kmem: usage 301464kB, limit 9007199254740988kB, failcnt 0 [...] [7516987.983293] [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name [7516987.983510] [ 5740] 0 5740 257 1 32768 0 -998 pause [7516987.983574] [58804] 0 58804 4594 771 81920 0 -998 entry_point.bas [7516987.983577] [58908] 0 58908 7089 689 98304 0 -998 cron [7516987.983580] [58910] 0 58910 16235 5576 163840 0 -998 supervisord [7516987.983590] [59620] 0 59620 18074 1395 188416 0 -998 sshd [7516987.983594] [59622] 0 59622 18680 6679 188416 0 -998 python [7516987.983598] [59624] 0 59624 1859266 5161 548864 0 -998 odin-agent [7516987.983600] [59625] 0 59625 707223 9248 983040 0 -998 filebeat [7516987.983604] [59627] 0 59627 416433 64239 774144 0 -998 odin-log-agent [7516987.983607] [59631] 0 59631 180671 15012 385024 0 -998 python3 [7516987.983612] [61396] 0 61396 791287 3189 352256 0 -998 client [7516987.983615] [61641] 0 61641 1844642 29089 946176 0 -998 client [7516987.983765] [ 9236] 0 9236 2642 467 53248 0 -998 php_scanner [7516987.983911] [42898] 0 42898 15543 838 167936 0 -998 su [7516987.983915] [42900] 1000 42900 3673 867 77824 0 -998 exec_script_vr2 [7516987.983918] [42925] 1000 42925 36475 19033 335872 0 -998 python [7516987.983921] [57146] 1000 57146 3673 848 73728 0 -998 exec_script_J2p [7516987.983925] [57195] 1000 57195 186359 22958 491520 0 -998 python2 [7516987.983928] [58376] 1000 58376 275764 14402 290816 0 -998 rosmaster [7516987.983931] [58395] 1000 58395 155166 4449 245760 0 -998 rosout [7516987.983935] [58406] 1000 58406 18285584 3967322 37101568 0 -998 data_sim [7516987.984221] oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=3aa16c9482ae3a6f6b78bda68a55d32c87c99b985e0f11331cddf05af6c4d753,mems_allowed=0-1,oom_memcg=/kubepods/podf1c273d3-9b36-11ea-b3df-246e9693c184,task_memcg=/kubepods/podf1c273d3-9b36-11ea-b3df-246e9693c184/1f246a3eeea8f70bf91141eeaf1805346a666e225f823906485ea0b6c37dfc3d,task=pause,pid=5740,uid=0 [7516987.984254] Memory cgroup out of memory: Killed process 5740 (pause) total-vm:1028kB, anon-rss:4kB, file-rss:0kB, shmem-rss:0kB [7516988.092344] oom_reaper: reaped process 5740 (pause), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB We can find that the first scanned process 5740 (pause) was killed, but its rss is only one page. That is because, when we calculate the oom badness in oom_badness(), we always ignore the negtive point and convert all of these negtive points to 1. Now as oom_score_adj of all the processes in this targeted memcg have the same value -998, the points of these processes are all negtive value. As a result, the first scanned process will be killed. The oom_socre_adj (-998) in this memcg is set by kubelet, because it is a a Guaranteed pod, which has higher priority to prevent from being killed by system oom. To fix this issue, we should make the calculation of oom point more accurate. We can achieve it by convert the chosen_point from 'unsigned long' to 'long'. [cai@lca.pw: reported a issue in the previous version] [mhocko@suse.com: fixed the issue reported by Cai] [mhocko@suse.com: add the comment in proc_oom_score()] [laoar.shao@gmail.com: v3] Link: http://lkml.kernel.org/r/1594396651-9931-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Signed-off-by: Andrew Morton Tested-by: Naresh Kamboju Acked-by: Michal Hocko Cc: David Rientjes Cc: Qian Cai Link: http://lkml.kernel.org/r/1594309987-9919-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Linus Torvalds --- fs/proc/base.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index a333caeca291..617db4e0faa0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -551,8 +551,17 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; + long badness; + + badness = oom_badness(task, totalpages); + /* + * Special case OOM_SCORE_ADJ_MIN for all others scale the + * badness value into [0, 2000] range which we have been + * exporting for a long time so userspace might depend on it. + */ + if (badness != LONG_MIN) + points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; - points = oom_badness(task, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; -- cgit