From 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:56 -0700 Subject: psi: introduce state_mask to represent stalled psi states

Patch series "psi: pressure stall monitors", v6. This is a respin of: https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/

Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high-enough frequency right now.

This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring.

With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in the userspace low memory killer daemon (lmkd), we can detect mounting pressure and kill less important processes before the device becomes visibly sluggish. In our memory stress testing, psi memory monitors produce roughly 10x fewer false positives than vmpressure signals. The ability to specify multiple triggers for the same psi metric allows other parts of the Android framework to monitor the memory state of the device and act accordingly.

The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state (some or full) and the maximum stall time over a given window of time. E.g.:

    /* Signal when stall time exceeds 100ms of a 1s window */
    char trigger[] = "full 100000 1000000";
    fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
    write(fd, trigger, sizeof(trigger));
    while (poll() >= 0) { ... }
    close(fd);

When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal.

The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded.

Patches 1-6 prepare the psi code for polling support. Patch 7 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files.

The patches were developed in collaboration with Johannes Weiner.

This patch (of 7):

The psi monitoring patches will need to determine the same states as record_times(). To avoid calculating them twice, maintain a state mask that can be consulted cheaply. Do this in a separate patch to keep the churn in the main feature patch at a minimum.

This adds a 4-byte state_mask member into the psi_group_cpu struct, which results in its first cacheline-aligned part becoming 52 bytes long. Add explicit values to the enumeration element counters that affect the psi_group_cpu struct size.
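For reference, the schematic snippet above can be fleshed out into a self-contained userspace monitor. This is a sketch based on the interface described here: the O_RDWR | O_NONBLOCK open mode and POLLPRI event delivery follow the kernel's psi documentation, while the names and error handling are illustrative assumptions:

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* Signal when full stall time exceeds 100ms of a 1s window */
        const char trig[] = "full 100000 1000000";
        struct pollfd fds;

        fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
        if (fds.fd < 0) {
            perror("open /proc/pressure/memory");
            return 1;
        }
        /* Register the trigger: write the description, keep the fd open */
        if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
            perror("write trigger");
            return 1;
        }
        fds.events = POLLPRI;

        while (poll(&fds, 1, -1) >= 0) {
            if (fds.revents & POLLERR) {
                fprintf(stderr, "event source is gone\n");
                break;
            }
            if (fds.revents & POLLPRI)
                printf("memory pressure event\n");
        }
        close(fds.fd);  /* closing the fd discards the trigger */
        return 0;
    }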
Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi_types.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 2cf422db5d18..762c6bb16f3c 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -11,7 +11,7 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - NR_PSI_TASK_COUNTS, + NR_PSI_TASK_COUNTS = 3, }; /* Task state bitmasks */ @@ -24,7 +24,7 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES, + NR_PSI_RESOURCES = 3, }; /* @@ -41,7 +41,7 @@ enum psi_states { PSI_CPU_SOME, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES, + NR_PSI_STATES = 6, }; struct psi_group_cpu { @@ -53,6 +53,9 @@ struct psi_group_cpu { /* States of the tasks belonging to this group */ unsigned int tasks[NR_PSI_TASK_COUNTS]; + /* Aggregate pressure state derived from the tasks */ + u32 state_mask; + /* Period time sampling buckets for each state of interest (ns) */ u32 times[NR_PSI_STATES]; -- cgit

From bcc78db64168eb6dede056fed2999f75f7ace309 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:02 -0700 Subject: psi: rename psi fields in preparation for psi trigger addition

Rename the psi_group member fields used for calculating psi totals and averages, so that they are clearly distinguished from the trigger-related fields that will be added by "psi: introduce psi monitor".

[surenb@google.com: v6] Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi_types.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 762c6bb16f3c..4d1c1f67be18 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -69,17 +69,17 @@ struct psi_group_cpu { }; struct psi_group { - /* Protects data updated during an aggregation */ - struct mutex stat_lock; + /* Protects data used by the aggregator */ + struct mutex avgs_lock; /* Per-cpu task state & time tracking */ struct psi_group_cpu __percpu *pcpu; - /* Periodic aggregation state */ - u64 total_prev[NR_PSI_STATES - 1]; - u64 last_update; - u64 next_update; - struct delayed_work clock_work; + /* Running pressure averages */ + u64 avg_total[NR_PSI_STATES - 1]; + u64 avg_last_update; + u64 avg_next_update; + struct delayed_work avgs_work; /* Total stall times and sampled pressure averages */ u64 total[NR_PSI_STATES - 1]; -- cgit

From 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:12 -0700 Subject: include/: refactor headers to allow kthread.h inclusion in psi_types.h

kthread.h can't be included in psi_types.h because that would create a circular inclusion: kthread.h would eventually include psi_types.h, and the compiler would then complain about kthread structures not being defined, because they are defined later in kthread.h.
Resolve this by removing the psi_types.h inclusion from the headers included from kthread.h.

Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 3 ++- include/linux/sched.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2c89e60bc752..0f9da966934e 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,7 +4,6 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include #include -#include __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,6 +197,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +struct cgroup_subsys_state; + #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index a2cd15855bad..11837410690f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include -- cgit

From 0e94682b73bfa6c44c98af7a26771c9c08c055d5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:15 -0700 Subject: psi: introduce psi monitor

The psi monitor aims to provide a low-latency, short-term pressure detection mechanism configurable by users. It allows users to monitor psi metric growth and trigger events whenever a metric rises above a user-defined threshold within a user-defined time window. Time window and threshold are both expressed in usecs. Multiple psi resources with different thresholds and window sizes can be monitored concurrently.

Psi monitors activate when the system enters a stall state for the monitored psi metric and deactivate upon exit from the stall state. While the system is in the stall state, psi signal growth is monitored at a rate of 10 times per tracking window. The minimum window size is 500ms, therefore the minimum monitoring interval is 50ms. The maximum window size is 10s, with a monitoring interval of 1s.

When activated, a psi monitor stays active for at least the duration of one tracking window to avoid repeated activations/deactivations when the psi signal is bouncing. Notifications to the users are rate-limited to one per tracking window.
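The window/sampling relationship described above can be summarized in a small kernel-style sketch (the macro and function names here are illustrative assumptions, not necessarily those used in kernel/sched/psi.c):

    /* 10 samples per window: a 500ms window gives a 50ms sampling period,
     * a 10s window gives a 1s period. */
    #define WINDOW_MIN_NS       (500ULL * 1000 * 1000)          /* 500ms */
    #define WINDOW_MAX_NS       (10ULL * 1000 * 1000 * 1000)    /* 10s */
    #define UPDATES_PER_WINDOW  10

    static u64 trigger_sample_period(u64 win_size_ns)
    {
        /* win_size_ns is validated against [WINDOW_MIN_NS, WINDOW_MAX_NS]
         * when the trigger is created. */
        return win_size_ns / UPDATES_PER_WINDOW;
    }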
Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi.h | 8 +++++ include/linux/psi_types.h | 82 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/psi.h b/include/linux/psi.h index 7006008d5b72..af892c290116 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,7 @@ #include #include #include +#include struct seq_file; struct css_set; @@ -26,6 +27,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); + +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res); +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); + +__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, + poll_table *wait); #endif #else /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 4d1c1f67be18..07aaf9b82241 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -1,8 +1,11 @@ #ifndef _LINUX_PSI_TYPES_H #define _LINUX_PSI_TYPES_H +#include #include #include +#include +#include #ifdef CONFIG_PSI @@ -44,6 +47,12 @@ enum psi_states { NR_PSI_STATES = 6, }; +enum psi_aggregators { + PSI_AVGS = 0, + PSI_POLL, + NR_PSI_AGGREGATORS, +}; + struct psi_group_cpu { /* 1st cacheline updated by the scheduler */ @@ -65,7 +74,55 @@ struct psi_group_cpu { /* 2nd cacheline updated by the aggregator */ /* Delta detection against the sampling buckets */ - u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; + u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] + ____cacheline_aligned_in_smp; }; +/* PSI growth tracking window */ +struct psi_window { + /* Window size in ns */ + u64 size; + + /* Start time of the current window in ns */ + u64 start_time; + + /* Value at the start of the window */ + u64 start_value; + + /* Value growth in the previous window */ + u64 prev_growth; +}; + +struct psi_trigger { + /* PSI state being monitored by the trigger */ + enum psi_states state; + + /* User-specified threshold in ns */ + u64 threshold; + + /* List node inside triggers list */ + struct list_head node; + + /* Backpointer needed during trigger destruction */ + struct psi_group *group; + + /* Wait queue for polling */ + wait_queue_head_t event_wait; + + /* Pending event flag */ + int event; + + /* Tracking window */ + struct psi_window win; + + /* + * Time last event was generated.
Used for rate-limiting + * events to one per window + */ + u64 last_event_time; + + /* Refcounting to prevent premature destruction */ + struct kref refcount; }; struct psi_group { @@ -79,11 +136,32 @@ struct psi_group { u64 avg_total[NR_PSI_STATES - 1]; u64 avg_last_update; u64 avg_next_update; + + /* Aggregator work control */ struct delayed_work avgs_work; /* Total stall times and sampled pressure averages */ - u64 total[NR_PSI_STATES - 1]; + u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; unsigned long avg[NR_PSI_STATES - 1][3]; + + /* Monitor work control */ + atomic_t poll_scheduled; + struct kthread_worker __rcu *poll_kworker; + struct kthread_delayed_work poll_work; + + /* Protects data used by the monitor */ + struct mutex trigger_lock; + + /* Configured polling triggers */ + struct list_head triggers; + u32 nr_triggers[NR_PSI_STATES - 1]; + u32 poll_states; + u64 poll_min_period; + + /* Total stall times at the start of monitor activation */ + u64 polling_total[NR_PSI_STATES - 1]; + u64 polling_next_update; + u64 polling_until; }; #else /* CONFIG_PSI */ -- cgit From df5ba5be7425e1df296d40c5f37a39d98ec666a2 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Tue, 14 May 2019 15:41:18 -0700 Subject: kernel/sched/psi.c: expose pressure metrics on root cgroup Pressure metrics are already recorded and exposed in procfs for the entire system, but any tool which monitors cgroup pressure has to special case the root cgroup to read from procfs. This patch exposes the already recorded pressure metrics on the root cgroup. Link: http://lkml.kernel.org/r/20190510174938.3361741-1-dschatzberg@fb.com Signed-off-by: Dan Schatzberg Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/psi.h b/include/linux/psi.h index af892c290116..7b3de7321219 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -12,6 +12,7 @@ struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; +extern struct psi_group psi_system; void psi_init(void); -- cgit From e900a918b0984ec8f2eb150b8477a47b75d17692 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 May 2019 15:41:28 -0700 Subject: mm: shuffle initial free memory to improve memory-side-cache utilization Patch series "mm: Randomize free memory", v10. This patch (of 3): Randomization of the page allocator improves the average utilization of a direct-mapped memory-side-cache. Memory side caching is a platform capability that Linux has been previously exposed to in HPC (high-performance computing) environments on specialty platforms. In that instance it was a smaller pool of high-bandwidth-memory relative to higher-capacity / lower-bandwidth DRAM. Now, this capability is going to be found on general purpose server platforms where DRAM is a cache in front of higher latency persistent memory [1]. 
Robert offered an explanation of the state of the art of Linux interactions with memory-side-caches [2], and I copy it here:

It's been a problem in the HPC space: http://www.nersc.gov/research-and-development/knl-cache-mode-performance-coe/ A kernel module called zonesort is available to try to help: https://software.intel.com/en-us/articles/xeon-phi-software and this abandoned patch series proposed that for the kernel: https://lkml.kernel.org/r/20170823100205.17311-1-lukasz.daniluk@intel.com

Dan's patch series doesn't attempt to ensure buffers won't conflict, but it does reduce the chance that they will. This will make performance more consistent, albeit slower than "optimal" (which is near impossible to attain in a general-purpose kernel). That's better than forcing users to deploy remedies like: "To eliminate this gradual degradation, we have added a Stream measurement to the Node Health Check that follows each job; nodes are rebooted whenever their measured memory bandwidth falls below 300 GB/s."

A replacement for zonesort was merged upstream in commit cc9aec03e58f ("x86/numa_emulation: Introduce uniform split capability"). With this numa_emulation capability, memory can be split into cache sized ("near-memory" sized) numa nodes. A bind operation to such a node, and disabling workloads on other nodes, enables full cache performance. However, once the workload exceeds the cache size, cache conflicts are unavoidable. While HPC environments might be able to tolerate time-scheduling of cache sized workloads, for general purpose server platforms, the oversubscribed cache case will be the common case.

The worst case scenario is that a server system owner benchmarks a workload at boot with an un-contended cache only to see that performance degrade over time, even below the average cache performance, due to excessive conflicts. Randomization clips the peaks and fills in the valleys of cache utilization to yield steady average performance.

Here are some performance impact details of the patches:

1/ An Intel internal synthetic memory bandwidth measurement tool saw a 3X speedup in a contrived case that tries to force cache conflicts. The contrived case used the numa_emulation capability to force an instance of the benchmark to be run in two of the near-memory sized numa nodes. If both instances were placed on the same emulated node, they would fit and cause zero conflicts. While on separate emulated nodes without randomization, they underutilized the cache and conflicted unnecessarily due to the in-order allocation per node.

2/ A well known Java server application benchmark was run with a heap size that exceeded cache size by 3X. The cache conflict rate was 8% for the first run and degraded to 21% after page allocator aging. With randomization enabled the rate levelled out at 11%.

3/ A MongoDB workload did not observe a measurable difference in cache-conflict rates, but the overall throughput dropped by 7% with randomization in one case.

4/ Mel Gorman ran his suite of performance workloads with randomization enabled on platforms without a memory-side-cache and saw a mix of some improvements and some losses [3].

While there is potentially significant improvement for applications that depend on low latency access across a wide working-set, the performance may be negligible to negative for other workloads. For this reason the shuffle capability defaults to off unless a direct-mapped memory-side-cache is detected.
Even then, the page_alloc.shuffle=0 parameter can be specified to disable the randomization on those systems.

Outside of memory-side-cache utilization concerns there is potentially a security benefit from randomization. Some data exfiltration and return-oriented-programming attacks rely on the ability to infer the location of sensitive data objects. The kernel page allocator, especially early in system boot, has predictable first-in-first-out behavior for physical pages. Pages are freed in physical address order when first onlined.

Quoting Kees: "While we already have a base-address randomization (CONFIG_RANDOMIZE_MEMORY), attacks against the same hardware and memory layouts would certainly be using the predictability of allocation ordering (i.e. for attacks where the base address isn't important: only the relative positions between allocated memory). This is common in lots of heap-style attacks. They try to gain control over ordering by spraying allocations, etc. I'd really like to see this because it gives us something similar to CONFIG_SLAB_FREELIST_RANDOM but for the page allocator."

While SLAB_FREELIST_RANDOM reduces the predictability of some local slab caches, it leaves the vast bulk of memory to be predictably allocated in order. However, it should be noted, the concrete security benefits are hard to quantify, and no known CVE is mitigated by this randomization.

Introduce shuffle_free_memory(), and its helper shuffle_zone(), to perform a Fisher-Yates shuffle of the page allocator 'free_area' lists when they are initially populated with free memory at boot and at hotplug time. Do this based on either the presence of a page_alloc.shuffle=Y command line parameter, or autodetection of a memory-side-cache (to be added in a follow-on patch).

The shuffling is done in terms of CONFIG_SHUFFLE_PAGE_ORDER sized free pages, where the default CONFIG_SHUFFLE_PAGE_ORDER is MAX_ORDER-1, i.e. order 10 (4MB). This trades off randomization granularity for time spent shuffling. MAX_ORDER-1 was chosen to be minimally invasive to the page allocator while still showing memory-side cache behavior improvements, with the expectation that the security implications of finer-granularity randomization are mitigated by CONFIG_SLAB_FREELIST_RANDOM. The performance impact of the shuffling appears to be in the noise compared to other memory initialization work.

This initial randomization can be undone over time, so a follow-on patch is introduced to inject entropy on page free decisions. It is reasonable to ask if the page free entropy is sufficient, but it is not enough due to the in-order initial freeing of pages. At the start of that process, putting page1 in front of or behind page0 still keeps them close together; page2 is still near page1 and has a high chance of being adjacent. As more pages are added, ordering diversity improves, but there is still high page locality for the low address pages, and this leads to no significant impact to the cache conflict rate.
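For readers unfamiliar with the algorithm named above, here is the classic Fisher-Yates shuffle over a plain array. This is an illustrative userspace sketch; the kernel version operates on the free_area lists at CONFIG_SHUFFLE_PAGE_ORDER granularity and uses kernel RNG primitives rather than rand():

    #include <stdlib.h>

    /* Classic Fisher-Yates: produce a uniformly random permutation in place. */
    static void fisher_yates(int *a, size_t n)
    {
        size_t i, j;
        int tmp;

        for (i = n - 1; i > 0; i--) {
            /* pick j uniformly in [0, i]; rand() % has slight modulo
             * bias, which is fine for illustration */
            j = (size_t)rand() % (i + 1);
            tmp = a[i];
            a[i] = a[j];
            a[j] = tmp;
        }
    }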
[1]: https://itpeernetwork.intel.com/intel-optane-dc-persistent-memory-operating-modes/ [2]: https://lkml.kernel.org/r/AT5PR8401MB1169D656C8B5E121752FC0F8AB120@AT5PR8401MB1169.NAMPRD84.PROD.OUTLOOK.COM [3]: https://lkml.org/lkml/2018/10/12/309 [dan.j.williams@intel.com: fix shuffle enable] Link: http://lkml.kernel.org/r/154943713038.3858443.4125180191382062871.stgit@dwillia2-desk3.amr.corp.intel.com [cai@lca.pw: fix SHUFFLE_PAGE_ALLOCATOR help texts] Link: http://lkml.kernel.org/r/20190425201300.75650-1-cai@lca.pw Link: http://lkml.kernel.org/r/154899811738.3165233.12325692939590944259.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Qian Cai Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Dave Hansen Cc: Keith Busch Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 17 +++++++++++++++++ include/linux/mmzone.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index 9e9a6403dbe4..d3b4db895340 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -150,6 +150,23 @@ static inline void list_replace_init(struct list_head *old, INIT_LIST_HEAD(old); } +/** + * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position + * @entry1: the location to place entry2 + * @entry2: the location to place entry1 + */ +static inline void list_swap(struct list_head *entry1, + struct list_head *entry2) +{ + struct list_head *pos = entry2->prev; + + list_del(entry2); + list_replace(entry1, entry2); + if (pos == entry1) + pos = entry2; + list_add(entry1, pos); +} + /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a4aedc160bd..1fb5a04530aa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1271,6 +1271,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define pfn_present pfn_valid #endif /* CONFIG_SPARSEMEM */ /* -- cgit From b03641af680959df57c275a80ff0dc116627c7ae Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 May 2019 15:41:32 -0700 Subject: mm: move buddy list manipulations into helpers In preparation for runtime randomization of the zone lists, take all (well, most of) the list_*() functions in the buddy allocator and put them in helper functions. Provide a common control point for injecting additional behavior when freeing pages. 
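To make that "common control point" concrete, here is a hypothetical call-site sketch; it is not an actual hunk from mm/page_alloc.c, and the wrapper name is an illustrative assumption:

    /* Free path: one helper call replaces open-coded list manipulation. */
    static inline void example_return_page(struct zone *zone, struct page *page,
                                           unsigned int order, int migratetype)
    {
        struct free_area *area = &zone->free_area[order];

        /* previously open-coded as:
         *   list_add(&page->lru, &area->free_list[migratetype]);
         *   area->nr_free++;
         */
        add_to_free_area(page, area, migratetype);
    }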
[dan.j.williams@intel.com: fix buddy list helpers] Link: http://lkml.kernel.org/r/155033679702.1773410.13041474192173212653.stgit@dwillia2-desk3.amr.corp.intel.com [vbabka@suse.cz: remove del_page_from_free_area() migratetype parameter] Link: http://lkml.kernel.org/r/4672701b-6775-6efd-0797-b6242591419e@suse.cz Link: http://lkml.kernel.org/r/154899812264.3165233.5219320056406926223.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Vlastimil Babka Tested-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Dave Hansen Cc: Kees Cook Cc: Keith Busch Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- include/linux/mm_types.h | 3 +++ include/linux/mmzone.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 912614fbbef3..0e8834ac32b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -536,9 +536,6 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) struct mmu_gather; struct inode; -#define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) - #if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e1f42a07d8f0..8ec38b11b361 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -220,6 +220,9 @@ struct page { #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +#define page_private(page) ((page)->private) +#define set_page_private(page, v) ((page)->private = (v)) + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1fb5a04530aa..acd6f9cb01bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include /* Free memory management - zoned buddy allocator. 
*/ @@ -98,6 +100,50 @@ struct free_area { unsigned long nr_free; }; +/* Used for pages not on another list */ +static inline void add_to_free_area(struct page *page, struct free_area *area, + int migratetype) +{ + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_area_tail(struct page *page, struct free_area *area, + int migratetype) +{ + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages which are on another list */ +static inline void move_to_free_area(struct page *page, struct free_area *area, + int migratetype) +{ + list_move(&page->lru, &area->free_list[migratetype]); +} + +static inline struct page *get_page_from_free_area(struct free_area *area, + int migratetype) +{ + return list_first_entry_or_null(&area->free_list[migratetype], + struct page, lru); +} + +static inline void del_page_from_free_area(struct page *page, + struct free_area *area) +{ + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + area->nr_free--; +} + +static inline bool free_area_empty(struct free_area *area, int migratetype) +{ + return list_empty(&area->free_list[migratetype]); +} + struct pglist_data; /* -- cgit

From 97500a4a54876d3d6d2d1b8419223eb4e69b32d8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 May 2019 15:41:35 -0700 Subject: mm: maintain randomization of page free lists

When freeing a page with an order >= shuffle_page_order, randomly select the front or back of the list for insertion. While the mm tries to defragment physical pages into huge pages, this can tend to make the page allocator more predictable over time. Inject the front-back randomness to preserve the initial randomness established by shuffle_free_memory() when the kernel was booted.

The overhead of this manipulation is constrained by only being applied for MAX_ORDER sized pages by default.
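The CONFIG_SHUFFLE_PAGE_ALLOCATOR definition of add_to_free_area_random() lives outside this hunk (in mm/). A simplified sketch of the front-or-back behavior described above might look like the following; note that the real code batches entropy rather than drawing a fresh random number per call, so this is an illustration, not the actual implementation:

    /* Sketch: insert at the head or tail of the free list with equal
     * probability, preserving the boot-time shuffle's entropy. */
    void add_to_free_area_random(struct page *page, struct free_area *area,
                                 int migratetype)
    {
        if (prandom_u32() & 1)
            add_to_free_area(page, area, migratetype);
        else
            add_to_free_area_tail(page, area, migratetype);
    }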
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/154899812788.3165233.9066631950746578517.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Kees Cook Cc: Michal Hocko Cc: Dave Hansen Cc: Keith Busch Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index acd6f9cb01bd..70394cabaf4e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -116,6 +116,18 @@ static inline void add_to_free_area_tail(struct page *page, struct free_area *ar area->nr_free++; } +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +/* Used to preserve page allocation order entropy */ +void add_to_free_area_random(struct page *page, struct free_area *area, + int migratetype); +#else +static inline void add_to_free_area_random(struct page *page, + struct free_area *area, int migratetype) +{ + add_to_free_area(page, area, migratetype); +} +#endif + /* Used for pages which are on another list */ static inline void move_to_free_area(struct page *page, struct free_area *area, int migratetype) -- cgit

From ad312f95d41c9de19313c51e388c4984451c010f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 May 2019 15:41:42 -0700 Subject: fs/select: avoid clang stack usage warning

The select() implementation is carefully tuned to put a sensible amount of data on the stack for holding a copy of the user space fd_set, but not so large as to risk overflowing the kernel stack. When building a 32-bit kernel with clang, we need a little more space than with gcc, which often triggers a warning:

    fs/select.c:619:5: error: stack frame size of 1048 bytes in function 'core_sys_select' [-Werror,-Wframe-larger-than=]
    int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,

I experimentally found that for 32-bit ARM, reducing the maximum stack usage by 64 bytes keeps us reliably under the warning limit again.

Link: http://lkml.kernel.org/r/20190307090146.1874906-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Andi Kleen Cc: Nick Desaulniers Cc: Alexander Viro Cc: Christoph Hellwig Cc: Eric Dumazet Cc: "Darrick J. Wong" Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poll.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/poll.h b/include/linux/poll.h index 7e0fdcf905d2..1cdc32b1f1b0 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -16,7 +16,11 @@ extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ +#ifdef __clang__ +#define MAX_STACK_ALLOC 768 +#else #define MAX_STACK_ALLOC 832 +#endif #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC -- cgit

From 687a3e4d8e6129f064711291f1564d95472dba3e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:41:45 -0700 Subject: treewide: remove SPDX "WITH Linux-syscall-note" from kernel-space headers

The "WITH Linux-syscall-note" should be added to headers exported to user space. Some kernel-space headers have "WITH Linux-syscall-note", which seems to be a mistake.

[1] arch/x86/include/asm/hyperv-tlfs.h Commit 5a4858032217 ("x86/hyper-v: move hyperv.h out of uapi") moved this file out of uapi, but missed updating the SPDX License tag.
[2] include/asm-generic/shmparam.h Commit 76ce2a80a28e ("Rename include/{uapi => }/asm-generic/shmparam.h really") moved this file out of uapi, but missed updating the SPDX License tag.

[3] include/linux/qcom-geni-se.h Commit eddac5af0654 ("soc: qcom: Add GENI based QUP Wrapper driver") added this file, but I do not see a good reason why its license tag must include "WITH Linux-syscall-note".

Link: http://lkml.kernel.org/r/1554196104-3522-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/shmparam.h | 2 +- include/linux/qcom-geni-se.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h index 8b78c0ba08b1..b8f9035ffc2c 100644 --- a/include/asm-generic/shmparam.h +++ b/include/asm-generic/shmparam.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_SHMPARAM_H #define __ASM_GENERIC_SHMPARAM_H diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index 3bcd67fd5548..dd464943f717 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. */ -- cgit

From 9012d011660ea5cf2a623e1de207a2bc0ca6936d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:42:25 -0700 Subject: compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING

Commit 60a3cdd06394 ("x86: add optimized inlining") introduced CONFIG_OPTIMIZE_INLINING, but it has been available only for x86. The idea is obviously arch-agnostic. This commit moves the config entry from arch/x86/Kconfig.debug to lib/Kconfig.debug so that all architectures can benefit from it.

This can make a huge difference in kernel image size, especially when CONFIG_OPTIMIZE_FOR_SIZE is enabled. For example, I got a 3.5% smaller arm64 kernel for v5.1-rc1.

    dec       file
    18983424  arch/arm64/boot/Image.before
    18321920  arch/arm64/boot/Image.after

This also slightly improves the "Kernel hacking" Kconfig menu, as e61aca5158a8 ("Merge branch 'kconfig-diet' from Dave Hansen") suggested; this config option would be a good fit in the "compiler option" menu.

Link: http://lkml.kernel.org/r/20190423034959.13525-12-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Borislav Petkov Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boris Brezillon Cc: Brian Norris Cc: Christophe Leroy Cc: David Woodhouse Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Marek Vasut Cc: Mark Rutland Cc: Mathieu Malaterre Cc: Miquel Raynal Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Stefan Agner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ba814f18cb4c..19e58b9138a0 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -140,8 +140,7 @@ struct ftrace_likely_data { * Do not use __always_inline here, since currently it expands to inline again * (which would break users of __always_inline).
*/ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ - !defined(CONFIG_OPTIMIZE_INLINING) +#if !defined(CONFIG_OPTIMIZE_INLINING) #define inline inline __attribute__((__always_inline__)) __gnu_inline \ __maybe_unused notrace #else -- cgit

From e02c9b0d65a7493180db45320f82482c6ba8ea57 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:34 -0700 Subject: kernel/latencytop.c: rename clear_all_latency_tracing to clear_tsk_latency_tracing

The name clear_all_latency_tracing is misleading; in fact it only clears the per-task latency_record[], and we do have another function, clear_global_latency_tracing, which clears the global latency_record[] buffer.

Link: http://lkml.kernel.org/r/20190226114602.16902-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/latencytop.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 7c560e0dc8f4..9022f0c2e2e4 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -36,7 +36,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) __account_scheduler_latency(task, usecs, inter); } -void clear_all_latency_tracing(struct task_struct *p); +void clear_tsk_latency_tracing(struct task_struct *p); extern int sysctl_latencytop(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -48,7 +48,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) { } -static inline void clear_all_latency_tracing(struct task_struct *p) +static inline void clear_tsk_latency_tracing(struct task_struct *p) { } -- cgit

From 8e18faeac3e4d4b3ff3d705cd46d0fcb710b09d0 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 14 May 2019 15:42:46 -0700 Subject: lib/plist: rename DEBUG_PI_LIST to DEBUG_PLIST

This is a lot more appropriate than PI_LIST, which in the kernel one would assume has to do with priority inheritance; it does not. Furthermore, futexes make use of plists, so the old name can be even more confusing, albeit this is only a debug config option.
Link: http://lkml.kernel.org/r/20190317185434.1626-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/plist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/plist.h b/include/linux/plist.h index 97883604a3c5..9365df5a823f 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -231,7 +231,7 @@ static inline int plist_node_empty(const struct plist_node *node) * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST # define plist_first_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ @@ -248,7 +248,7 @@ static inline int plist_node_empty(const struct plist_node *node) * @type: the type of the struct this is embedded in * @member: the name of the list_head within the struct */ -#ifdef CONFIG_DEBUG_PI_LIST +#ifdef CONFIG_DEBUG_PLIST # define plist_last_entry(head, type, member) \ ({ \ WARN_ON(plist_head_empty(head)); \ -- cgit

From 043b3f7b6388fca6be86ca82979f66c5723a0d10 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Tue, 14 May 2019 15:42:58 -0700 Subject: lib/list_sort: simplify and remove MAX_LIST_LENGTH_BITS

Rather than a fixed-size array of pending sorted runs, use the ->prev links to keep track of things. This reduces stack usage, eliminates some ugly overflow handling, and reduces the code size.

Also:
* merge() no longer needs to handle NULL inputs, so simplify.
* The same applies to merge_and_restore_back_links(), which is renamed to the less ponderous merge_final(). (It's a static helper function, so we don't need a super-descriptive name; comments will do.)
* Document the actual return value requirements on the (*cmp)() function; some callers are already using this feature.

x86-64 code size 1086 -> 739 bytes (-347)

(Yes, I see checkpatch complaining about no space after comma in "__attribute__((nonnull(2,3,4,5)))". Checkpatch is wrong.)

Feedback from Rasmus Villemoes, Andy Shevchenko and Geert Uytterhoeven.

[akpm@linux-foundation.org: remove __pure usage due to mysterious warning] Link: http://lkml.kernel.org/r/f63c410e0ff76009c9b58e01027e751ff7fdb749.1552704200.git.lkml@sdf.org Signed-off-by: George Spelvin Acked-by: Andrey Abramov Acked-by: Rasmus Villemoes Reviewed-by: Andy Shevchenko Cc: Geert Uytterhoeven Cc: Daniel Wagner Cc: Dave Chinner Cc: Don Mullis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_sort.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h index ba79956e848d..20f178c24e9d 100644 --- a/include/linux/list_sort.h +++ b/include/linux/list_sort.h @@ -6,6 +6,7 @@ struct list_head; +__attribute__((nonnull(2,3))) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)); -- cgit

From 9f6158946987a5ce3f16da097d18f240a89db417 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 May 2019 15:43:08 -0700 Subject: lib/math: move int_pow() from pwm_bl.c for wider use

The integer exponentiation is used in a few places and might be used in the future by other call sites. Move it to wider use.
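The declaration added below implies the usual exponentiation-by-squaring technique. A sketch consistent with this signature (not necessarily byte-for-byte what lib/math/ carries; note that the result silently wraps modulo 2^64 on overflow):

    u64 int_pow(u64 base, unsigned int exp)
    {
        u64 result = 1;

        while (exp) {
            if (exp & 1)        /* fold in the factor for this bit */
                result *= base;
            exp >>= 1;
            base *= base;       /* square for the next bit */
        }

        return result;
    }

For example, int_pow(10, 3) == 1000, while int_pow(2, 64) wraps to 0.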
Link: http://lkml.kernel.org/r/20190323172531.80025-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Daniel Thompson Cc: Lee Jones Cc: Ray Jui Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a3b59d143afb..74b1ee9027f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -484,6 +484,7 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); +u64 int_pow(u64 base, unsigned int exp); unsigned long int_sqrt(unsigned long); #if BITS_PER_LONG < 64 -- cgit

From ef4d6f6b275c498f8e5626c99dbeefdc5027f843 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 14 May 2019 15:43:27 -0700 Subject: include/linux/bitops.h: sanitize rotate primitives

The ror32 implementation (word >> shift) | (word << (32 - shift)) has undefined behaviour if shift is outside the [1, 31] range. Similarly for the 64 bit variants. Most callers pass a compile-time constant (naturally in that range), but there's an UBSAN report that these may actually be called with a shift count of 0.

Instead of special-casing that, we can make them DTRT for all values of shift while also avoiding UB. For some reason, this was already partly done for rol32 (which was well-defined for [0, 31]). gcc 8 recognizes these patterns as rotates, so for example

    __u32 rol32(__u32 word, unsigned int shift)
    {
        return (word << (shift & 31)) | (word >> ((-shift) & 31));
    }

compiles to

    0000000000000020 <rol32>:
      20:   89 f8   mov %edi,%eax
      22:   89 f1   mov %esi,%ecx
      24:   d3 c0   rol %cl,%eax
      26:   c3      retq

Older compilers unfortunately do not do as well, but this only affects the small minority of users that don't pass constants.

Due to integer promotions, ro[lr]8 were already well-defined for shifts in [0, 8], and ro[lr]16 were mostly well-defined for shifts in [0, 16] (only mostly - u16 gets promoted to _signed_ int, so if bit 15 is set, word << 16 is undefined). For consistency, update those as well.
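A few hand-computed values illustrating the now fully-defined semantics (illustrative checks, assuming the definitions in the hunk below):

    /* All shift counts, including 0, are now well-defined: */
    rol32(0x80000001, 1);   /* == 0x00000003 */
    ror32(0x80000001, 1);   /* == 0xC0000000 */
    ror32(0xdeadbeef, 0);   /* == 0xdeadbeef (previously undefined behaviour) */
    rol16(0x8001, 4);       /* == 0x0018 */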
Link: http://lkml.kernel.org/r/20190410211906.2190-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reported-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Will Deacon Cc: Vadim Pasternak Cc: Andrey Ryabinin Cc: Jacek Anaszewski Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 602af23b98c7..cf074bce3eb3 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -60,7 +60,7 @@ static __always_inline unsigned long hweight_long(unsigned long w) */ static inline __u64 rol64(__u64 word, unsigned int shift) { - return (word << shift) | (word >> (64 - shift)); + return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** @@ -70,7 +70,7 @@ static inline __u64 rol64(__u64 word, unsigned int shift) */ static inline __u64 ror64(__u64 word, unsigned int shift) { - return (word >> shift) | (word << (64 - shift)); + return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** @@ -80,7 +80,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> ((-shift) & 31)); + return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** @@ -90,7 +90,7 @@ static inline __u32 rol32(__u32 word, unsigned int shift) */ static inline __u32 ror32(__u32 word, unsigned int shift) { - return (word >> shift) | (word << (32 - shift)); + return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** @@ -100,7 +100,7 @@ static inline __u32 ror32(__u32 word, unsigned int shift) */ static inline __u16 rol16(__u16 word, unsigned int shift) { - return (word << shift) | (word >> (16 - shift)); + return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** @@ -110,7 +110,7 @@ static inline __u16 rol16(__u16 word, unsigned int shift) */ static inline __u16 ror16(__u16 word, unsigned int shift) { - return (word >> shift) | (word << (16 - shift)); + return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** @@ -120,7 +120,7 @@ static inline __u16 ror16(__u16 word, unsigned int shift) */ static inline __u8 rol8(__u8 word, unsigned int shift) { - return (word << shift) | (word >> (8 - shift)); + return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** @@ -130,7 +130,7 @@ static inline __u8 rol8(__u8 word, unsigned int shift) */ static inline __u8 ror8(__u8 word, unsigned int shift) { - return (word >> shift) | (word << (8 - shift)); + return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** -- cgit

From a6231d1993367cf48a9c5f1adc295a5a8b1c5731 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 14 May 2019 15:44:40 -0700 Subject: exec: move struct linux_binprm::buf

struct linux_binprm::buf is the first field and it is exactly 128 bytes in size. It means that on x86_64 all accesses to other fields will go through the [r64 + disp32] addressing mode, which is 3 bytes bloatier than the [r64 + disp8] addressing mode. Given that accesses to other fields outnumber accesses to ->buf, move it down.

Space savings (x86_64 defconfig; expect more on distro configs, because LSMs actively dereference "bprm" but do not care about the first 128 bytes of the executable itself):
    add/remove: 0/0 grow/shrink: 0/24 up/down: 0/-492 (-492)
    Function                                    old    new  delta
    selinux_bprm_committing_creds               552    549     -3
    finalize_exec                                94     91     -3
    __audit_log_bprm_fcaps                      283    280     -3
    __audit_bprm                                 39     36     -3
    perf_trace_sched_process_exec               347    341     -6
    install_exec_creds                          105     99     -6
    cap_bprm_set_creds.cold                      60     54     -6
    would_dump                                  137    128     -9
    load_script                                 637    628     -9
    bprm_change_interp                           61     52     -9
    trace_event_raw_event_sched_process_exec    260    250    -10
    search_binary_handler                       255    240    -15
    remove_arg_zero                             295    277    -18
    free_bprm                                   119    101    -18
    prepare_binprm                              379    360    -19
    setup_new_exec                              336    315    -21
    flush_old_exec                             1638   1617    -21
    copy_strings.isra                           746    724    -22
    setup_arg_pages                             559    530    -29
    load_misc_binary                           1151   1118    -33
    selinux_bprm_set_creds                      792    753    -39
    load_elf_binary                           11111  11072    -39
    cap_bprm_set_creds                         1496   1454    -42
    __do_execve_file.isra                      2395   2286   -109

Link: http://lkml.kernel.org/r/20190421165025.GA26843@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 688ab0de7810..b40fc633f3be 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -15,7 +15,6 @@ struct filename; * This structure is used to hold the arguments that are used when loading binaries. */ struct linux_binprm { - char buf[BINPRM_BUF_SIZE]; #ifdef CONFIG_MMU struct vm_area_struct *vma; unsigned long vma_pages; @@ -64,6 +63,8 @@ struct linux_binprm { unsigned long loader, exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ + + char buf[BINPRM_BUF_SIZE]; } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 -- cgit

From 3713a4e1fdb8da86f96a3e770b08e278d97529b4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 14 May 2019 15:44:46 -0700 Subject: include/linux/cpumask.h: fix double string traverse in cpumask_parse

cpumask_parse() finds the first occurrence of either '\n' or the terminating '\0', using strchr() and strlen(). We can do it better with a single call of strchrnul().

[akpm@linux-foundation.org: remove unneeded cast] Link: http://lkml.kernel.org/r/20190409204208.12190-1-ynorov@marvell.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 147bdec42215..21755471b1c3 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -633,8 +633,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { - char *nl = strchr(buf, '\n'); - unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); + unsigned int len = strchrnul(buf, '\n') - buf; return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } -- cgit

From c39ea0b9dd24bf1bf2baa5cdbfa1905f3065347b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 14 May 2019 15:45:34 -0700 Subject: panic: avoid the extra noise dmesg

When a kernel panic happens, the kernel first prints the panic call stack and then an ending message like:

    [   35.743249] ---[ end Kernel panic - not syncing: Fatal exception
    [   35.749975] ------------[ cut here ]------------

The above messages are very useful for debugging.
But if the system is configured not to reboot on panic (say the "panic_timeout" parameter equals 0), it will likely print out many noisy messages, such as a WARN() call stack for each and every CPU except the panicking one, like below:

    WARNING: CPU: 1 PID: 280 at kernel/sched/core.c:1198 set_task_cpu+0x183/0x190
    Call Trace:
     try_to_wake_up
     default_wake_function
     autoremove_wake_function
     __wake_up_common
     __wake_up_common_lock
     __wake_up
     wake_up_klogd_work_func
     irq_work_run_list
     irq_work_tick
     update_process_times
     tick_sched_timer
     __hrtimer_run_queues
     hrtimer_interrupt
     smp_apic_timer_interrupt
     apic_timer_interrupt

For people working in console mode, the screen will first show the panic call stack, but it is immediately overwritten by these noisy extra messages, which makes debugging much more difficult, as the original context gets lost on screen. These noisy messages also confuse some users; I have seen many bug reporters post the noisy messages into bugzilla instead of the real panic call stack and context.

Add a flag "suppress_printk" which gets set in panic() to avoid those noisy messages, without changing the current kernel behavior that both panic blinking and the sysrq magic key keep working as is, as suggested by Petr Mladek.

To verify this, make sure the kernel is not configured to reboot on panic, and in a console run:

    # echo c > /proc/sysrq-trigger

to see that the console only prints out the panic call stack.

Link: http://lkml.kernel.org/r/1551430186-24169-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Kees Cook Cc: Borislav Petkov Cc: Andi Kleen Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 84ea4d094af3..cefd374c47b1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -82,6 +82,8 @@ static inline void console_verbose(void) extern char devkmsg_log_str[]; struct ctl_table; +extern int suppress_printk; + struct va_format { const char *fmt; va_list *va; -- cgit

From b287a25a7148a89d977c819c1f7d6584f875b682 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 14 May 2019 15:45:37 -0700 Subject: panic/reboot: allow specifying reboot_mode for panic only

Allow specifying reboot_mode for panic only. This is needed on systems where ramoops is used to store panic logs, and the user wants a warm reset to preserve those, while still having a cold reset on normal reboots.
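The header hunk below only adds the knob; the consuming side in panic() is not part of this hunk. A sketch of how the panic path would presumably pick the mode (illustrative, inferred from the REBOOT_UNDEFINED sentinel, not the actual kernel/panic.c change):

    /* In panic(), prefer the panic-specific mode when one was configured. */
    if (panic_reboot_mode != REBOOT_UNDEFINED)
        reboot_mode = panic_reboot_mode;
    emergency_restart();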
Link: http://lkml.kernel.org/r/20190322004735.27702-1-aaro.koskinen@iki.fi Signed-off-by: Aaro Koskinen Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e63799a6e895..3734cd8f38a8 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -14,6 +14,7 @@ struct device; #define SYS_POWER_OFF 0x0003 /* Notify of system power off */ enum reboot_mode { + REBOOT_UNDEFINED = -1, REBOOT_COLD = 0, REBOOT_WARM, REBOOT_HARD, @@ -21,6 +22,7 @@ enum reboot_mode { REBOOT_GPIO, }; extern enum reboot_mode reboot_mode; +extern enum reboot_mode panic_reboot_mode; enum reboot_type { BOOT_TRIPLE = 't', -- cgit From 4461d65176b408d6d7adefa41a95472a18a90a53 Mon Sep 17 00:00:00 2001 From: Tom Burkart Date: Tue, 14 May 2019 15:45:40 -0700 Subject: pps: descriptor-based gpio This patch changes the GPIO access for the pps-gpio driver from the integer based API to the descriptor based API. The integer based API is considered deprecated and the descriptor based API is the preferred way to access GPIOs as per Documentation/driver-api/gpio/intro.rst No changes are made to userspace interfaces. Link: http://lkml.kernel.org/r/20190324043305.6627-2-tom@aussec.com Signed-off-by: Tom Burkart Acked-by: Rodolfo Giometti Reviewed-by: Philipp Zabel Cc: Lukas Senger Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pps-gpio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h index 56f35dd3d01d..f028d2cda6f5 100644 --- a/include/linux/pps-gpio.h +++ b/include/linux/pps-gpio.h @@ -23,10 +23,9 @@ #define _PPS_GPIO_H struct pps_gpio_platform_data { + struct gpio_desc *gpio_pin; bool assert_falling_edge; bool capture_clear; - unsigned int gpio_pin; - const char *gpio_label; }; #endif /* _PPS_GPIO_H */ -- cgit From 4c69add45fec995ce23b21dc90be20f1efd8cdad Mon Sep 17 00:00:00 2001 From: Tom Burkart Date: Tue, 14 May 2019 15:45:46 -0700 Subject: pps: pps-gpio PPS ECHO implementation This patch implements the PPS ECHO functionality for pps-gpio, that sysfs claims is available already. Configuration is done via device tree bindings. No changes are made to userspace interfaces. This patch was originally written by Lukas Senger as part of a masters thesis project and modified for inclusion into the linux kernel by Tom Burkart. 
Link: http://lkml.kernel.org/r/20190324043305.6627-4-tom@aussec.com Signed-off-by: Tom Burkart Acked-by: Rodolfo Giometti Signed-off-by: Lukas Senger Cc: Philipp Zabel Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pps-gpio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h index f028d2cda6f5..44171e6b7197 100644 --- a/include/linux/pps-gpio.h +++ b/include/linux/pps-gpio.h @@ -24,8 +24,10 @@ struct pps_gpio_platform_data { struct gpio_desc *gpio_pin; + struct gpio_desc *echo_pin; bool assert_falling_edge; bool capture_clear; + unsigned int echo_active_ms; }; #endif /* _PPS_GPIO_H */ -- cgit

From 3278a2c20cb302d27e6f6ee45a3f57361176e426 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 14 May 2019 15:46:33 -0700 Subject: ipc: conserve sequence numbers in ipcmni_extend mode

Rewrite, based on the patch from Waiman Long:

The mixing in of a sequence number into the IPC IDs is probably to avoid ID reuse in userspace as much as possible. With ipcmni_extend mode, the number of usable sequence numbers is greatly reduced, leading to a higher chance of ID reuse. To address this issue, we need to conserve the sequence number space as much as possible.

Right now, the sequence number is incremented for every new ID created. In reality, we only need to increment the sequence number when the newly allocated ID is not greater than the last one allocated. It is in such a case that the new ID may collide with an existing one. This is being done irrespective of the ipcmni mode.

In order to avoid any races, the index is first allocated and then the pointer is replaced.

Changes compared to the initial patch:
- Handle failures from idr_alloc().
- Avoid that concurrent operations can see the wrong sequence number. (This is achieved by using idr_replace()).
- IPCMNI_SEQ_SHIFT is not a constant, thus renamed to ipcmni_seq_shift().
- IPCMNI_SEQ_MAX is not a constant, thus renamed to ipcmni_seq_max().

Link: http://lkml.kernel.org/r/20190329204930.21620-2-longman@redhat.com Signed-off-by: Manfred Spraul Signed-off-by: Waiman Long Suggested-by: Matthew Wilcox Acked-by: Waiman Long Cc: Al Viro Cc: Davidlohr Bueso Cc: "Eric W . Biederman" Cc: Jonathan Corbet Cc: Kees Cook Cc: "Luis R. Rodriguez" Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6ab8c1bada3f..c309f43bde45 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,6 +19,7 @@ struct ipc_ids { struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; + int last_idx; /* For wrap around detection */ #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif -- cgit

From 9e9291c71eb92b457eb798501e210dec3d12e795 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 14 May 2019 15:46:42 -0700 Subject: include/linux/sched/signal.h: replace `tsk' with `task'

This file uses "task" 85 times and "tsk" 25 times. It is better to be consistent.

Link: http://lkml.kernel.org/r/20181129180547.15976-1-avagin@gmail.com Signed-off-by: Andrei Vagin Reviewed-by: Andrew Morton Cc: Oleg Nesterov Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/signal.h | 51 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e412c092c1e8..38a0f0785323 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -271,17 +271,18 @@ static inline int signal_group_exit(const struct signal_struct *sig) extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info); +extern int dequeue_signal(struct task_struct *task, + sigset_t *mask, kernel_siginfo_t *info); static inline int kernel_dequeue_signal(void) { - struct task_struct *tsk = current; + struct task_struct *task = current; kernel_siginfo_t __info; int ret; - spin_lock_irq(&tsk->sighand->siglock); - ret = dequeue_signal(tsk, &tsk->blocked, &__info); - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock_irq(&task->sighand->siglock); + ret = dequeue_signal(task, &task->blocked, &__info); + spin_unlock_irq(&task->sighand->siglock); return ret; } @@ -419,18 +420,18 @@ static inline void set_restore_sigmask(void) WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } -static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +static inline void clear_tsk_restore_sigmask(struct task_struct *task) { - clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); + clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } -static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +static inline bool test_tsk_restore_sigmask(struct task_struct *task) { - return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); + return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { @@ -449,9 +450,9 @@ static inline void set_restore_sigmask(void) current->restore_sigmask = true; WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } -static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +static inline void clear_tsk_restore_sigmask(struct task_struct *task) { - tsk->restore_sigmask = false; + task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { @@ -461,9 +462,9 @@ static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } -static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +static inline bool test_tsk_restore_sigmask(struct task_struct *task) { - return tsk->restore_sigmask; + return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { @@ -617,9 +618,9 @@ static inline struct pid *task_session(struct task_struct *task) return task->signal->pids[PIDTYPE_SID]; } -static inline int get_nr_threads(struct task_struct *tsk) +static inline int get_nr_threads(struct task_struct *task) { - return tsk->signal->nr_threads; + return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) @@ -658,35 +659,35 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, +extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, 
unsigned long *flags); -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, +static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); + ret = __lock_task_sighand(task, flags); + (void)__cond_lock(&task->sighand->siglock, ret); return ret; } -static inline void unlock_task_sighand(struct task_struct *tsk, +static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); + spin_unlock_irqrestore(&task->sighand->siglock, *flags); } -static inline unsigned long task_rlimit(const struct task_struct *tsk, +static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { - return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(task->signal->rlim[limit].rlim_cur); } -static inline unsigned long task_rlimit_max(const struct task_struct *tsk, +static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { - return READ_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) -- cgit From b09e89366e1730ac13088706cec0f1335299eefb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 May 2019 15:46:54 -0700 Subject: arch: remove <asm/sizes.h> and <asm-generic/sizes.h> Now that all instances of #include <asm/sizes.h> have been replaced with #include <linux/sizes.h>, we can remove these. Link: http://lkml.kernel.org/r/1553267665-27228-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/sizes.h | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 include/asm-generic/sizes.h (limited to 'include') diff --git a/include/asm-generic/sizes.h b/include/asm-generic/sizes.h deleted file mode 100644 index 1dcfad9629ef..000000000000 --- a/include/asm-generic/sizes.h +++ /dev/null @@ -1,2 +0,0 @@ -/* This is a placeholder, to be removed over time */ -#include <linux/sizes.h> -- cgit From 871789d4af807d1e91a6299f12a67e06177ed420 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 14 May 2019 15:46:57 -0700 Subject: mm, memcg: rename ambiguously named memory.stat counters and functions I spent literally an hour trying to work out why an earlier version of my memory.events aggregation code didn't work properly, only to find out I was calling memcg->events instead of memcg->memory_events, which is fairly confusing. This naming seems in need of reworking, so make it harder to do the wrong thing by using vmevents instead of events, which makes it clearer that these are vm counters rather than memcg-specific counters. There are also a few other inconsistent names in both the percpu and aggregated structs, so these are all cleaned up to be more coherent and easy to understand. This commit contains code cleanup only: there are no logic changes.
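The mixup behind this rename is easy to make at a call site; a hypothetical two-line illustration (not code from the patch), using the fields as they appear in the hunk below:

	/* Before the rename: both arrays sound alike at the call site */
	atomic_long_read(&memcg->events[idx]);        /* vm event counters */
	atomic_long_read(&memcg->memory_events[idx]); /* memcg-specific memory events */

	/* After the rename: the vm counters cannot be confused */
	atomic_long_read(&memcg->vmevents[idx]);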
[akpm@linux-foundation.org: fix it for preceding changes] Link: http://lkml.kernel.org/r/20190208224319.GA23801@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Dennis Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 30561a954ee0..fa098d168b75 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,8 +94,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct mem_cgroup_stat_cpu { - long count[MEMCG_NR_STAT]; +struct memcg_vmstats_percpu { + long stat[MEMCG_NR_STAT]; unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; @@ -274,12 +274,12 @@ struct mem_cgroup { struct task_struct *move_lock_task; /* memory.stat */ - struct mem_cgroup_stat_cpu __percpu *stat_cpu; + struct memcg_vmstats_percpu __percpu *vmstats_percpu; MEMCG_PADDING(_pad2_); - atomic_long_t stat[MEMCG_NR_STAT]; - atomic_long_t events[NR_VM_EVENT_ITEMS]; + atomic_long_t vmstats[MEMCG_NR_STAT]; + atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -557,7 +557,7 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->stat[idx]); + long x = atomic_long_read(&memcg->vmstats[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -574,12 +574,12 @@ static inline void __mod_memcg_state(struct mem_cgroup *memcg, if (mem_cgroup_disabled()) return; - x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->stat[idx]); + atomic_long_add(x, &memcg->vmstats[idx]); x = 0; } - __this_cpu_write(memcg->stat_cpu->count[idx], x); + __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); } /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -717,12 +717,12 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg, if (mem_cgroup_disabled()) return; - x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); if (unlikely(x > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->events[idx]); + atomic_long_add(x, &memcg->vmevents[idx]); x = 0; } - __this_cpu_write(memcg->stat_cpu->events[idx], x); + __this_cpu_write(memcg->vmstats_percpu->events[idx], x); } static inline void count_memcg_events(struct mem_cgroup *memcg, -- cgit From 205b20cc5a99cdf197c32f4dbee2b09c699477f0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 May 2019 15:47:06 -0700 Subject: mm: memcontrol: make cgroup stats and events query API explicitly local Patch series "mm: memcontrol: memory.stat cost & correctness". The cgroup memory.stat file holds recursive statistics for the entire subtree. The current implementation does this tree walk on-demand whenever the file is read. This is giving us problems in production. 1. The cost of aggregating the statistics on-demand is high. A lot of system service cgroups are mostly idle and their stats don't change between reads, yet we always have to check them. 
There are also always some lazily-dying cgroups sitting around that are pinned by a handful of remaining page cache; the same applies to them. In an application in our fleet that periodically monitors memory.stat, we have seen the aggregation consume up to 5% CPU time. 2. When cgroups die and disappear from the cgroup tree, so do their accumulated vm events. The result is that the event counters at higher-level cgroups can go backwards and confuse some of our automation, let alone people looking at the graphs over time. To address both issues, this patch series changes the stat implementation to spill counts upwards when the counters change. The upward spilling is batched using the existing per-cpu cache. In a sparse file stress test with 5-level cgroup nesting, the additional cost of the flushing was negligible (a little under 1% of CPU at 100% CPU utilization, compared to the 5% of reading memory.stat during regular operation). This patch (of 4): memcg_page_state(), lruvec_page_state(), memcg_sum_events() currently return the state of the local memcg or lruvec, not the recursive state. In practice there is a demand for both versions, although the callers that want the recursive counts currently sum them up by hand. By default, cgroups are considered recursive entities and generally we expect more users of the recursive counters, with the local counts being special cases. To reflect that in the name, add a _local suffix to the current implementations. The following patch will re-incarnate these functions with recursive semantics, but with an O(1) implementation. [hannes@cmpxchg.org: fix bisection hole] Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa098d168b75..0aa0889218bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -554,8 +554,8 @@ void unlock_page_memcg(struct page *page); * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state().
*/ -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - int idx) +static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, + int idx) { long x = atomic_long_read(&memcg->vmstats[idx]); #ifdef CONFIG_SMP @@ -624,8 +624,8 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); } -static inline unsigned long lruvec_page_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x; @@ -1011,8 +1011,8 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - int idx) +static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, + int idx) { return 0; } @@ -1041,8 +1041,8 @@ static inline void mod_memcg_page_state(struct page *page, { } -static inline unsigned long lruvec_page_state(struct lruvec *lruvec, - enum node_stat_item idx) +static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } -- cgit From db9adbcbe740e0986b575dd56aad834ce9e9b5d3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 May 2019 15:47:09 -0700 Subject: mm: memcontrol: move stat/event counting functions out-of-line These are getting too big to be inlined in every callsite. They were stolen from vmstat.c, which already out-of-lines them, and they have only been growing since. The callsites aren't that hot, either. Move __mod_memcg_state(), __mod_lruvec_state() and __count_memcg_events() out of line and add kerneldoc comments. Link: http://lkml.kernel.org/r/20190412151507.2769-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 62 ++++------------------------------------------ 1 file changed, 5 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0aa0889218bf..e35e6a651187 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -565,22 +565,7 @@ static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, return x; } -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __mod_memcg_state(struct mem_cgroup *memcg, - int idx, int val) -{ - long x; - - if (mem_cgroup_disabled()) - return; - - x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->vmstats[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); -} +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, @@ -642,31 +627,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - struct mem_cgroup_per_node *pn; - long x; - - /* Update node */ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - - if (mem_cgroup_disabled()) - return; - - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - - /* Update memcg */ - __mod_memcg_state(pn->memcg,
idx, val); - - /* Update lruvec */ - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &pn->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); -} +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) @@ -708,22 +670,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -static inline void __count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ - unsigned long x; - - if (mem_cgroup_disabled()) - return; - - x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); - if (unlikely(x > MEMCG_CHARGE_BATCH)) { - atomic_long_add(x, &memcg->vmevents[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->events[idx], x); -} +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, -- cgit From 42a300353577ccc17ecc627b8570a89fa1678bec Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 May 2019 15:47:12 -0700 Subject: mm: memcontrol: fix recursive statistics correctness & scalability Right now, when somebody needs to know the recursive memory statistics and events of a cgroup subtree, they need to walk the entire subtree and sum up the counters manually. There are two issues with this: 1. When a cgroup gets deleted, its stats are lost. The state counters should all be 0 at that point, of course, but the events are not. When this happens, the event counters, which are supposed to be monotonic, can go backwards in the parent cgroups. 2. During regular operation, we always have a certain number of lazily freed cgroups sitting around that have been deleted, have no tasks, but have a few cache pages remaining. These groups' statistics do not change until we eventually hit memory pressure, but somebody watching, say, memory.stat on an ancestor has to iterate those every time. This patch addresses both issues by introducing recursive counters at each level that are propagated from the write side when stats change. Upward propagation happens when the per-cpu caches spill over into the local atomic counter. This is the same thing we do during charge and uncharge, except that the latter uses atomic RMWs, which are more expensive; stat changes happen at around the same rate. In a sparse file test (page faults and reclaim at maximum CPU speed) with 5 cgroup nesting levels, perf shows __mod_memcg_page_state at ~1%.
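The propagating write side lives in mm/memcontrol.c and is not part of the header hunk below; a sketch of the idea, reconstructed from the description above, reusing the batching pattern shown earlier and the existing parent_mem_cgroup() helper:

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/* Spill the batch into this group's local counter... */
		atomic_long_add(x, &memcg->vmstats_local[idx]);
		/* ...and into the recursive counter of every ancestor */
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);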
Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 54 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e35e6a651187..bc74d6a4407c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -128,6 +128,7 @@ struct mem_cgroup_per_node { struct lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + atomic_long_t lruvec_stat_local[NR_VM_NODE_STAT_ITEMS]; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -279,8 +280,12 @@ struct mem_cgroup { MEMCG_PADDING(_pad2_); atomic_long_t vmstats[MEMCG_NR_STAT]; + atomic_long_t vmstats_local[MEMCG_NR_STAT]; + atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; + atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS]; + + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -550,6 +555,20 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->vmstats[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /* * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state(). 
@@ -557,7 +576,7 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->vmstats[idx]); + long x = atomic_long_read(&memcg->vmstats_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -609,6 +628,24 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); } +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + struct mem_cgroup_per_node *pn; + long x; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { @@ -619,7 +656,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = atomic_long_read(&pn->lruvec_stat_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -959,6 +996,11 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -989,6 +1031,12 @@ static inline void mod_memcg_page_state(struct page *page, { } +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + return node_page_state(lruvec_pgdat(lruvec), idx); +} + static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { -- cgit
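With recursive and local readers both in place, a monitoring path can take the O(1) subtree view while callers that need strictly local numbers remain explicit about it. A brief hypothetical usage sketch (NR_FILE_PAGES stands in for any stat index):

	/* Recursive: this cgroup plus all descendants, O(1) to read */
	unsigned long nr_file = memcg_page_state(memcg, NR_FILE_PAGES);

	/* Local: this cgroup only, excluding descendants */
	unsigned long nr_file_local = memcg_page_state_local(memcg, NR_FILE_PAGES);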