From 5acda9d12dcf1ad0d9a5a2a7c646de3472fa7555 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:23 -0700 Subject: bdi: avoid oops on device removal After commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue") when device is removed while we are writing to it we crash in bdi_writeback_workfn() -> set_worker_desc() because bdi->dev is NULL. This can happen because even though bdi_unregister() cancels all pending flushing work, nothing really prevents new ones from being queued from balance_dirty_pages() or other places. Fix the problem by clearing BDI_registered bit in bdi_unregister() and checking it before scheduling of any flushing work. Fixes: 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Reviewed-by: Tejun Heo Signed-off-by: Jan Kara Cc: Derek Basehore Cc: Jens Axboe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 24819001f5c8..e488e9459a93 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -95,7 +95,7 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list */ + spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ struct list_head work_list; -- cgit From 5f3bf19aeb8ed5cef0926bc10c80b6a50ac6bdeb Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Apr 2014 14:46:28 -0700 Subject: kmemleak: remove redundant code Remove kmemleak_padding() and kmemleak_release(). Signed-off-by: Li Zefan Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 2a5e5548a1d2..5bb424659c04 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -30,8 +30,6 @@ extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; -extern void kmemleak_padding(const void *ptr, unsigned long offset, - size_t size) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; -- cgit From 9573f79355ff3711c98227d14a9b7f4cb3222b97 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:34 -0700 Subject: fanotify: convert access_mutex to spinlock access_mutex is used only to guard operations on access_list. There's no need for sleeping within this lock so just make a spinlock out of it. Signed-off-by: Jan Kara Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify_backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 64cf3ef50696..fc7718c6bd3e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -178,7 +178,7 @@ struct fsnotify_group { struct fanotify_group_private_data { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* allows a group to block waiting for a userspace response */ - struct mutex access_mutex; + spinlock_t access_lock; struct list_head access_list; wait_queue_head_t access_waitq; atomic_t bypass_perm; -- cgit From 9f985cb6c45bc3f8b7e161c9658d409d051d576f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Apr 2014 14:46:55 -0700 Subject: quota: provide function to grab quota structure reference Provide dqgrab() function to get quota structure reference when we are sure it already has at least one active reference. Make use of this function inside quota code. Signed-off-by: Jan Kara Reviewed-by: Mark Fasheh Reviewed-by: Srinivas Eeda Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/quotaops.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 6965fe394c3b..1d3eee594cd6 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -46,6 +46,14 @@ void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); void dquot_initialize(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); +static inline struct dquot *dqgrab(struct dquot *dquot) +{ + /* Make sure someone else has active reference to dquot */ + WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); + WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); + atomic_inc(&dquot->dq_count); + return dquot; +} void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), -- cgit From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 3 Apr 2014 14:47:24 -0700 Subject: mm: optimize put_mems_allowed() usage Since put_mems_allowed() is strictly optional, its a seqcount retry, we don't need to evaluate the function if the allocation was in fact successful, saving a smp_rmb some loads and comparisons on some relative fast-paths. Since the naming, get/put_mems_allowed() does suggest a mandatory pairing, rename the interface, as suggested by Mel, to resemble the seqcount interface. This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(), where it is important to note that the return value of the latter call is inverted from its previous incarnation. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3fe661fe96d1..b19d3dc2e651 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * get_mems_allowed is required when making decisions involving mems_allowed - * such as during page allocation. mems_allowed can be updated in parallel - * and depending on the new value an operation can fail potentially causing - * process failure. A retry loop with get_mems_allowed and put_mems_allowed - * prevents these artificial failures. + * read_mems_allowed_begin is required when making decisions involving + * mems_allowed such as during page allocation. mems_allowed can be updated in + * parallel and depending on the new value an operation can fail potentially + * causing process failure. A retry loop with read_mems_allowed_begin and + * read_mems_allowed_retry prevents these artificial failures. */ -static inline unsigned int get_mems_allowed(void) +static inline unsigned int read_mems_allowed_begin(void) { return read_seqcount_begin(¤t->mems_allowed_seq); } /* - * If this returns false, the operation that took place after get_mems_allowed - * may have failed. It is up to the caller to retry the operation if + * If this returns true, the operation that took place after + * read_mems_allowed_begin may have failed artificially due to a concurrent + * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. */ -static inline bool put_mems_allowed(unsigned int seq) +static inline bool read_mems_allowed_retry(unsigned int seq) { - return !read_seqcount_retry(¤t->mems_allowed_seq, seq); + return read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) @@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline unsigned int get_mems_allowed(void) +static inline unsigned int read_mems_allowed_begin(void) { return 0; } -static inline bool put_mems_allowed(unsigned int seq) +static inline bool read_mems_allowed_retry(unsigned int seq) { - return true; + return false; } #endif /* !CONFIG_CPUSETS */ -- cgit From 9119a41e9091fb3a8204039d595bcdae24193c57 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 3 Apr 2014 14:47:25 -0700 Subject: mm, hugetlb: unify region structure handling Currently, to track reserved and allocated regions, we use two different ways, depending on the mapping. For MAP_SHARED, we use address_mapping's private_list and, while for MAP_PRIVATE, we use a resv_map. Now, we are preparing to change a coarse grained lock which protect a region structure to fine grained lock, and this difference hinder it. So, before changing it, unify region structure handling, consistently using a resv_map regardless of the kind of mapping. Signed-off-by: Joonsoo Kim Signed-off-by: Davidlohr Bueso Reviewed-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8c43cc469d78..f62c2f6c6059 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include struct ctl_table; struct user_struct; @@ -23,6 +25,13 @@ struct hugepage_subpool { long max_hpages, used_hpages; }; +struct resv_map { + struct kref refs; + struct list_head regions; +}; +extern struct resv_map *resv_map_alloc(void); +void resv_map_release(struct kref *ref); + extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ -- cgit From 7b24d8616be33616efd41ff67d3c76362c60ca84 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Apr 2014 14:47:27 -0700 Subject: mm, hugetlb: fix race in region tracking There is a race condition if we map a same file on different processes. Region tracking is protected by mmap_sem and hugetlb_instantiation_mutex. When we do mmap, we don't grab a hugetlb_instantiation_mutex, but only mmap_sem (exclusively). This doesn't prevent other tasks from modifying the region structure, so it can be modified by two processes concurrently. To solve this, introduce a spinlock to resv_map and make region manipulation function grab it before they do actual work. [davidlohr@hp.com: updated changelog] Signed-off-by: Davidlohr Bueso Signed-off-by: Joonsoo Kim Suggested-by: Joonsoo Kim Acked-by: David Gibson Cc: David Gibson Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f62c2f6c6059..5b337cf8fb86 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -27,6 +27,7 @@ struct hugepage_subpool { struct resv_map { struct kref refs; + spinlock_t lock; struct list_head regions; }; extern struct resv_map *resv_map_alloc(void); -- cgit From 6a3ed2123a78de22a9e2b2855068a8d89f8e14f4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:34 -0700 Subject: mm: vmstat: fix UP zone state accounting Summary: The VM maintains cached filesystem pages on two types of lists. One list holds the pages recently faulted into the cache, the other list holds pages that have been referenced repeatedly on that first list. The idea is to prefer reclaiming young pages over those that have shown to benefit from caching in the past. We call the recently used list "inactive list" and the frequently used list "active list". Currently, the VM aims for a 1:1 ratio between the lists, which is the "perfect" trade-off between the ability to *protect* frequently used pages and the ability to *detect* frequently used pages. This means that working set changes bigger than half of cache memory go undetected and thrash indefinitely, whereas working sets bigger than half of cache memory are unprotected against used-once streams that don't even need caching. This happens on file servers and media streaming servers, where the popular files and file sections change over time. Even though the individual files might be smaller than half of memory, concurrent access to many of them may still result in their inter-reference distance being greater than half of memory. It's also been reported as a problem on database workloads that switch back and forth between tables that are bigger than half of memory. In these cases the VM never recognizes the new working set and will for the remainder of the workload thrash disk data which could easily live in memory. Historically, every reclaim scan of the inactive list also took a smaller number of pages from the tail of the active list and moved them to the head of the inactive list. This model gave established working sets more gracetime in the face of temporary use-once streams, but ultimately was not significantly better than a FIFO policy and still thrashed cache based on eviction speed, rather than actual demand for cache. This series solves the problem by maintaining a history of pages evicted from the inactive list, enabling the VM to detect frequently used pages regardless of inactive list size and facilitate working set transitions. Tests: The reported database workload is easily demonstrated on a 8G machine with two filesets a 6G. This fio workload operates on one set first, then switches to the other. The VM should obviously always cache the set that the workload is currently using. This test is based on a problem encountered by Citus Data customers: http://citusdata.com/blog/72-linux-memory-manager-and-your-big-data unpatched: db1: READ: io=98304MB, aggrb=885559KB/s, minb=885559KB/s, maxb=885559KB/s, mint= 113672msec, maxt= 113672msec db2: READ: io=98304MB, aggrb= 66169KB/s, minb= 66169KB/s, maxb= 66169KB/s, mint=1521302msec, maxt=1521302msec sdb: ios=835750/4, merge=2/1, ticks=4659739/60016, in_queue=4719203, util=98.92% real 27m15.541s user 0m19.059s sys 0m51.459s patched: db1: READ: io=98304MB, aggrb=877783KB/s, minb=877783KB/s, maxb=877783KB/s, mint=114679msec, maxt=114679msec db2: READ: io=98304MB, aggrb=397449KB/s, minb=397449KB/s, maxb=397449KB/s, mint=253273msec, maxt=253273msec sdb: ios=170587/4, merge=2/1, ticks=954910/61123, in_queue=1015923, util=90.40% real 6m8.630s user 0m14.714s sys 0m31.233s As can be seen, the unpatched kernel simply never adapts to the workingset change and db2 is stuck indefinitely with secondary storage speed. The patched kernel needs 2-3 iterations over db2 before it replaces db1 and reaches full memory speed. Given the unbounded negative affect of the existing VM behavior, these patches should be considered correctness fixes rather than performance optimizations. Another test resembles a fileserver or streaming server workload, where data in excess of memory size is accessed at different frequencies. There is very hot data accessed at a high frequency. Machines should be fitted so that the hot set of such a workload can be fully cached or all bets are off. Then there is a very big (compared to available memory) set of data that is used-once or at a very low frequency; this is what drives the inactive list and does not really benefit from caching. Lastly, there is a big set of warm data in between that is accessed at medium frequencies and benefits from caching the pages between the first and last streamer of each burst. unpatched: hot: READ: io=128000MB, aggrb=160693KB/s, minb=160693KB/s, maxb=160693KB/s, mint=815665msec, maxt=815665msec warm: READ: io= 81920MB, aggrb=109853KB/s, minb= 27463KB/s, maxb= 29244KB/s, mint=717110msec, maxt=763617msec cold: READ: io= 30720MB, aggrb= 35245KB/s, minb= 35245KB/s, maxb= 35245KB/s, mint=892530msec, maxt=892530msec sdb: ios=797960/4, merge=11763/1, ticks=4307910/796, in_queue=4308380, util=100.00% patched: hot: READ: io=128000MB, aggrb=160678KB/s, minb=160678KB/s, maxb=160678KB/s, mint=815740msec, maxt=815740msec warm: READ: io= 81920MB, aggrb=147747KB/s, minb= 36936KB/s, maxb= 40960KB/s, mint=512000msec, maxt=567767msec cold: READ: io= 30720MB, aggrb= 40960KB/s, minb= 40960KB/s, maxb= 40960KB/s, mint=768000msec, maxt=768000msec sdb: ios=596514/4, merge=9341/1, ticks=2395362/997, in_queue=2396484, util=79.18% In both kernels, the hot set is propagated to the active list and then served from cache. In both kernels, the beginning of the warm set is propagated to the active list as well, but in the unpatched case the active list eventually takes up half of memory and no new pages from the warm set get activated, despite repeated access, and despite most of the active list soon being stale. The patched kernel on the other hand detects the thrashing and manages to keep this cache window rolling through the data set. This frees up enough IO bandwidth that the cold set is served at full speed as well and disk utilization even drops by 20%. For reference, this same test was performed with the traditional demotion mechanism, where deactivation is coupled to inactive list reclaim. However, this had the same outcome as the unpatched kernel: while the warm set does indeed get activated continuously, it is forced out of the active list by inactive list pressure, which is dictated primarily by the unrelated cold set. The warm set is evicted before subsequent streamers can benefit from it, even though there would be enough space available to cache the pages of interest. Costs: Page reclaim used to shrink the radix trees but now the tree nodes are reused for shadow entries, where the cost depends heavily on the page cache access patterns. However, with workloads that maintain spatial or temporal locality, the shadow entries are either refaulted quickly or reclaimed along with the inode object itself. Workloads that will experience a memory cost increase are those that don't really benefit from caching in the first place. A more predictable alternative would be a fixed-cost separate pool of shadow entries, but this would incur relatively higher memory cost for well-behaved workloads at the benefit of cornercases. It would also make the shadow entry lookup more costly compared to storing them directly in the cache structure. Future: To simplify the merging process, this patch set is implementing thrash detection on a global per-zone level only for now, but the design is such that it can be extended to memory cgroups as well. All we need to do is store the unique cgroup ID along the node and zone identifier inside the eviction cookie to identify the lruvec. Right now we have a fixed ratio (50:50) between inactive and active list but we already have complaints about working sets exceeding half of memory being pushed out of the cache by simple streaming in the background. Ultimately, we want to adjust this ratio and allow for a much smaller inactive list. These patches are an essential step in this direction because they decouple the VMs ability to detect working set changes from the inactive list size. This would allow us to base the inactive list size on the combined readahead window size for example and potentially protect a much bigger working set. It's also a big step towards activating pages with a reuse distance larger than memory, as long as they are the most frequently used pages in the workload. This will require knowing more about the access frequency of active pages than what we measure right now, so it's also deferred in this series. Another possibility of having thrashing information would be to revisit the idea of local reclaim in the form of zero-config memory control groups. Instead of having allocating tasks go straight to global reclaim, they could try to reclaim the pages in the memcg they are part of first as long as the group is not thrashing. This would allow a user to drop e.g. a back-up job in an otherwise unconfigured memcg and it would only inflate (and possibly do global reclaim) until it has enough memory to do proper readahead. But once it reaches that point and stops thrashing it would just recycle its own used-once pages without kicking out the cache of any other tasks in the system more than necessary. This patch (of 10): Fengguang Wu's build testing spotted problems with inc_zone_state() and dec_zone_state() on UP configurations in out-of-tree patches. inc_zone_state() is declared but not defined, dec_zone_state() is missing entirely. Just like with *_zone_page_state(), they can be defined like their preemption-unsafe counterparts on UP. [akpm@linux-foundation.org: make it build] Signed-off-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Minchan Kim Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Rik van Riel Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 67ce70c8279b..ea4476157e00 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -187,8 +187,6 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) -extern void inc_zone_state(struct zone *, enum zone_stat_item); - #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int); void __inc_zone_page_state(struct page *, enum zone_stat_item); @@ -230,18 +228,18 @@ static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) atomic_long_inc(&vm_stat[item]); } -static inline void __inc_zone_page_state(struct page *page, - enum zone_stat_item item) -{ - __inc_zone_state(page_zone(page), item); -} - static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_stat[item]); } +static inline void __inc_zone_page_state(struct page *page, + enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} + static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { @@ -256,6 +254,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +#define inc_zone_state __inc_zone_state +#define dec_zone_state __dec_zone_state + #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } -- cgit From 53c59f262d747ea82e7414774c59a489501186a0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:39 -0700 Subject: lib: radix-tree: add radix_tree_delete_item() Provide a function that does not just delete an entry at a given index, but also allows passing in an expected item. Delete only if that item is still located at the specified index. This is handy when lockless tree traversals want to delete entries as well because they don't have to do an second, locked lookup to verify the slot has not changed under them before deleting the entry. Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 403940787be1..1bf0a9c388d9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, -- cgit From e7b563bb2a6f4d974208da46200784b9c5b5a47e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:44 -0700 Subject: mm: filemap: move radix tree hole searching here The radix tree hole searching code is only used for page cache, for example the readahead code trying to get a a picture of the area surrounding a fault. It sufficed to rely on the radix tree definition of holes, which is "empty tree slot". But this is about to change, though, as shadow page descriptors will be stored in the page cache after the actual pages get evicted from memory. Move the functions over to mm/filemap.c and make them native page cache operations, where they can later be adapted to handle the new definition of "page cache hole". Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 5 +++++ include/linux/radix-tree.h | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1710d1b060ba..52d56872fe26 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -243,6 +243,11 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) typedef int filler_t(void *, struct page *); +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); + extern struct page * find_get_page(struct address_space *mapping, pgoff_t index); extern struct page * find_lock_page(struct address_space *mapping, diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 1bf0a9c388d9..e8be53ecfc45 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -227,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); -- cgit From 0cd6144aadd2afd19d1aca880153530c52957604 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:46 -0700 Subject: mm + fs: prepare for non-page entries in page cache radix trees shmem mappings already contain exceptional entries where swap slot information is remembered. To be able to store eviction information for regular page cache, prepare every site dealing with the radix trees directly to handle entries other than pages. The common lookup functions will filter out non-page entries and return NULL for page cache holes, just as before. But provide a raw version of the API which returns non-page entries as well, and switch shmem over to use it. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++++++ include/linux/pagemap.h | 15 +++++++++------ include/linux/pagevec.h | 5 +++++ include/linux/shmem_fs.h | 1 + 4 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2eec61fe75c9..b1331aff769c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1041,6 +1041,14 @@ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); int shmem_zero_setup(struct vm_area_struct *); +#ifdef CONFIG_SHMEM +bool shmem_mapping(struct address_space *mapping); +#else +static inline bool shmem_mapping(struct address_space *mapping) +{ + return false; +} +#endif extern int can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 52d56872fe26..493bfd85214e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -248,12 +248,15 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, pgoff_t page_cache_prev_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan); -extern struct page * find_get_page(struct address_space *mapping, - pgoff_t index); -extern struct page * find_lock_page(struct address_space *mapping, - pgoff_t index); -extern struct page * find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); +struct page *find_get_page(struct address_space *mapping, pgoff_t offset); +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset); +struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, + gfp_t gfp_mask); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + unsigned int nr_entries, struct page **entries, + pgoff_t *indices); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index e4dbfab37729..b45d391b4540 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -22,6 +22,11 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_entries, + pgoff_t *indices); +void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 9d55438bc4ad..4d1771c2d29f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -51,6 +51,7 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +extern bool shmem_mapping(struct address_space *mapping); extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -- cgit From 91b0abe36a7b2b3b02d7500925a5f8455334f0e5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:49 -0700 Subject: mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + include/linux/mm.h | 1 + include/linux/pagemap.h | 13 ++++++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e765d28841b..3ca9420f627e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -419,6 +419,7 @@ struct address_space { struct mutex i_mmap_mutex; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ + unsigned long nrshadows; /* number of shadow entries */ pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b1331aff769c..58d6ddba5db3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1834,6 +1834,7 @@ vm_unmapped_area(struct vm_unmapped_area_info *info) extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); +extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 493bfd85214e..ff43253f568a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -25,6 +25,7 @@ enum mapping_flags { AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */ + AS_EXITING = __GFP_BITS_SHIFT + 5, /* final truncate in progress */ }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -69,6 +70,16 @@ static inline int mapping_balloon(struct address_space *mapping) return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags); } +static inline void mapping_set_exiting(struct address_space *mapping) +{ + set_bit(AS_EXITING, &mapping->flags); +} + +static inline int mapping_exiting(struct address_space *mapping) +{ + return test_bit(AS_EXITING, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; @@ -547,7 +558,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __delete_from_page_cache(struct page *page); +extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* -- cgit From a528910e12ec7ee203095eb1711468a66b9b60b0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:51 -0700 Subject: mm: thrash detection-based file cache sizing The VM maintains cached filesystem pages on two types of lists. One list holds the pages recently faulted into the cache, the other list holds pages that have been referenced repeatedly on that first list. The idea is to prefer reclaiming young pages over those that have shown to benefit from caching in the past. We call the recently usedbut ultimately was not significantly better than a FIFO policy and still thrashed cache based on eviction speed, rather than actual demand for cache. This patch solves one half of the problem by decoupling the ability to detect working set changes from the inactive list size. By maintaining a history of recently evicted file pages it can detect frequently used pages with an arbitrarily small inactive list size, and subsequently apply pressure on the active list based on actual demand for cache, not just overall eviction speed. Every zone maintains a counter that tracks inactive list aging speed. When a page is evicted, a snapshot of this counter is stored in the now-empty page cache radix tree slot. On refault, the minimum access distance of the page can be assessed, to evaluate whether the page should be part of the active list or not. This fixes the VM's blindness towards working set changes in excess of the inactive list. And it's the foundation to further improve the protection ability and reduce the minimum inactive list size of 50%. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Bob Liu Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +++++ include/linux/swap.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9b61b9bf81ac..f25db1d74a21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -142,6 +142,8 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + WORKINGSET_REFAULT, + WORKINGSET_ACTIVATE, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; @@ -392,6 +394,9 @@ struct zone { spinlock_t lru_lock; struct lruvec lruvec; + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; + unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 46ba0c6c219f..b83cf61403ed 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -260,6 +260,11 @@ struct swap_list_t { int next; /* swapfile to be used next */ }; +/* linux/mm/workingset.c */ +void *workingset_eviction(struct address_space *mapping, struct page *page); +bool workingset_refault(void *shadow); +void workingset_activation(struct page *page); + /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; -- cgit From 139e561660fe11e0fc35e142a800df3dd7d03e9d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:54 -0700 Subject: lib: radix_tree: tree node interface Make struct radix_tree_node part of the public interface and provide API functions to create, look up, and delete whole nodes. Refactor the existing insert, look up, delete functions on top of these new node primitives. This will allow the VM to track and garbage collect page cache radix tree nodes. [sasha.levin@oracle.com: return correct error code on insertion failure] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e8be53ecfc45..13636c40bc42 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -60,6 +60,33 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int height; /* Height from the bottom */ + unsigned int count; + union { + struct radix_tree_node *parent; /* Used when ascending tree */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) + /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -101,6 +128,7 @@ do { \ * concurrently with other readers. * * The notable exceptions to this rule are the following functions: + * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get @@ -216,9 +244,15 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) rcu_assign_pointer(*pslot, item); } +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int -- cgit From 449dd6984d0e47643c04c807f609dd56d48d5bcc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:56 -0700 Subject: mm: keep page cache radix tree nodes in check Previously, page cache radix tree nodes were freed after reclaim emptied out their page pointers. But now reclaim stores shadow entries in their place, which are only reclaimed when the inodes themselves are reclaimed. This is problematic for bigger files that are still in use after they have a significant amount of their cache reclaimed, without any of those pages actually refaulting. The shadow entries will just sit there and waste memory. In the worst case, the shadow entries will accumulate until the machine runs out of memory. To get this under control, the VM will track radix tree nodes exclusively containing shadow entries on a per-NUMA node list. Per-NUMA rather than global because we expect the radix tree nodes themselves to be allocated node-locally and we want to reduce cross-node references of otherwise independent cache workloads. A simple shrinker will then reclaim these nodes on memory pressure. A few things need to be stored in the radix tree node to implement the shadow node LRU and allow tree deletions coming from the list: 1. There is no index available that would describe the reverse path from the node up to the tree root, which is needed to perform a deletion. To solve this, encode in each node its offset inside the parent. This can be stored in the unused upper bits of the same member that stores the node's height at no extra space cost. 2. The number of shadow entries needs to be counted in addition to the regular entries, to quickly detect when the node is ready to go to the shadow node LRU list. The current entry count is an unsigned int but the maximum number of entries is 64, so a shadow counter can easily be stored in the unused upper bits. 3. Tree modification needs tree lock and tree root, which are located in the address space, so store an address_space backpointer in the node. The parent pointer of the node is in a union with the 2-word rcu_head, so the backpointer comes at no extra cost as well. 4. The node needs to be linked to an LRU list, which requires a list head inside the node. This does increase the size of the node, but it does not change the number of objects that fit into a slab page. [akpm@linux-foundation.org: export the right function] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 8 +++++++- include/linux/mmzone.h | 1 + include/linux/radix-tree.h | 32 ++++++++++++++++++++++++-------- include/linux/swap.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 3ce541753c88..f3434533fbf8 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -13,6 +13,8 @@ /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ + LRU_REMOVED_RETRY, /* item removed, but lock has been + dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock @@ -32,7 +34,11 @@ struct list_lru { }; void list_lru_destroy(struct list_lru *lru); -int list_lru_init(struct list_lru *lru); +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); +static inline int list_lru_init(struct list_lru *lru) +{ + return list_lru_init_key(lru, NULL); +} /** * list_lru_add: add an element to the lru list's tail diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f25db1d74a21..fac5509c18f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -144,6 +144,7 @@ enum zone_stat_item { #endif WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, + WORKINGSET_NODERECLAIM, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 13636c40bc42..33170dbd9db4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_TAG_LONGS \ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) + +/* Height component in node->path */ +#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) +#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) + +/* Internally used bits of node->count */ +#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) +#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) + struct radix_tree_node { - unsigned int height; /* Height from the bottom */ + unsigned int path; /* Offset in parent & height from the bottom */ unsigned int count; union { - struct radix_tree_node *parent; /* Used when ascending tree */ - struct rcu_head rcu_head; /* Used when freeing node */ + struct { + /* Used when ascending tree */ + struct radix_tree_node *parent; + /* For tree user */ + void *private_data; + }; + /* Used when freeing node */ + struct rcu_head rcu_head; }; + /* For tree user */ + struct list_head private_list; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ - RADIX_TREE_MAP_SHIFT)) - /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, +bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index b83cf61403ed..350711560753 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -264,6 +264,37 @@ struct swap_list_t { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); +extern struct list_lru workingset_shadow_nodes; + +static inline unsigned int workingset_node_pages(struct radix_tree_node *node) +{ + return node->count & RADIX_TREE_COUNT_MASK; +} + +static inline void workingset_node_pages_inc(struct radix_tree_node *node) +{ + node->count++; +} + +static inline void workingset_node_pages_dec(struct radix_tree_node *node) +{ + node->count--; +} + +static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) +{ + return node->count >> RADIX_TREE_COUNT_SHIFT; +} + +static inline void workingset_node_shadows_inc(struct radix_tree_node *node) +{ + node->count += 1U << RADIX_TREE_COUNT_SHIFT; +} + +static inline void workingset_node_shadows_dec(struct radix_tree_node *node) +{ + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; +} /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; -- cgit From c558784fa9e65363ed32cf6ff137c89bc8b54f81 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:07 -0700 Subject: include/linux/mm.h: remove ifdef condition The ifdef conditions in include/linux/mm.h presents three cases: - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) There is no actual definition of function but include/linux/mm.h has a static inline stub defined. - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) linux/mm.h does not define a prototype, but mm/page_alloc.c defines the function. Hence, compiler reports the following warning: mm/page_alloc.c:4300:15: warning: no previous prototype for `__early_pfn_to_nid' [-Wmissing-prototypes] - defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) The architecture defines the function, and linux/mm.h has a prototype. Thus, join the conditions of Case 2 and 3 ie eliminate the ifdef condition of CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID to eliminate the missing prototype warning from file mm/page_alloc.c. Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 58d6ddba5db3..35300f390eb6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1666,10 +1666,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn) #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* there is a per-arch backend function. */ extern int __meminit __early_pfn_to_nid(unsigned long pfn); -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -- cgit From 67f9fd91f93c582b7de2ab9325b6e179db77e4d5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Apr 2014 14:48:18 -0700 Subject: mm: remove read_cache_page_async() This patch removes read_cache_page_async() which wasn't really needed anywhere and simplifies the code around it a bit. read_cache_page_async() is useful when we want to read a page into the cache without waiting for it to complete. This happens when the appropriate callback 'filler' doesn't complete its read operation and releases the page lock immediately, and instead queues a different completion routine to do that. This never actually happened anywhere in the code. read_cache_page_async() had 3 different callers: - read_cache_page() which is the sync version, it would just wait for the requested read to complete using wait_on_page_read(). - JFFS2 would call it from jffs2_gc_fetch_page(), but the filler function it supplied doesn't do any async reads, and would complete before the filler function returns - making it actually a sync read. - CRAMFS would call it using the read_mapping_page_async() wrapper, with a similar story to JFFS2 - the filler function doesn't do anything that reminds async reads and would always complete before the filler function returns. To sum it up, the code in mm/filemap.c never took advantage of having read_cache_page_async(). While there are filler callbacks that do async reads (such as the block one), we always called it with the read_cache_page(). This patch adds a mandatory wait for read to complete when adding a new page to the cache, and removes read_cache_page_async() and its wrappers. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ff43253f568a..45598f1e9aa3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -289,8 +289,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping, extern struct page * grab_cache_page_nowait(struct address_space *mapping, pgoff_t index); -extern struct page * read_cache_page_async(struct address_space *mapping, - pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, @@ -298,14 +296,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping, extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); -static inline struct page *read_mapping_page_async( - struct address_space *mapping, - pgoff_t index, void *data) -{ - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return read_cache_page_async(mapping, index, filler, data); -} - static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { -- cgit From 5509a5d27b971a90b940e148ca9ca53312e4fa7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 3 Apr 2014 14:48:19 -0700 Subject: drop_caches: add some documentation and info message There is plenty of anecdotal evidence and a load of blog posts suggesting that using "drop_caches" periodically keeps your system running in "tip top shape". Perhaps adding some kernel documentation will increase the amount of accurate data on its use. If we are not shrinking caches effectively, then we have real bugs. Using drop_caches will simply mask the bugs and make them harder to find, but certainly does not fix them, nor is it an appropriate "workaround" to limit the size of the caches. On the contrary, there have been bug reports on issues that turned out to be misguided use of cache dropping. Dropping caches is a very drastic and disruptive operation that is good for debugging and running tests, but if it creates bug reports from production use, kernel developers should be aware of its use. Add a bit more documentation about it, a syslog message to track down abusers, and vmstat drop counters to help analyze problem reports. [akpm@linux-foundation.org: checkpatch fixes] [hannes@cmpxchg.org: add runtime suppression control] Signed-off-by: Dave Hansen Signed-off-by: Michal Hocko Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3a712e2e7d76..486c3972c0be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, PAGEOUTRUN, ALLOCSTALL, PGROTATED, + DROP_PAGECACHE, DROP_SLAB, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, NUMA_HUGE_PTE_UPDATES, -- cgit From bcccff93af359533683603255124fc19eb12613d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 3 Apr 2014 14:48:21 -0700 Subject: kobject: don't block for each kobject_uevent Currently kobject_uevent has somewhat unpredictable semantics. The point is, since it may call a usermode helper and wait for it to execute (UMH_WAIT_EXEC), it is impossible to say for sure what lock dependencies it will introduce for the caller - strictly speaking it depends on what fs the binary is located on and the set of locks fork may take. There are quite a few kobject_uevent's users that do not take this into account and call it with various mutexes taken, e.g. rtnl_mutex, net_mutex, which might potentially lead to a deadlock. Since there is actually no reason to wait for the usermode helper to execute there, let's make kobject_uevent start the helper asynchronously with the aid of the UMH_NO_WAIT flag. Personally, I'm interested in this, because I really want kobject_uevent to be called under the slab_mutex in the slub implementation as it used to be some time ago, because it greatly simplifies synchronization and automatically fixes a kmemcg-related race. However, there was a deadlock detected on an attempt to call kobject_uevent under the slab_mutex (see https://lkml.org/lkml/2012/1/14/45), which was reported to be fixed by releasing the slab_mutex for kobject_uevent. Unfortunately, there was no information about who exactly blocked on the slab_mutex causing the usermode helper to stall, neither have I managed to find this out or reproduce the issue. BTW, this is not the first attempt to make kobject_uevent use UMH_NO_WAIT. Previous one was made by commit f520360d93cd ("kobject: don't block for each kobject_uevent"), but it was wrong (it passed arguments allocated on stack to async thread) so it was reverted in 05f54c13cd0c ("Revert "kobject: don't block for each kobject_uevent"."). It targeted on speeding up the boot process though. Signed-off-by: Vladimir Davydov Cc: Greg KH Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kobject.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 926afb6f6b5f..f896a33e8341 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -119,6 +119,7 @@ struct kobj_type { }; struct kobj_uevent_env { + char *argv[3]; char *envp[UEVENT_NUM_ENVP]; int envp_idx; char buf[UEVENT_BUFFER_SIZE]; -- cgit From e3a0cfdc8c8904236f91a506c1e760b0d60dd918 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:48:24 -0700 Subject: include/linux/syscalls.h: add sys32_quotactl() prototype This eliminates the following warning in quota/compat.c: fs/quota/compat.c:43:17: warning: no previous prototype for `sys32_quotactl' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1e67b7a5968c..2aa8b749f13d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -200,6 +200,8 @@ extern struct trace_event_functions exit_syscall_print_funcs; } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, -- cgit From 8f6c5ffc8987f4f5b5a3e9d557d94bbf3a9bf216 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 3 Apr 2014 14:48:26 -0700 Subject: kernel/groups.c: remove return value of set_groups After commit 6307f8fee295 ("security: remove dead hook task_setgroups"), set_groups will always return zero, so we could just remove return value of set_groups. This patch reduces code size, and simplfies code to use set_groups, because we don't need to check its return value any more. [akpm@linux-foundation.org: remove obsolete claims from set_groups() comment] Signed-off-by: Wang YanQing Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 04421e825365..f61d6c8f5ef3 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -66,7 +66,7 @@ extern struct group_info *groups_alloc(int); extern struct group_info init_groups; extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); -extern int set_groups(struct cred *, struct group_info *); +extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); /* access the groups "array" with this macro */ -- cgit From a5ed3cee944abf2c80ae146b06f9055d7da03f94 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 3 Apr 2014 14:48:31 -0700 Subject: err.h: use bool for IS_ERR and IS_ERR_OR_NULL Use the more natural return of bool for these tests. No difference observed in .o files produced by gcc for x86. Remove the dentry description of kernel pointers left over from the 90's and 2002's cleanup move of parts of fs.h to err.h. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/err.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/err.h b/include/linux/err.h index 15f92e072450..a729120644d5 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -2,12 +2,13 @@ #define _LINUX_ERR_H #include +#include #include /* * Kernel pointers have redundant information, so we can use a - * scheme where we can return either an error code or a dentry + * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different @@ -29,12 +30,12 @@ static inline long __must_check PTR_ERR(__force const void *ptr) return (long) ptr; } -static inline long __must_check IS_ERR(__force const void *ptr) +static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } -static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr) +static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); } -- cgit From ea1a8217b06b41b31a2b60b0b83f75c77ef9c873 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Thu, 3 Apr 2014 14:48:33 -0700 Subject: xattr: guard against simultaneous glibc header inclusion If the glibc xattr.h header is included after the uapi header, compilation fails due to an enum re-using a #define from the uapi header. Protect against this by guarding the define and enum inclusions against each other. (See https://lists.debian.org/debian-glibc/2014/03/msg00029.html and https://sourceware.org/glibc/wiki/Synchronizing_Headers for more information.) Signed-off-by: Serge Hallyn Cc: Andrew Morton Cc: Allan McRae Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/libc-compat.h | 9 +++++++++ include/uapi/linux/xattr.h | 7 +++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index 335e8a7cad39..c140620dad92 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -85,6 +85,12 @@ #endif /* _NETINET_IN_H */ +/* Definitions for xattr.h */ +#if defined(_SYS_XATTR_H) +#define __UAPI_DEF_XATTR 0 +#else +#define __UAPI_DEF_XATTR 1 +#endif /* If we did not see any headers from any supported C libraries, * or we are being included in the kernel, then define everything @@ -98,6 +104,9 @@ #define __UAPI_DEF_IPV6_MREQ 1 #define __UAPI_DEF_IPPROTO_V6 1 +/* Definitions for xattr.h */ +#define __UAPI_DEF_XATTR 1 + #endif /* __GLIBC__ */ #endif /* _UAPI_LIBC_COMPAT_H */ diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index 40bbc04b6f81..c38355c1f3c9 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -7,11 +7,18 @@ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris */ + +#include + #ifndef _UAPI_LINUX_XATTR_H #define _UAPI_LINUX_XATTR_H +#ifdef __UAPI_DEF_XATTR +#define __USE_KERNEL_XATTR_DEFS + #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ +#endif /* Namespaces */ #define XATTR_OS2_PREFIX "os2." -- cgit From 0185698c0f543d4e8677bb97227ebe46024e3eb8 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:38 -0700 Subject: printk: remove duplicated check for log level The check for the exact log level is already done in printk_get_level. We do not need to duplicate it in printk_skip_level. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index fa47e2708c01..8860575899f2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -24,13 +24,9 @@ static inline int printk_get_level(const char *buffer) static inline const char *printk_skip_level(const char *buffer) { - if (printk_get_level(buffer)) { - switch (buffer[1]) { - case '0' ... '7': - case 'd': /* KERN_DEFAULT */ - return buffer + 2; - } - } + if (printk_get_level(buffer)) + return buffer + 2; + return buffer; } -- cgit From d487d57581057abd271651334ea2996aa1b31e28 Mon Sep 17 00:00:00 2001 From: Simon Kågström Date: Thu, 3 Apr 2014 14:48:44 -0700 Subject: include/linux/printk.h: remove double asmlinkage in printk_emit The double asmlinkage was introduced in commit 7ff9554bb578 ("printk: convert byte-buffer to variable-length record buffer"). Signed-off-by: Simon Kagstrom Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8860575899f2..8752f7595b27 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -120,9 +120,9 @@ asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); asmlinkage __printf(5, 6) __cold -asmlinkage int printk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, ...); +int printk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, ...); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); -- cgit From a55944ca82d287ca099ca90413af857af9086773 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Thu, 3 Apr 2014 14:48:54 -0700 Subject: backlight: update bd state & fb_blank properties when necessary We don't have to update the state and fb_blank properties of a backlight device every time a blanking or unblanking event comes because they may have already been what we want. Another thought is that one backlight device may be shared by multiple framebuffers. The backlight driver should take the backlight device as a resource shared by all the associated framebuffers. This patch adds some logic to record each framebuffer's backlight usage to determine the backlight device use count and whether the two properties should be updated or not. To be more specific, only one unblank operation on a certain blanked framebuffer may increase the backlight device's use count by one, while one blank operation on a certain unblanked framebuffer may decrease the use count by one, because the userspace is likely to unblank an unblanked framebuffer or blank a blanked framebuffer. Signed-off-by: Liu Ying Cc: Jingoo Han Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Tomi Valkeinen Cc: Jani Nikula Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backlight.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 5f9cd963213d..72647429adf6 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -9,6 +9,7 @@ #define _LINUX_BACKLIGHT_H #include +#include #include #include @@ -104,6 +105,11 @@ struct backlight_device { struct list_head entry; struct device dev; + + /* Multiple framebuffers may share one backlight device */ + bool fb_bl_on[FB_MAX]; + + int use_count; }; static inline void backlight_update_status(struct backlight_device *bd) -- cgit From af661e836039cbc2aadb3bccfd7d3ddfc1bcc456 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 3 Apr 2014 14:49:10 -0700 Subject: lib/decompress_inflate.c: include appropriate header file Include appropriate header file include/linux/decompress/inflate.h in lib/decompress_inflate.c because it has prototype declaration of function defined in lib/decompress_inflate.c. Also, fix the guard around the header file include/linux/decompress/inflate.h to use a more unique guard symbol. This avoids conflict with the INFLATE_H defined by zlib_inflate/inflate.h. This eliminates the following warning in lib/decompress_inflate.c: lib/decompress_inflate.c:35:17: warning: no previous prototype for `gunzip' [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/inflate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h index 8c0aef1ba5f5..1d0aedef9822 100644 --- a/include/linux/decompress/inflate.h +++ b/include/linux/decompress/inflate.h @@ -1,5 +1,5 @@ -#ifndef INFLATE_H -#define INFLATE_H +#ifndef LINUX_DECOMPRESS_INFLATE_H +#define LINUX_DECOMPRESS_INFLATE_H int gunzip(unsigned char *inbuf, int len, int(*fill)(void*, unsigned int), -- cgit From 5a418558cdae4fd0699b7b6a960f03a8bb38a338 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 3 Apr 2014 14:50:13 -0700 Subject: rtc: pm8xxx: add support for devicetree Add support for describing the PM8921/PM8058 RTC in device tree. Additionally: - drop support for describing the RTC using platform data, as there are no current in tree users who do so. - make allow_set_time a device-specific flag, instead of mucking with the rtc_ops Signed-off-by: Josh Cartwright Reviewed-by: Stephen Boyd Acked-by: Lee Jones Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mfd/pm8xxx/rtc.h | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 include/linux/mfd/pm8xxx/rtc.h (limited to 'include') diff --git a/include/linux/mfd/pm8xxx/rtc.h b/include/linux/mfd/pm8xxx/rtc.h deleted file mode 100644 index 14f1983eaecc..000000000000 --- a/include/linux/mfd/pm8xxx/rtc.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __RTC_PM8XXX_H__ -#define __RTC_PM8XXX_H__ - -#define PM8XXX_RTC_DEV_NAME "rtc-pm8xxx" -/** - * struct pm8xxx_rtc_pdata - RTC driver platform data - * @rtc_write_enable: variable stating RTC write capability - */ -struct pm8xxx_rtc_platform_data { - bool rtc_write_enable; -}; - -#endif /* __RTC_PM8XXX_H__ */ -- cgit From 90ccf7dde9527672553c3a809aa93654c95672d0 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:26 -0700 Subject: nilfs2: add struct nilfs_suinfo_update and flags Add the nilfs_suinfo_update structure, which contains the information needed to update one segment usage entry. The flags specify, which fields need to be updated. Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nilfs2_fs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include') diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 98755767c7b0..252657874a19 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -710,6 +710,48 @@ static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) } /* ioctl */ +/** + * nilfs_suinfo_update - segment usage information update + * @sup_segnum: segment number + * @sup_flags: flags for which fields are active in sup_sui + * @sup_reserved: reserved necessary for alignment + * @sup_sui: segment usage information + */ +struct nilfs_suinfo_update { + __u64 sup_segnum; + __u32 sup_flags; + __u32 sup_reserved; + struct nilfs_suinfo sup_sui; +}; + +enum { + NILFS_SUINFO_UPDATE_LASTMOD, + NILFS_SUINFO_UPDATE_NBLOCKS, + NILFS_SUINFO_UPDATE_FLAGS, + __NR_NILFS_SUINFO_UPDATE_FIELDS, +}; + +#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ +static inline void \ +nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ +} \ +static inline void \ +nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ +} \ +static inline int \ +nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ +{ \ + return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ +} + +NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) +NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) +NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) + enum { NILFS_CHECKPOINT, NILFS_SNAPSHOT, -- cgit From 2cc88f3a5f16ae9a3c8f54de1b2fd4a397b36075 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Thu, 3 Apr 2014 14:50:28 -0700 Subject: nilfs2: implementation of NILFS_IOCTL_SET_SUINFO ioctl With this ioctl the segment usage entries in the SUFILE can be updated from userspace. This is useful, because it allows the userspace GC to modify and update segment usage entries for specific segments, which enables it to avoid unnecessary write operations. If a segment needs to be cleaned, but there is no or very little reclaimable space in it, the cleaning operation basically degrades to a useless moving operation. In the end the only thing that changes is the location of the data and a timestamp in the segment usage information. With this ioctl the GC can skip the cleaning and update the segment usage entries directly instead. This is basically a shortcut to cleaning the segment. It is still necessary to read the segment summary information, but the writing of the live blocks can be skipped if it's not worth it. [konishi.ryusuke@lab.ntt.co.jp: add description of NILFS_IOCTL_SET_SUINFO ioctl] Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nilfs2_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 252657874a19..1fb465f9baf2 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -905,5 +905,7 @@ struct nilfs_bdesc { _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) #define NILFS_IOCTL_SET_ALLOC_RANGE \ _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) +#define NILFS_IOCTL_SET_SUINFO \ + _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) #endif /* _LINUX_NILFS_FS_H */ -- cgit From 0ec060d1881a24c270fdf0d6616e33e23a209ef2 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 3 Apr 2014 14:50:31 -0700 Subject: nilfs2: verify metadata sizes read from disk Add code to check sizes of on-disk data of metadata files such as inode size, segment usage size, DAT entry size, and checkpoint size. Although these sizes are read from disk, the current implementation doesn't check them. If these sizes are not sane on disk, it can cause out-of-range access to metadata or memory access overrun on metadata block buffers due to overflow in sundry calculations. Both lower limit and upper limit of metadata sizes are verified to prevent these issues. Signed-off-by: Ryusuke Konishi Cc: Andreas Rohner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nilfs2_fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 1fb465f9baf2..ff3fea3194c6 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -82,6 +82,8 @@ struct nilfs_inode { __le32 i_pad; }; +#define NILFS_MIN_INODE_SIZE 128 + /** * struct nilfs_super_root - structure of super root * @sr_sum: check sum @@ -482,6 +484,8 @@ struct nilfs_dat_entry { __le64 de_rsv; }; +#define NILFS_MIN_DAT_ENTRY_SIZE 32 + /** * struct nilfs_snapshot_list - snapshot list * @ssl_next: next checkpoint number on snapshot list @@ -520,6 +524,8 @@ struct nilfs_checkpoint { struct nilfs_inode cp_ifile_inode; }; +#define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) + /* checkpoint flags */ enum { NILFS_CHECKPOINT_SNAPSHOT, @@ -615,6 +621,8 @@ struct nilfs_segment_usage { __le32 su_flags; }; +#define NILFS_MIN_SEGMENT_USAGE_SIZE 16 + /* segment usage flag */ enum { NILFS_SEGMENT_USAGE_ACTIVE, -- cgit