From d3213e8fd4b0f18dfd438268ff480406ba743abb Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:39:47 -0800 Subject: tracing: add __print_flags_u64() Patch series "DAX tracepoints, mm argument simplification", v4. This contains both my DAX tracepoint code and Dave Jiang's MM argument simplifications. Dave's code was written with my tracepoint code as a baseline, so it seemed simplest to keep them together in a single series. This patch (of 7): Add __print_flags_u64() and the helper trace_print_flags_seq_u64() in the same spirit as __print_symbolic_u64() and trace_print_symbols_seq_u64(). These functions allow us to print symbols associated with flags that are 64 bits wide even on 32 bit machines. These will be used by the DAX code so that we can print the flags set in a pfn_t such as PFN_SG_CHAIN, PFN_SG_LAST, PFN_DEV and PFN_MAP. Without this new function I was getting errors like the following when compiling for i386: include/linux/pfn_t.h:13:22: warning: large integer implicitly truncated to unsigned type [-Woverflow] #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) ^ Link: http://lkml.kernel.org/r/1484085142-2297-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/trace_events.h | 4 ++++ include/trace/trace_events.h | 11 +++++++++++ kernel/trace/trace_output.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0f165507495c..0af63c4381b9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -23,6 +23,10 @@ const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 +const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array); + const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 5c06f4af8323..00f643164ca2 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -283,8 +283,16 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq(p, value, symbols); \ }) +#undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array, { -1, NULL } }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags); \ + }) + #define __print_symbolic_u64(value, symbol_array...) \ ({ \ static const struct trace_print_flags_u64 symbols[] = \ @@ -292,6 +300,9 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq_u64(p, value, symbols); \ }) #else +#define __print_flags_u64(flag, delim, flag_array...) \ + __print_flags(flag, delim, flag_array) + #define __print_symbolic_u64(value, symbol_array...) 
\ __print_symbolic(value, symbol_array) #endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index aea6a1218c7d..070866c32eb9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -123,6 +123,44 @@ trace_print_symbols_seq(struct trace_seq *p, unsigned long val, EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 +const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) -- cgit From 282a8e0391c377b64c7d065b18fc2f6267a24dad Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:39:50 -0800 Subject: dax: add tracepoint infrastructure, PMD tracing Tracepoints are the standard way to capture debugging and tracing information in many parts of the kernel, including the XFS and ext4 filesystems. Create a tracepoint header for FS DAX and add the first DAX tracepoints to the PMD fault handler. This allows the tracing for DAX to be done in the same way as the filesystem tracing so that developers can look at them together and get a coherent idea of what the system is doing. I added both an entry and exit tracepoint because future patches will add tracepoints to child functions of dax_iomap_pmd_fault() like dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to be wrapped by the parent function tracepoints so the code flow is more easily understood. Having entry and exit tracepoints for faults also allows us to easily see what filesystems functions were called during the fault. These filesystem functions get executed via iomap_begin() and iomap_end() calls, for example, and will have their own tracepoints. For PMD faults we primarily want to understand the type of mapping, the fault flags, the faulting address and whether it fell back to 4k faults. If it fell back to 4k faults the tracepoints should let us understand why. I named the new tracepoint header file "fs_dax.h" to allow for device DAX to have its own separate tracing header in the same directory at some point. Here is an example output for these events from a successful PMD fault: big-1441 [005] .... 32.582758: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003 big-1441 [005] .... 32.582776: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 big-1441 [005] .... 
32.583292: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE Link: http://lkml.kernel.org/r/1484085142-2297-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Dave Chinner Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Dave Jiang Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 30 ++++++++++++------- include/linux/mm.h | 25 ++++++++++++++++ include/trace/events/fs_dax.h | 68 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 10 deletions(-) create mode 100644 include/trace/events/fs_dax.h diff --git a/fs/dax.c b/fs/dax.c index e9cf8b4cd234..dc11a2b90731 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -35,6 +35,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -1341,6 +1344,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, loff_t pos; int error; + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0); + /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) goto fallback; @@ -1351,16 +1364,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - /* - * Check whether offset isn't beyond end of file now. Caller is - * supposed to hold locks serializing us with truncate / punch hole so - * this is a reliable test. - */ - pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - - if (pgoff > max_pgoff) - return VM_FAULT_SIGBUS; + if (pgoff > max_pgoff) { + result = VM_FAULT_SIGBUS; + goto out; + } /* If the PMD would extend beyond the file size */ if ((pgoff | PG_PMD_COLOUR) > max_pgoff) @@ -1432,6 +1439,9 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, split_huge_pmd(vma, pmd, address); count_vm_event(THP_FAULT_FALLBACK); } +out: + trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, + result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6ff66d6fe8e2..d22f7837ad90 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,6 +285,17 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_TRACE \ + { FAULT_FLAG_WRITE, "WRITE" }, \ + { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ + { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ + { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ + { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ + { FAULT_FLAG_TRIED, "TRIED" }, \ + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" } + /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. 
The vma's ->fault is responsible for returning a bitmask @@ -1111,6 +1122,20 @@ static inline void clear_page_pfmemalloc(struct page *page) VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) +#define VM_FAULT_RESULT_TRACE \ + { VM_FAULT_OOM, "OOM" }, \ + { VM_FAULT_SIGBUS, "SIGBUS" }, \ + { VM_FAULT_MAJOR, "MAJOR" }, \ + { VM_FAULT_WRITE, "WRITE" }, \ + { VM_FAULT_HWPOISON, "HWPOISON" }, \ + { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ + { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ + { VM_FAULT_NOPAGE, "NOPAGE" }, \ + { VM_FAULT_LOCKED, "LOCKED" }, \ + { VM_FAULT_RETRY, "RETRY" }, \ + { VM_FAULT_FALLBACK, "FALLBACK" }, \ + { VM_FAULT_DONE_COW, "DONE_COW" } + /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h new file mode 100644 index 000000000000..58b0b5612683 --- /dev/null +++ b/include/trace/events/fs_dax.h @@ -0,0 +1,68 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs_dax + +#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_DAX_H + +#include + +DECLARE_EVENT_CLASS(dax_pmd_fault_class, + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, + unsigned long address, unsigned int flags, pgoff_t pgoff, + pgoff_t max_pgoff, int result), + TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(pgoff_t, max_pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end; + __entry->vm_flags = vma->vm_flags; + __entry->address = address; + __entry->flags = flags; + __entry->pgoff = pgoff; + __entry->max_pgoff = max_pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start " + "%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->vm_start, + __entry->vm_end, + __entry->pgoff, + __entry->max_pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PMD_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pmd_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ + unsigned long address, unsigned int flags, pgoff_t pgoff, \ + pgoff_t max_pgoff, int result), \ + TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) + +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); + + +#endif /* _TRACE_FS_DAX_H */ + +/* This part must be outside protection */ +#include -- cgit From e057541acc173f44f191c8df68c75edc658f4688 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:39:54 -0800 Subject: dax: update MAINTAINERS entries for FS DAX Add the new include/trace/events/fs_dax.h tracepoint header, the existing include/linux/dax.h header, update Matthew's email address and add myself as a maintainer for filesystem DAX. 
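As a usage note on the events added above: once tracing is built in, the whole fs_dax group can be enabled and read back through tracefs. Below is a minimal userspace sketch (an illustration, not part of this series); the TRACEFS path is an assumption and may instead be /sys/kernel/debug/tracing on older setups.

/*
 * Editor's sketch, not part of the series: capture the new fs_dax events
 * from userspace via tracefs.  TRACEFS is an assumption; adjust it to
 * wherever tracefs is mounted on your system.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRACEFS "/sys/kernel/tracing"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char line[4096];
	FILE *fp;

	/* enable every event in the fs_dax group */
	if (write_str(TRACEFS "/events/fs_dax/enable", "1"))
		return EXIT_FAILURE;

	/* stream records as the PMD fault handlers emit them */
	fp = fopen(TRACEFS "/trace_pipe", "r");
	if (!fp)
		return EXIT_FAILURE;
	while (fgets(line, sizeof(line), fp))
		if (strstr(line, "dax_pmd_"))
			fputs(line, stdout);
	fclose(fp);
	return 0;
}

A shell equivalent is: echo 1 > $TRACEFS/events/fs_dax/enable; cat $TRACEFS/trace_pipe.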
Link: http://lkml.kernel.org/r/1484085142-2297-4-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Matthew Wilcox Cc: Dave Chinner Cc: Dave Jiang Cc: Jan Kara Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 545633d6663d..5ca9a7e163d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3926,10 +3926,13 @@ S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c DIRECT ACCESS (DAX) -M: Matthew Wilcox +M: Matthew Wilcox +M: Ross Zwisler L: linux-fsdevel@vger.kernel.org S: Supported F: fs/dax.c +F: include/linux/dax.h +F: include/trace/events/fs_dax.h DIRECTORY NOTIFICATION (DNOTIFY) M: Eric Paris -- cgit From 653b2ea3396fda0b5e23a37b3356dd2ca82fe5c1 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:39:57 -0800 Subject: dax: add tracepoints to dax_pmd_load_hole() Add tracepoints to dax_pmd_load_hole(), following the same logging conventions as the tracepoints in dax_iomap_pmd_fault(). Here is an example PMD fault showing the new tracepoints: read_big-1478 [004] .... 238.242188: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003 read_big-1478 [004] .... 238.242191: dax_pmd_fault: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200 max_pgoff 0x1400 read_big-1478 [004] .... 238.242390: dax_pmd_load_hole: dev 259:0 ino 0x1003 shared address 0x10400000 zero_page ffffea0002c20000 radix_entry 0x1e read_big-1478 [004] .... 238.242392: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200 max_pgoff 0x1400 NOPAGE Link: http://lkml.kernel.org/r/1484085142-2297-5-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 14 ++++++++++---- include/trace/events/fs_dax.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index dc11a2b90731..a622961a717c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1299,33 +1299,39 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, { struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = address & PMD_MASK; + struct inode *inode = mapping->host; struct page *zero_page; + void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; - void *ret; zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) - return VM_FAULT_FALLBACK; + goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, RADIX_DAX_PMD | RADIX_DAX_HZP); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); - return VM_FAULT_FALLBACK; + goto fallback; } pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret); return VM_FAULT_NOPAGE; + +fallback: + trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret); + return VM_FAULT_FALLBACK; } int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h 
index 58b0b5612683..43f1263131a4 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -61,6 +61,48 @@ DEFINE_EVENT(dax_pmd_fault_class, name, \ DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); +DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, + unsigned long address, struct page *zero_page, + void *radix_entry), + TP_ARGS(inode, vma, address, zero_page, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(struct page *, zero_page) + __field(void *, radix_entry) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vma->vm_flags; + __entry->address = address; + __entry->zero_page = zero_page; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s address %#lx zero_page %p " + "radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->address, + __entry->zero_page, + (unsigned long)__entry->radix_entry + ) +) + +#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \ +DEFINE_EVENT(dax_pmd_load_hole_class, name, \ + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ + unsigned long address, struct page *zero_page, \ + void *radix_entry), \ + TP_ARGS(inode, vma, address, zero_page, radix_entry)) + +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); #endif /* _TRACE_FS_DAX_H */ -- cgit From 27a7ffaccd915675c88045ba83493804a0267ab7 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:40:00 -0800 Subject: dax: add tracepoints to dax_pmd_insert_mapping() Add tracepoints to dax_pmd_insert_mapping(), following the same logging conventions as the tracepoints in dax_iomap_pmd_fault(). Here is an example PMD fault showing the new tracepoints: big-1504 [001] .... 326.960743: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003 big-1504 [001] .... 326.960753: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 big-1504 [001] .... 326.960981: dax_pmd_insert_mapping: dev 259:0 ino 0x1003 shared write address 0x10505000 length 0x200000 pfn 0x100600 DEV|MAP radix_entry 0xc000e big-1504 [001] .... 
326.960986: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE Link: http://lkml.kernel.org/r/1484085142-2297-6-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 12 +++++++--- include/linux/pfn_t.h | 6 +++++ include/trace/events/fs_dax.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index a622961a717c..06c5dffaa7bc 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1262,15 +1262,16 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, { struct address_space *mapping = vma->vm_file->f_mapping; struct block_device *bdev = iomap->bdev; + struct inode *inode = mapping->host; struct blk_dax_ctl dax = { .sector = dax_iomap_sector(iomap, pos), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); - void *ret; + void *ret = NULL; if (length < 0) /* dax_map_atomic() failed */ - return VM_FAULT_FALLBACK; + goto fallback; if (length < PMD_SIZE) goto unmap_fallback; if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) @@ -1283,13 +1284,18 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, RADIX_DAX_PMD); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; + trace_dax_pmd_insert_mapping(inode, vma, address, write, length, + dax.pfn, ret); return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); unmap_fallback: dax_unmap_atomic(bdev, &dax); +fallback: + trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write, + length, dax.pfn, ret); return VM_FAULT_FALLBACK; } diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a3d90b9da18d..033fc7bbcefa 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -15,6 +15,12 @@ #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4)) +#define PFN_FLAGS_TRACE \ + { PFN_SG_CHAIN, "SG_CHAIN" }, \ + { PFN_SG_LAST, "SG_LAST" }, \ + { PFN_DEV, "DEV" }, \ + { PFN_MAP, "MAP" } + static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags) { pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 43f1263131a4..c3b0aae216dc 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -104,6 +104,57 @@ DEFINE_EVENT(dax_pmd_load_hole_class, name, \ DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); +DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, + unsigned long address, int write, long length, pfn_t pfn, + void *radix_entry), + TP_ARGS(inode, vma, address, write, length, pfn, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(long, length) + __field(u64, pfn_val) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vma->vm_flags; + __entry->address = address; + __entry->write = write; + __entry->length = length; + __entry->pfn_val = pfn.val; 
+ __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx " + "pfn %#llx %s radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? "write" : "read", + __entry->address, + __entry->length, + __entry->pfn_val & ~PFN_FLAGS_MASK, + __print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|", + PFN_FLAGS_TRACE), + (unsigned long)__entry->radix_entry + ) +) + +#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \ +DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ + unsigned long address, int write, long length, pfn_t pfn, \ + void *radix_entry), \ + TP_ARGS(inode, vma, address, write, length, pfn, radix_entry)) + +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit From d8a849e1bc123790bbbf1facba94452a3aef5736 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 22 Feb 2017 15:40:03 -0800 Subject: mm, dax: make pmd_fault() and friends be the same as fault() Instead of passing in multiple parameters in the pmd_fault() handler, a vmf can be passed in just like a fault() handler. This will simplify code and remove the need for the actual pmd fault handlers to allocate a vmf. Related functions are also modified to do the same. [dave.jiang@intel.com: fix issue with xfs_tests stall when DAX option is off] Link: http://lkml.kernel.org/r/148469861071.195597.3619476895250028518.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/1484085142-2297-7-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Dave Jiang Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/dax.c | 16 +++++++--------- fs/dax.c | 28 +++++++++++----------------- fs/ext4/file.c | 9 ++++----- fs/xfs/xfs_file.c | 10 ++++------ include/linux/dax.h | 7 +++---- include/linux/mm.h | 3 +-- include/trace/events/fs_dax.h | 15 +++++++-------- mm/memory.c | 6 ++---- 8 files changed, 39 insertions(+), 55 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index ed758b74ddf0..a8833cc35697 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -473,10 +473,9 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, - struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, - unsigned int flags) + struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long pmd_addr = addr & PMD_MASK; + unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; @@ -508,23 +507,22 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, - flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int rc; struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - 
current->comm, (flags & FAULT_FLAG_WRITE) + current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); + rc = __dax_dev_pmd_fault(dax_dev, vma, vmf); rcu_read_unlock(); return rc; diff --git a/fs/dax.c b/fs/dax.c index 06c5dffaa7bc..01fdbc86ee8c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1340,18 +1340,17 @@ fallback: return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; + unsigned long pmd_addr = vmf->address & PMD_MASK; + bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; - struct vm_fault vmf; void *entry; loff_t pos; int error; @@ -1364,7 +1363,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, pgoff = linear_page_index(vma, pmd_addr); max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0); + trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0); /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) @@ -1408,21 +1407,17 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (IS_ERR(entry)) goto finish_iomap; - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; - switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, - &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, + vmf->address, &iomap, pos, write, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) goto unlock_entry; - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, - &entry); + result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, + &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1448,12 +1443,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, } fallback: if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, - result); + trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 87e11dfe3cde..d3f589b3602c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -273,21 +273,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return result; } -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int +ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = flags & FAULT_FLAG_WRITE; + bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - 
result = dax_iomap_pmd_fault(vma, addr, pmd, flags, - &ext4_iomap_ops); + result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bbb9eb6811b2..44a8d2356e31 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1432,9 +1432,7 @@ xfs_filemap_fault( STATIC int xfs_filemap_pmd_fault( struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) + struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1445,16 +1443,16 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - if (flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (flags & FAULT_FLAG_WRITE) + if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 24ad71173995..a829fee2b42b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, - struct iomap_ops *ops) + struct vm_fault *vmf, struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } diff --git a/include/linux/mm.h b/include/linux/mm.h index d22f7837ad90..0961e95e904e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -351,8 +351,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*pmd_fault)(struct vm_area_struct *, unsigned long address, - pmd_t *, unsigned int flags); + int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c3b0aae216dc..a98665bfb38f 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -8,9 +8,8 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, unsigned int flags, pgoff_t pgoff, - pgoff_t max_pgoff, int result), - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), + struct vm_fault *vmf, pgoff_t max_pgoff, int result), + TP_ARGS(inode, vma, vmf, max_pgoff, result), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_start) @@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->vm_flags = vma->vm_flags; - __entry->address = address; - __entry->flags = flags; - __entry->pgoff = pgoff; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; 
__entry->max_pgoff = max_pgoff; __entry->result = result; ), @@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, #define DEFINE_PMD_FAULT_EVENT(name) \ DEFINE_EVENT(dax_pmd_fault_class, name, \ TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, unsigned int flags, pgoff_t pgoff, \ + struct vm_fault *vmf, \ pgoff_t max_pgoff, int result), \ - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) + TP_ARGS(inode, vma, vmf, max_pgoff, result)) DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); diff --git a/mm/memory.c b/mm/memory.c index 6bf2b471e30c..2376f8528800 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3475,8 +3475,7 @@ static int create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, - vmf->flags); + return vma->vm_ops->pmd_fault(vma, vmf); return VM_FAULT_FALLBACK; } @@ -3485,8 +3484,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, - vmf->pmd, vmf->flags); + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); -- cgit From f42003917b4569a2f4f0c79c35e1e3df2859f81a Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 22 Feb 2017 15:40:06 -0800 Subject: mm, dax: change pmd_fault() to take only vmf parameter pmd_fault() and related functions really only need the vmf parameter since the additional parameters are all included in the vmf struct. Remove the additional parameter and simplify pmd_fault() and friends. 
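To make the resulting calling convention concrete, here is a minimal sketch of a ->pmd_fault handler under the reduced prototype. The driver, its physical base address and the pfn derivation are hypothetical; only the vmf accesses mirror what this series does.

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pfn_t.h>

/* hypothetical base of some device memory region */
#define EXAMPLE_PHYS_BASE 0x100000000ULL

static int example_pmd_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;            /* was a separate arg */
	unsigned long pmd_addr = vmf->address & PMD_MASK; /* was "addr" */
	bool write = vmf->flags & FAULT_FLAG_WRITE;       /* was "flags" */
	pfn_t pfn;

	/* fall back to PTEs if a huge page cannot cover this range */
	if (pmd_addr < vma->vm_start || pmd_addr + PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;

	/* hypothetical, device-specific mapping of the faulting offset */
	pfn = phys_to_pfn_t(EXAMPLE_PHYS_BASE + (pmd_addr - vma->vm_start),
			    PFN_DEV | PFN_MAP);

	/* vmf->pmd replaces the old pmd_t * parameter */
	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, write);
}

The mm/memory.c hunk below shows the matching call sites: create_huge_pmd() and wp_huge_pmd() now simply call vmf->vma->vm_ops->pmd_fault(vmf).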
Link: http://lkml.kernel.org/r/1484085142-2297-8-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Dave Jiang Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Dave Jiang Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/dax.c | 18 +++++++-------- fs/dax.c | 54 ++++++++++++++++++++----------------------- fs/ext4/file.c | 8 +++---- fs/xfs/xfs_file.c | 7 +++--- include/linux/dax.h | 7 +++--- include/linux/mm.h | 2 +- include/trace/events/fs_dax.h | 54 ++++++++++++++++++++----------------------- mm/memory.c | 9 ++++---- 8 files changed, 74 insertions(+), 85 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index a8833cc35697..18e9875f6277 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -472,8 +472,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return rc; } -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, - struct vm_area_struct *vma, struct vm_fault *vmf) +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dax_dev->dev; @@ -482,7 +481,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pgoff_t pgoff; pfn_t pfn; - if (check_vma(dax_dev, vma, __func__)) + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; dax_region = dax_dev->region; @@ -497,7 +496,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, return VM_FAULT_SIGBUS; } - pgoff = linear_page_index(vma, pmd_addr); + pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, @@ -507,22 +506,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int dax_dev_pmd_fault(struct vm_fault *vmf) { int rc; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct dax_dev *dax_dev = filp->private_data; dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? "write" : "read", vma->vm_start, vma->vm_end); + ? 
"write" : "read", + vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vma, vmf); + rc = __dax_dev_pmd_fault(dax_dev, vmf); rcu_read_unlock(); return rc; diff --git a/fs/dax.c b/fs/dax.c index 01fdbc86ee8c..d800197aba34 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1256,11 +1256,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) -static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, loff_t pos, bool write, void **entryp) +static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; struct blk_dax_ctl dax = { @@ -1287,31 +1286,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, goto fallback; *entryp = ret; - trace_dax_pmd_insert_mapping(inode, vma, address, write, length, - dax.pfn, ret); - return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + dax.pfn, vmf->flags & FAULT_FLAG_WRITE); unmap_fallback: dax_unmap_atomic(bdev, &dax); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write, - length, dax.pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, + dax.pfn, ret); return VM_FAULT_FALLBACK; } -static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, void **entryp) +static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, + void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; struct inode *inode = mapping->host; struct page *zero_page; void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; - zero_page = mm_get_huge_zero_page(vma->vm_mm); + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) goto fallback; @@ -1322,27 +1320,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, goto fallback; *entryp = ret; - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } - pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret); + trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); return VM_FAULT_NOPAGE; fallback: - trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = 
vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -1363,7 +1361,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, pgoff = linear_page_index(vma, pmd_addr); max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0); + trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) @@ -1409,15 +1407,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, - vmf->address, &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) goto unlock_entry; - result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, - &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1447,7 +1443,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); + trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d3f589b3602c..13021a054fc0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -274,19 +274,19 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static int -ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +ext4_dax_pmd_fault(struct vm_fault *vmf) { int result; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); + result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 44a8d2356e31..9d8440b07b53 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1431,10 +1431,9 @@ xfs_filemap_fault( */ STATIC int xfs_filemap_pmd_fault( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret; @@ -1445,11 +1444,11 @@ xfs_filemap_pmd_fault( if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index a829fee2b42b..c1bd6ab5e974 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops); +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void 
*entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, - struct vm_fault *vmf, struct iomap_ops *ops) +static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, + struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 0961e95e904e..3787f047a098 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -351,7 +351,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index a98665bfb38f..c566ddc87f73 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -7,9 +7,9 @@ #include DECLARE_EVENT_CLASS(dax_pmd_fault_class, - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - struct vm_fault *vmf, pgoff_t max_pgoff, int result), - TP_ARGS(inode, vma, vmf, max_pgoff, result), + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + pgoff_t max_pgoff, int result), + TP_ARGS(inode, vmf, max_pgoff, result), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_start) @@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->vm_start = vma->vm_start; - __entry->vm_end = vma->vm_end; - __entry->vm_flags = vma->vm_flags; + __entry->vm_start = vmf->vma->vm_start; + __entry->vm_end = vmf->vma->vm_end; + __entry->vm_flags = vmf->vma->vm_flags; __entry->address = vmf->address; __entry->flags = vmf->flags; __entry->pgoff = vmf->pgoff; @@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, #define DEFINE_PMD_FAULT_EVENT(name) \ DEFINE_EVENT(dax_pmd_fault_class, name, \ - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - struct vm_fault *vmf, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ pgoff_t max_pgoff, int result), \ - TP_ARGS(inode, vma, vmf, max_pgoff, result)) + TP_ARGS(inode, vmf, max_pgoff, result)) DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, struct page *zero_page, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + struct page *zero_page, void *radix_entry), - TP_ARGS(inode, vma, address, zero_page, radix_entry), + TP_ARGS(inode, vmf, zero_page, radix_entry), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_flags) @@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->vm_flags = vma->vm_flags; - __entry->address = address; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; __entry->zero_page = zero_page; __entry->radix_entry = radix_entry; ), @@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \ DEFINE_EVENT(dax_pmd_load_hole_class, name, \ - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, struct page *zero_page, \ - void *radix_entry), \ - TP_ARGS(inode, vma, address, zero_page, radix_entry)) + TP_PROTO(struct inode *inode, struct 
vm_fault *vmf, \ + struct page *zero_page, void *radix_entry), \ + TP_ARGS(inode, vmf, zero_page, radix_entry)) DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, int write, long length, pfn_t pfn, - void *radix_entry), - TP_ARGS(inode, vma, address, write, length, pfn, radix_entry), + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + long length, pfn_t pfn, void *radix_entry), + TP_ARGS(inode, vmf, length, pfn, radix_entry), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_flags) @@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->vm_flags = vma->vm_flags; - __entry->address = address; - __entry->write = write; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; __entry->length = length; __entry->pfn_val = pfn.val; __entry->radix_entry = radix_entry; @@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, int write, long length, pfn_t pfn, \ - void *radix_entry), \ - TP_ARGS(inode, vma, address, write, length, pfn, radix_entry)) + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ + long length, pfn_t pfn, void *radix_entry), \ + TP_ARGS(inode, vmf, length, pfn, radix_entry)) DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); diff --git a/mm/memory.c b/mm/memory.c index 2376f8528800..ececdc4a2892 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3471,11 +3471,10 @@ out: static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - if (vma_is_anonymous(vma)) + if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, vmf); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf); return VM_FAULT_FALLBACK; } @@ -3484,7 +3483,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); + return vmf->vma->vm_ops->pmd_fault(vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); -- cgit From a5759b2bffa807a727cbf6790ecdc4df2ae8e4d4 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 22 Feb 2017 15:40:09 -0800 Subject: dma-debug: add comment for failed to check map error Add comment for failure to check a map error to help driver developers. 
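For reference, the pattern the new comment asks for looks like this (a hypothetical driver fragment, not part of this patch; it simply follows Documentation/DMA-API.txt):

#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int example_map_buffer(struct device *dev, void *buf, size_t len,
			      dma_addr_t *handle)
{
	dma_addr_t addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	/* the returned address must be checked before it is used */
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	*handle = addr;
	return 0;
}

When CONFIG_DMA_API_DEBUG is enabled, skipping that check leaves the entry's map_err_type at MAP_ERR_NOT_CHECKED, which is what triggers the "failed to check map error" report shown in the hunk below.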
Link: http://lkml.kernel.org/r/1484622289-22085-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 8971370bfb16..60c57ec936db 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1155,6 +1155,11 @@ static void check_unmap(struct dma_debug_entry *ref) dir2name[ref->direction]); } + /* + * Drivers should use dma_mapping_error() to check the returned + * addresses of dma_map_single() and dma_map_page(). + * If not, print this warning message. See Documentation/DMA-API.txt. + */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, "DMA-API: device driver failed to check map error" -- cgit From 0937577d5f5720046252196f811def2eae470593 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 22 Feb 2017 15:40:12 -0800 Subject: tools/vm: add missing Makefile rules Currently the tools/vm Makefile has a rather arbitrary implicit build rule; page-types is the first value in TARGETS so lets just build that one! Additionally there is no install rule and this is needed for make -C tools vm_install to work properly. Provide a more sensible implicit build rule and a new install rule. Note that the variables names used by the install rule (DESTDIR and sbindir) are copied from prior-art in tools/power/cpupower. Link: http://lkml.kernel.org/r/20170113165630.27541-1-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/vm/Makefile b/tools/vm/Makefile index 93aadaf7ff63..006029456988 100644 --- a/tools/vm/Makefile +++ b/tools/vm/Makefile @@ -9,6 +9,8 @@ CC = $(CROSS_COMPILE)gcc CFLAGS = -Wall -Wextra -I../lib/ LDFLAGS = $(LIBS) +all: $(TARGETS) + $(TARGETS): $(LIBS) $(LIBS): @@ -20,3 +22,9 @@ $(LIBS): clean: $(RM) page-types slabinfo page_owner_sort make -C $(LIB_DIR) clean + +sbindir ?= /usr/sbin + +install: all + install -d $(DESTDIR)$(sbindir) + install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) -- cgit From 8c3200265787f787e4133579cc5e5c28d805f76e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 22 Feb 2017 15:40:15 -0800 Subject: scripts/spelling.txt: add several more common spelling mistakes Lately I've been cleaning up spelling mistakes in kernel error messages and here are some of the more common spelling mistakes that I've found which probably should be added to this list so we don't keep on seeing them appearing again. 
Link: http://lkml.kernel.org/r/20161209173326.17662-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 163c720d3f2b..ac5bf8708ff0 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -16,6 +16,7 @@ absense||absence absolut||absolute absoulte||absolute acccess||access +acceess||access acceleratoin||acceleration accelleration||acceleration accesing||accessing @@ -46,6 +47,7 @@ ackowledged||acknowledged acording||according activete||activate acumulating||accumulating +acumulator||accumulator adapater||adapter addional||additional additionaly||additionally @@ -184,6 +186,7 @@ cacluated||calculated caculation||calculation calender||calendar calle||called +callibration||calibration calucate||calculate calulate||calculate cancelation||cancellation @@ -244,6 +247,7 @@ compatiblity||compatibility competion||completion compilant||compliant compleatly||completely +completition||completion completly||completely complient||compliant componnents||components @@ -257,6 +261,7 @@ conected||connected configuratoin||configuration configuraton||configuration configuretion||configuration +configutation||configuration conider||consider conjuction||conjunction connectinos||connections @@ -317,6 +322,7 @@ dependant||dependent depreacted||deprecated depreacte||deprecate desactivate||deactivate +desciptor||descriptor desciptors||descriptors descripton||description descrition||description @@ -417,9 +423,12 @@ extention||extension extracter||extractor faild||failed faill||fail +failied||failed +faillure||failure failue||failure failuer||failure faireness||fairness +falied||failed faliure||failure familar||familiar fatser||faster @@ -441,6 +450,7 @@ forseeable||foreseeable forse||force fortan||fortran forwardig||forwarding +framming||framing framwork||framework frequncy||frequency frome||from @@ -482,6 +492,7 @@ howver||however hsould||should hypter||hyper identidier||identifier +illigal||illegal imblance||imbalance immeadiately||immediately immedaite||immediate @@ -520,6 +531,7 @@ informtion||information infromation||information ingore||ignore inital||initial +initalized||initialized initalised||initialized initalise||initialize initalize||initialize @@ -532,6 +544,7 @@ initilize||initialize inofficial||unofficial insititute||institute instal||install +instanciated||instantiated inteface||interface integreated||integrated integrety||integrity @@ -557,6 +570,7 @@ intialized||initialized intialize||initialize intregral||integral intrrupt||interrupt +intterrupt||interrupt intuative||intuitive invaid||invalid invalde||invald @@ -567,6 +581,8 @@ invokations||invocations irrelevent||irrelevant isnt||isn't isssue||issue +iternations||iterations +itertation||iteration itslef||itself jave||java jeffies||jiffies @@ -621,6 +637,7 @@ messsage||message messsages||messages microprocesspr||microprocessor milliseonds||milliseconds +minium||minimum minumum||minimum miscelleneous||miscellaneous misformed||malformed @@ -668,6 +685,7 @@ occurances||occurrences occured||occurred occurence||occurrence occure||occurred +occured||occurred occuring||occurring offet||offset omitt||omit @@ -681,8 +699,10 @@ optionnal||optional optmizations||optimizations orientatied||orientated orientied||oriented +orignal||original otherise||otherwise ouput||output 
+oustanding||outstanding overaall||overall overhread||overhead overlaping||overlapping @@ -705,6 +725,7 @@ paramter||parameter paramters||parameters particuarly||particularly particularily||particularly +partiton||partition pased||passed passin||passing pathes||paths @@ -724,6 +745,7 @@ pleaes||please ploting||plotting plugable||pluggable poinnter||pointer +pointeur||pointer poiter||pointer posible||possible positon||position @@ -752,6 +774,7 @@ procceed||proceed proccesors||processors procesed||processed proces||process +procesing||processing processessing||processing processess||processes processpr||processor @@ -780,6 +803,7 @@ protable||portable protcol||protocol protecion||protection protocoll||protocol +promixity||proximity psudo||pseudo psuedo||pseudo psychadelic||psychedelic @@ -801,6 +825,7 @@ recommanded||recommended recyle||recycle redircet||redirect redirectrion||redirection +reename||rename refcounf||refcount refence||reference refered||referred @@ -944,7 +969,9 @@ suble||subtle substract||subtract succesfully||successfully succesful||successful +successed||succeeded successfull||successful +successfuly||successfully sucessfully||successfully sucess||success superflous||superfluous @@ -960,6 +987,7 @@ suppport||support supress||suppress surpresses||suppresses susbsystem||subsystem +suspeneded||suspended suspicously||suspiciously swaping||swapping switchs||switches @@ -967,6 +995,7 @@ symetric||symmetric synax||syntax synchonized||synchronized syncronize||synchronize +syncronized||synchronized syncronizing||synchronizing syncronus||synchronous syste||system @@ -1005,22 +1034,30 @@ ture||true tyep||type udpate||update uesd||used +uncommited||uncommitted unconditionaly||unconditionally underun||underrun unecessary||unnecessary unexecpted||unexpected +unexpcted||unexpected unexpectd||unexpected unexpeted||unexpected +unexpexted||unexpected unfortunatelly||unfortunately unifiy||unify unintialized||uninitialized +unkmown||unknown unknonw||unknown unknow||unknown unkown||unknown unneedingly||unnecessarily +unnsupported||unsupported +unmached||unmatched unresgister||unregister +unrgesiter||unregister unsinged||unsigned unstabel||unstable +unsolicitied||unsolicited unsuccessfull||unsuccessful unsuported||unsupported untill||until -- cgit From 10f24f75b304f6874eacf765d09827a28caea28d Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:40:17 -0800 Subject: scripts/spelling.txt: fix incorrect typo-words Fix up some incorrect typo-words. 
[akpm@linux-foundation.org: "licencing" is valid British spelling and should be kept, per Joe] Link: http://lkml.kernel.org/r/1486409689-23335-1-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index ac5bf8708ff0..b3a1994b5df7 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -40,7 +40,7 @@ achitecture||architecture acient||ancient acitions||actions acitve||active -acknowldegement||acknowldegement +acknowldegement||acknowledgment acknowledgement||acknowledgment ackowledge||acknowledge ackowledged||acknowledged @@ -573,7 +573,7 @@ intrrupt||interrupt intterrupt||interrupt intuative||intuitive invaid||invalid -invalde||invald +invalde||invalid invalide||invalid invididual||individual invokation||invocation @@ -1020,7 +1020,7 @@ tramsmitted||transmitted tramsmit||transmit tranfer||transfer transciever||transceiver -transferd||transferrd +transferd||transferred transfered||transferred transfering||transferring transision||transition -- cgit From 78c5250296c94c0d6bd9e92814e6601ff6c0b18b Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 22 Feb 2017 15:40:20 -0800 Subject: scripts/Lindent: clean up and optimize * Add a few blank lines to improve readability. * Don't call cut 3 times when once is enough. * Drop a useless semicolon. Link: http://lkml.kernel.org/r/20170104140356.162abab2@endymion Signed-off-by: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/Lindent | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/Lindent b/scripts/Lindent index 6d889de4e70b..57b564c24d61 100755 --- a/scripts/Lindent +++ b/scripts/Lindent @@ -1,21 +1,25 @@ #!/bin/sh + PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" -RES=`indent --version` + +RES=`indent --version | cut -d' ' -f3` if [ "$RES" = "" ]; then exit 1 fi -V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` -V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` -V3=`echo $RES | cut -d' ' -f3 | cut -d'.' -f3` +V1=`echo $RES | cut -d'.' -f1` +V2=`echo $RES | cut -d'.' -f2` +V3=`echo $RES | cut -d'.' -f3` + if [ $V1 -gt 2 ]; then PARAM="$PARAM -il0" elif [ $V1 -eq 2 ]; then if [ $V2 -gt 2 ]; then - PARAM="$PARAM -il0"; + PARAM="$PARAM -il0" elif [ $V2 -eq 2 ]; then if [ $V3 -ge 10 ]; then PARAM="$PARAM -il0" fi fi fi + indent $PARAM "$@" -- cgit From 0e5a47a8d346c798bbb1801c7c5852b652155b72 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 22 Feb 2017 15:40:23 -0800 Subject: scripts/checkstack.pl: add support for nios2 Adjust checkstack.pl for the nios2 architecture. 
Link: http://lkml.kernel.org/r/20170116113052.15034-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Cc: Ley Foon Tan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkstack.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index dd8397894d5c..694a075381b0 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -78,6 +78,9 @@ my (@stack, $re, $dre, $x, $xs, $funcre); } elsif ($arch eq 'mips') { #88003254: 27bdffe0 addiu sp,sp,-32 $re = qr/.*addiu.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o; + } elsif ($arch eq 'nios2') { + #25a8: defffb04 addi sp,sp,-20 + $re = qr/.*addi.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o; } elsif ($arch eq 'parisc' || $arch eq 'parisc64') { $re = qr/.*ldo ($x{1,8})\(sp\),sp/o; } elsif ($arch eq 'ppc') { -- cgit From 8087a5609dbf73553a226f1ce0b3a14954adab34 Mon Sep 17 00:00:00 2001 From: Cheah Kok Cheong Date: Wed, 22 Feb 2017 15:40:26 -0800 Subject: scripts/checkincludes.pl: add exit message for no duplicates found If no duplicates found, inform user. Link: http://lkml.kernel.org/r/1486391275-2843-1-git-send-email-thrust73@gmail.com Signed-off-by: Cheah Kok Cheong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkincludes.pl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/checkincludes.pl b/scripts/checkincludes.pl index 97b2c6143fe4..381c018a4612 100755 --- a/scripts/checkincludes.pl +++ b/scripts/checkincludes.pl @@ -37,6 +37,8 @@ if ($#ARGV >= 1) { } } +my $dup_counter = 0; + foreach my $file (@ARGV) { open(my $f, '<', $file) or die "Cannot open $file: $!.\n"; @@ -57,6 +59,7 @@ foreach my $file (@ARGV) { foreach my $filename (keys %includedfiles) { if ($includedfiles{$filename} > 1) { print "$file: $filename is included more than once.\n"; + ++$dup_counter; } } next; @@ -73,6 +76,7 @@ foreach my $file (@ARGV) { if ($includedfiles{$filename} > 1) { $includedfiles{$filename}--; $dups++; + ++$dup_counter; } else { print {$f} $_; } @@ -87,3 +91,7 @@ foreach my $file (@ARGV) { } close($f); } + +if ($dup_counter == 0) { + print "No duplicate includes found.\n"; +} -- cgit From 7659c655bededecd88f6f25102ba3ab926af92cc Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 22 Feb 2017 15:40:29 -0800 Subject: scripts/tags.sh: include arch/Kconfig* for tags generation Kconfig files under arch/ directory are ignored by all_kconfigs(), so include them for tags generation. Link: http://lkml.kernel.org/r/1486206053-38223-1-git-send-email-houtao1@huawei.com Signed-off-by: Hou Tao Cc: Michal Marek Cc: Joe Perches Cc: Mathieu Maret Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/tags.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/tags.sh b/scripts/tags.sh index df5fa777d300..d661f2f3ef61 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -128,6 +128,8 @@ all_target_sources() all_kconfigs() { + find ${tree}arch/ -maxdepth 1 $ignore \ + -name "Kconfig*" -not -type l -print; for arch in $ALLSOURCE_ARCHS; do find_sources $arch 'Kconfig*' done -- cgit From 6408b6fb99a066f92a4aa6215169d2264c05af48 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 22 Feb 2017 15:40:32 -0800 Subject: m32r: use generic current.h Given that the arch does not add its own implementations, simply use the asm-generic/current.h (generic-y) header instead of duplicating code. 
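For context, the generic replacement is essentially the same thread_info-based accessor that the m32r copy open-codes; approximately (the generic header itself is not part of this diff, so this is a paraphrase, not a quote):

        #ifndef __ASM_GENERIC_CURRENT_H
        #define __ASM_GENERIC_CURRENT_H

        #include <linux/thread_info.h>

        #define get_current() (current_thread_info()->task)
        #define current get_current()

        #endif /* __ASM_GENERIC_CURRENT_H */

so deleting the per-arch file and selecting the generic-y header is intended to be behavior-preserving.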
Link: http://lkml.kernel.org/r/1482896994-25863-1-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/include/asm/Kbuild | 1 + arch/m32r/include/asm/current.h | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) delete mode 100644 arch/m32r/include/asm/current.h diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 652100b64a71..8c24c5e1db66 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -1,5 +1,6 @@ generic-y += clkdev.h +generic-y += current.h generic-y += exec.h generic-y += irq_work.h generic-y += kvm_para.h diff --git a/arch/m32r/include/asm/current.h b/arch/m32r/include/asm/current.h deleted file mode 100644 index 7859d864f2c2..000000000000 --- a/arch/m32r/include/asm/current.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _ASM_M32R_CURRENT_H -#define _ASM_M32R_CURRENT_H - -#include - -struct task_struct; - -static __inline__ struct task_struct *get_current(void) -{ - return current_thread_info()->task; -} - -#define current (get_current()) - -#endif /* _ASM_M32R_CURRENT_H */ -- cgit From af0de7815e2cdb7b3cf1d5faef0ca45c1dfb3ba2 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 22 Feb 2017 15:40:35 -0800 Subject: m32r: fix build warning Some m32r builds were having a warning: arch/m32r/include/asm/cmpxchg.h:191:3: warning: value computed is not used arch/m32r/include/asm/cmpxchg.h:68:3: warning: value computed is not used Taking the idea from commit e001bbae7147 ("ARM: cmpxchg: avoid warnings from macro-ized cmpxchg() implementations") the m32r implementation is changed to use a similar construct with a compound expression instead of a typecast, which causes the compiler to not complain about an unused result. Link: http://lkml.kernel.org/r/1484432664-7015-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/include/asm/cmpxchg.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h index 14bf9b739dd2..23c9d0537201 100644 --- a/arch/m32r/include/asm/cmpxchg.h +++ b/arch/m32r/include/asm/cmpxchg.h @@ -64,8 +64,10 @@ __xchg(unsigned long x, volatile void *ptr, int size) return (tmp); } -#define xchg(ptr, x) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) +#define xchg(ptr, x) ({ \ + ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), \ + sizeof(*(ptr)))); \ +}) static __always_inline unsigned long __xchg_local(unsigned long x, volatile void *ptr, int size) @@ -187,9 +189,12 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) return old; } -#define cmpxchg(ptr, o, n) \ - ((__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr)))) +#define cmpxchg(ptr, o, n) ({ \ + ((__typeof__(*(ptr))) \ + __cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))); \ +}) #include -- cgit From ca376b37867875b6f661bb24a3238636b74f766e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 22 Feb 2017 15:40:38 -0800 Subject: score: remove asm/current.h ... it's already using the generic version anyways, so just drop the file as do the other archs that do not implement their own version of the current macro. 
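Regarding the m32r cmpxchg warning fix just above: the difference between the two macro shapes can be reproduced in a few lines of userspace C. dummy_op() and the OP_* macros below are invented purely for illustration, and whether the warning fires also depends on the GCC version in use:

        #include <stdio.h>

        static long dummy_op(int x) { return x + 1; }

        /* old shape: a bare cast expression -- older GCC emits
         * "value computed is not used" when the result is discarded */
        #define OP_CAST(x)  ((long)dummy_op(x))

        /* new shape: a GNU statement expression -- no warning when the
         * result is discarded, value still available when it is wanted */
        #define OP_STMT(x)  ({ ((long)dummy_op(x)); })

        int main(void)
        {
                OP_CAST(1);                  /* warns under -Wall on older GCC */
                OP_STMT(1);                  /* silent */
                printf("%ld\n", OP_STMT(2)); /* prints 3 */
                return 0;
        }

Building with something like "gcc -Wall -o demo demo.c" shows the asymmetry the patch relies on.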
Link: http://lkml.kernel.org/r/1485992878-4780-5-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Chen Liqin Cc: Lennox Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/score/include/asm/Kbuild | 2 +- arch/score/include/asm/current.h | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 arch/score/include/asm/current.h diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index 51970bb6c4fe..db3e28ca3ae2 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -1,9 +1,9 @@ header-y += - generic-y += barrier.h generic-y += clkdev.h +generic-y += current.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/score/include/asm/current.h b/arch/score/include/asm/current.h deleted file mode 100644 index 16eae9cbaf1a..000000000000 --- a/arch/score/include/asm/current.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_SCORE_CURRENT_H -#define _ASM_SCORE_CURRENT_H - -#include - -#endif /* _ASM_SCORE_CURRENT_H */ -- cgit From 439a36b8ef38657f765b80b775e2885338d72451 Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Wed, 22 Feb 2017 15:40:41 -0800 Subject: ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock We are in the situation that we have to avoid recursive cluster locking, but there is no way to check if a cluster lock has been taken by a precess already. Mostly, we can avoid recursive locking by writing code carefully. However, we found that it's very hard to handle the routines that are invoked directly by vfs code. For instance: const struct inode_operations ocfs2_file_iops = { .permission = ocfs2_permission, .get_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, }; Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): do_sys_open may_open inode_permission ocfs2_permission ocfs2_inode_lock() <=== first time generic_permission get_acl ocfs2_iop_get_acl ocfs2_inode_lock() <=== recursive one A deadlock will occur if a remote EX request comes in between two of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in BAST(ocfs2_generic_handle_bast) when downconvert is started on behalf of the remote EX lock request. Another hand, the recursive cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? because there is no chance for the first cluster lock on this node to be unlocked - we block ourselves in the code path. The idea to fix this issue is mostly taken from gfs2 code. 1. introduce a new field: struct ocfs2_lock_res.l_holders, to keep track of the processes' pid who has taken the cluster lock of this lock resource; 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; it means just getting back disk inode bh for us if we've got cluster lock. 3. export a helper: ocfs2_is_locked_by_me() is used to check if we have got the cluster lock in the upper code path. The tracking logic should be used by some of the ocfs2 vfs's callbacks, to solve the recursive locking issue cuased by the fact that vfs routines can call into each other. The performance penalty of processing the holder list should only be seen at a few cases where the tracking logic is used, such as get/set acl. You may ask what if the first time we got a PR lock, and the second time we want a EX lock? 
fortunately, this case never happens in the real world, as far as I can see, including permission check, (get|set)_(acl|attr), and the gfs2 code also do so. [sfr@canb.auug.org.au remove some inlines] Link: http://lkml.kernel.org/r/20170117100948.11657-2-zren@suse.com Signed-off-by: Eric Ren Reviewed-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Stephen Rothwell Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/ocfs2/dlmglue.h | 18 +++++++++ fs/ocfs2/ocfs2.h | 1 + 3 files changed, 121 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 77d1632e905d..8dce4099a6ca 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) init_waitqueue_head(&res->l_event); INIT_LIST_HEAD(&res->l_blocked_list); INIT_LIST_HEAD(&res->l_mask_waiters); + INIT_LIST_HEAD(&res->l_holders); } void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) res->l_flags = 0UL; } +/* + * Keep a list of processes who have interest in a lockres. + * Note: this is now only uesed for check recursive cluster locking. + */ +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + INIT_LIST_HEAD(&oh->oh_list); + oh->oh_owner_pid = get_pid(task_pid(current)); + + spin_lock(&lockres->l_lock); + list_add_tail(&oh->oh_list, &lockres->l_holders); + spin_unlock(&lockres->l_lock); +} + +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); +} + +static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_lock_holder *oh; + struct pid *pid; + + /* look in the list of holders for one with the current task as owner */ + spin_lock(&lockres->l_lock); + pid = task_pid(current); + list_for_each_entry(oh, &lockres->l_holders, oh_list) { + if (oh->oh_owner_pid == pid) { + spin_unlock(&lockres->l_lock); + return 1; + } + } + spin_unlock(&lockres->l_lock); + + return 0; +} + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, goto getbh; } - if (ocfs2_mount_local(osb)) - goto local; + if ((arg_flags & OCFS2_META_LOCK_GETBH) || + ocfs2_mount_local(osb)) + goto update; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); @@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) ocfs2_wait_for_recovery(osb); -local: +update: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode @@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode, ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); } +/* + * This _tracker variantes are introduced to deal with the recursive cluster + * locking issue. The idea is to keep track of a lock holder on the stack of + * the current process. If there's a lock holder on the stack, we know the + * task context is already protected by cluster locking. Currently, they're + * used in some VFS entry routines. 
+ * + * return < 0 on error, return == 0 if there's no lock holder on the stack + * before this call, return == 1 if this call would be a recursive locking. + */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh) +{ + int status; + int arg_flags = 0, has_locked; + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + has_locked = ocfs2_is_locked_by_me(lockres); + /* Just get buffer head if the cluster lock has been taken */ + if (has_locked) + arg_flags = OCFS2_META_LOCK_GETBH; + + if (likely(!has_locked || ret_bh)) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + if (!has_locked) + ocfs2_add_holder(lockres, oh); + + return has_locked; +} + +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock) +{ + struct ocfs2_lock_res *lockres; + + lockres = &OCFS2_I(inode)->ip_inode_lockres; + if (!had_lock) { + ocfs2_remove_holder(lockres, oh); + ocfs2_inode_unlock(inode, ex); + } +} + int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) { struct ocfs2_lock_res *lockres; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index d293a22c32c5..a7fc18ba0dc1 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +struct ocfs2_lock_holder { + struct list_head oh_list; + struct pid *oh_owner_pid; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb { #define OCFS2_META_LOCK_NOQUEUE (0x02) /* don't block waiting for the downconvert thread, instead return -EAGAIN */ #define OCFS2_LOCK_NONBLOCK (0x04) +/* just get back disk inode bh if we've got cluster lock. */ +#define OCFS2_META_LOCK_GETBH (0x08) /* Locking subclasses of inode cluster lock */ enum { @@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); /* To set the locking protocol on module initialization */ void ocfs2_set_locking_protocol(void); + +/* The _tracker pair is used to avoid cluster recursive locking */ +int ocfs2_inode_lock_tracker(struct inode *inode, + struct buffer_head **ret_bh, + int ex, + struct ocfs2_lock_holder *oh); +void ocfs2_inode_unlock_tracker(struct inode *inode, + int ex, + struct ocfs2_lock_holder *oh, + int had_lock); + #endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7e5958b0be6b..0c39d71c67a1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -172,6 +172,7 @@ struct ocfs2_lock_res { struct list_head l_blocked_list; struct list_head l_mask_waiters; + struct list_head l_holders; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; -- cgit From b891fa5024a95c77e0d6fd6655cb74af6fb77f46 Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Wed, 22 Feb 2017 15:40:44 -0800 Subject: ocfs2: fix deadlock issue when taking inode lock at vfs entry points Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") results in a deadlock, as the author "Tariq Saeed" realized shortly after the patch was merged. The discussion happened here https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html The reason why taking cluster inode lock at vfs entry points opens up a self deadlock window, is explained in the previous patch of this series. 
So far, we have seen two different code paths that have this issue. 1. do_sys_open may_open inode_permission ocfs2_permission ocfs2_inode_lock() <=== take PR generic_permission get_acl ocfs2_iop_get_acl ocfs2_inode_lock() <=== take PR 2. fchmod|fchmodat chmod_common notify_change ocfs2_setattr <=== take EX posix_acl_chmod get_acl ocfs2_iop_get_acl <=== take PR ocfs2_iop_set_acl <=== take EX Fixes them by adding the tracking logic (in the previous patch) for these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), ocfs2_setattr(). Link: http://lkml.kernel.org/r/20170117100948.11657-3-zren@suse.com Signed-off-by: Eric Ren Reviewed-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 29 +++++++++++++---------------- fs/ocfs2/file.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 29 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index bed1fcb63088..dc22ba8c710f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; - int status = 0; + int status, had_lock; + struct ocfs2_lock_holder oh; - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) + return had_lock; status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; } @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret; + int had_lock; + struct ocfs2_lock_holder oh; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - if (ret != -ENOENT) - mlog_errno(ret); - return ERR_PTR(ret); - } + + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) + return ERR_PTR(had_lock); acl = ocfs2_get_acl_nolock(inode, type, di_bh); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); return acl; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c4889655d32b..7b6a146327d7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) handle_t *handle = NULL; struct dquot *transfer_to[MAXQUOTAS] = { }; int qtype; + int had_lock; + struct ocfs2_lock_holder oh; trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_inode_lock(inode, &bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); + had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); + if (had_lock < 0) { + status = had_lock; goto bail_unlock_rw; + } else if (had_lock) { + /* + * As far as we know, ocfs2_setattr() could only be the first + * VFS entry point in the call chain of recursive cluster + * locking issue. 
+ * + * For instance: + * chmod_common() + * notify_change() + * ocfs2_setattr() + * posix_acl_chmod() + * ocfs2_iop_get_acl() + * + * But, we're not 100% sure if it's always true, because the + * ordering of the VFS entry points in the call chain is out + * of our control. So, we'd better dump the stack here to + * catch the other cases of recursive locking. + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } inode_locked = 1; @@ -1260,8 +1281,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - if (status) { - ocfs2_inode_unlock(inode, 1); + if (status && inode_locked) { + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); inode_locked = 0; } bail_unlock_rw: @@ -1279,7 +1300,7 @@ bail: mlog_errno(status); } if (inode_locked) - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); brelse(bh); return status; @@ -1320,21 +1341,32 @@ bail: int ocfs2_permission(struct inode *inode, int mask) { - int ret; + int ret, had_lock; + struct ocfs2_lock_holder oh; if (mask & MAY_NOT_BLOCK) return -ECHILD; - ret = ocfs2_inode_lock(inode, NULL, 0); - if (ret) { - if (ret != -ENOENT) - mlog_errno(ret); + had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh); + if (had_lock < 0) { + ret = had_lock; goto out; + } else if (had_lock) { + /* See comments in ocfs2_setattr() for details. + * The call chain of this case could be: + * do_sys_open() + * may_open() + * inode_permission() + * ocfs2_permission() + * ocfs2_iop_get_acl() + */ + mlog(ML_ERROR, "Another case of recursive locking:\n"); + dump_stack(); } ret = generic_permission(inode, mask); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: return ret; } -- cgit From 6302666d2ef135be46ca34226791afea275f62d1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 22 Feb 2017 15:40:47 -0800 Subject: parisc: use generic current.h Given that the arch does not add its own implementations, simply use the asm-generic/current.h (generic-y) header instead of duplicating code. Link: http://lkml.kernel.org/r/1485992878-4780-4-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: "James E.J. 
Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/current.h | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) delete mode 100644 arch/parisc/include/asm/current.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4e179d770d69..cc70b4116718 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += auxvec.h generic-y += barrier.h generic-y += clkdev.h +generic-y += current.h generic-y += device.h generic-y += div64.h generic-y += emergency-restart.h diff --git a/arch/parisc/include/asm/current.h b/arch/parisc/include/asm/current.h deleted file mode 100644 index 0fb9338e3bf2..000000000000 --- a/arch/parisc/include/asm/current.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _PARISC_CURRENT_H -#define _PARISC_CURRENT_H - -#include - -struct task_struct; - -static inline struct task_struct * get_current(void) -{ - return current_thread_info()->task; -} - -#define current get_current() - -#endif /* !(_PARISC_CURRENT_H) */ -- cgit From 612dafabb6a0fa62e9b86f11da5d37ee11dea28f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 22 Feb 2017 15:40:50 -0800 Subject: block: use for_each_thread() in sys_ioprio_set()/sys_ioprio_get() IOPRIO_WHO_USER case in sys_ioprio_set()/sys_ioprio_get() are using while_each_thread(), which is unsafe under RCU lock according to commit 0c740d0afc3bff0a ("introduce for_each_thread() to replace the buggy while_each_thread()"). Use for_each_thread() (via for_each_process_thread()) which is safe under RCU lock. Link: http://lkml.kernel.org/r/201702011947.DBD56740.OMVHOLOtSJFFFQ@I-love.SAKURA.ne.jp Link: http://lkml.kernel.org/r/1486041779-4401-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/ioprio.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/ioprio.c b/block/ioprio.c index 01b8116298a1..3790669232ff 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -122,14 +122,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) if (!user) break; - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (!uid_eq(task_uid(p), uid) || !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) goto free_uid; - } while_each_thread(g, p); + } free_uid: if (who) free_uid(user); @@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) if (!user) break; - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (!uid_eq(task_uid(p), user->uid) || !task_pid_vnr(p)) continue; @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); + } if (who) free_uid(user); -- cgit From b5c66bab72a6a65edb15beb60b90d3cb84c5763b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 22 Feb 2017 15:40:53 -0800 Subject: 9p: fix a potential acl leak posix_acl_update_mode() could possibly clear 'acl', if so we leak the memory pointed by 'acl'. Save this pointer before calling posix_acl_update_mode() and release the memory if 'acl' really gets cleared. 
Link: http://lkml.kernel.org/r/1486678332-2430-1-git-send-email-xiyou.wangcong@gmail.com Signed-off-by: Cong Wang Reported-by: Mark Salyzyn Reviewed-by: Jan Kara Reviewed-by: Greg Kurz Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/acl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index b3c2cc79c20d..082d227fa56b 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr; + struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); if (retval) @@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * by the mode bits. So don't * update ACL. */ + posix_acl_release(old_acl); value = NULL; size = 0; } -- cgit From 8dcde9def5a15df54fa4ca41f7750d80fa741d61 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 22 Feb 2017 15:40:56 -0800 Subject: kernel/watchdog.c: do not hardcode CPU 0 as the initial thread When CONFIG_BOOTPARAM_HOTPLUG_CPU0 is enabled, the socket containing the boot cpu can be replaced. During the hot add event, the message NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. is output implying that the NMI watchdog was disabled at some point. This is not the case and the message has caused confusion for users of systems that support the removal of the boot cpu socket. The watchdog code is coded to assume that cpu 0 is always the first cpu to initialize the watchdog, and the last to stop its watchdog thread. That is not the case for initializing if cpu 0 has been removed and added. The removal case has never been correct because the smpboot code will remove the watchdog threads starting with the lowest cpu number. This patch adds watchdog_cpus to track the number of cpus with active NMI watchdog threads so that the first and last thread can be used to set and clear the value of firstcpu_err. firstcpu_err is set when the first watchdog thread is enabled, and cleared when the last watchdog thread is disabled. Link: http://lkml.kernel.org/r/1480425321-32296-1-git-send-email-prarit@redhat.com Signed-off-by: Prarit Bhargava Acked-by: Don Zickus Cc: Borislav Petkov Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Thomas Gleixner Cc: Andi Kleen Cc: Joshua Hunt Cc: Ingo Molnar Cc: Babu Moger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog_hld.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 12b8dd640786..b5de262a9eb9 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -137,12 +137,14 @@ static void watchdog_overflow_callback(struct perf_event *event, * Reduce the watchdog noise by only printing messages * that are different from what cpu0 displayed. 
*/ -static unsigned long cpu0_err; +static unsigned long firstcpu_err; +static atomic_t watchdog_cpus; int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + int firstcpu = 0; /* nothing to do if the hard lockup detector is disabled */ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) @@ -156,19 +158,22 @@ int watchdog_nmi_enable(unsigned int cpu) if (event != NULL) goto out_enable; + if (atomic_inc_return(&watchdog_cpus) == 1) + firstcpu = 1; + wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); + /* save the first cpu's error for future comparision */ + if (firstcpu && IS_ERR(event)) + firstcpu_err = PTR_ERR(event); if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) + /* only print for the first cpu initialized */ + if (firstcpu || firstcpu_err) pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } @@ -186,7 +191,7 @@ int watchdog_nmi_enable(unsigned int cpu) smp_mb__after_atomic(); /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); /* vary the KERN level based on the returned errno */ @@ -222,9 +227,9 @@ void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); - } - if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; + if (atomic_dec_and_test(&watchdog_cpus)) + firstcpu_err = 0; } } -- cgit From c6e28895a4372992961888ffaadc9efc643b5bfe Mon Sep 17 00:00:00 2001 From: Grygorii Maistrenko Date: Wed, 22 Feb 2017 15:40:59 -0800 Subject: slub: do not merge cache if slub_debug contains a never-merge flag In case CONFIG_SLUB_DEBUG_ON=n, find_mergeable() gets debug features from commandline but never checks if there are features from the SLAB_NEVER_MERGE set. As a result selected by slub_debug caches are always mergeable if they have been created without a custom constructor set or without one of the SLAB_* debug features on. This moves the SLAB_NEVER_MERGE check below the flags update from commandline to make sure it won't merge the slab cache if one of the debug features is on. 
Link: http://lkml.kernel.org/r/20170101124451.GA4740@lp-laptop-d Signed-off-by: Grygorii Maistrenko Reviewed-by: Pekka Enberg Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index ae323841adb1..36a263e9c81a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -255,7 +255,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, { struct kmem_cache *s; - if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) + if (slab_nomerge) return NULL; if (ctor) @@ -266,6 +266,9 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, size = ALIGN(size, align); flags = kmem_cache_flags(size, flags, name, NULL); + if (flags & SLAB_NEVER_MERGE) + return NULL; + list_for_each_entry_reverse(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; -- cgit From 65b9de75253e909e35d8f5c42357e632fdca5d7f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 22 Feb 2017 15:41:02 -0800 Subject: mm/slub: add a dump_stack() to the unexpected GFP check We wish to know who is doing such a thing. slab.c does this. Link: http://lkml.kernel.org/r/20170116091643.15260-1-bp@alien8.de Signed-off-by: Borislav Petkov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/slub.c b/mm/slub.c index 7ec0a965c6a3..1e5ef312f146 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1630,6 +1630,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= ~GFP_SLAB_BUG_MASK; pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); } return allocate_slab(s, -- cgit From af3b5f8764a270165195d8b9520d913a268c0062 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 22 Feb 2017 15:41:05 -0800 Subject: mm, slab: rename kmalloc-node cache to kmalloc- SLAB as part of its bootstrap pre-creates one kmalloc cache that can fit the kmem_cache_node management structure, and puts it into the generic kmalloc cache array (e.g. for 128b objects). The name of this cache is "kmalloc-node", which is confusing for readers of /proc/slabinfo as the cache is used for generic allocations (and not just the kmem_cache_node struct) and it appears as the kmalloc-128 cache is missing. An easy solution is to use the kmalloc- name when pre-creating the cache, which we can get from the kmalloc_info array. Example /proc/slabinfo before the patch: ... kmalloc-256 1647 1984 256 16 1 : tunables 120 60 8 : slabdata 124 124 828 kmalloc-192 1974 1974 192 21 1 : tunables 120 60 8 : slabdata 94 94 133 kmalloc-96 1332 1344 128 32 1 : tunables 120 60 8 : slabdata 42 42 219 kmalloc-64 2505 5952 64 64 1 : tunables 120 60 8 : slabdata 93 93 715 kmalloc-32 4278 4464 32 124 1 : tunables 120 60 8 : slabdata 36 36 346 kmalloc-node 1352 1376 128 32 1 : tunables 120 60 8 : slabdata 43 43 53 kmem_cache 132 147 192 21 1 : tunables 120 60 8 : slabdata 7 7 0 After the patch: ... 
kmalloc-256 1672 2160 256 16 1 : tunables 120 60 8 : slabdata 135 135 807 kmalloc-192 1992 2016 192 21 1 : tunables 120 60 8 : slabdata 96 96 203 kmalloc-96 1159 1184 128 32 1 : tunables 120 60 8 : slabdata 37 37 116 kmalloc-64 2561 4864 64 64 1 : tunables 120 60 8 : slabdata 76 76 785 kmalloc-32 4253 4340 32 124 1 : tunables 120 60 8 : slabdata 35 35 270 kmalloc-128 1256 1280 128 32 1 : tunables 120 60 8 : slabdata 40 40 39 kmem_cache 125 147 192 21 1 : tunables 120 60 8 : slabdata 7 7 0 [vbabka@suse.cz: export the whole kmalloc_info structure instead of just a name accessor, per Christoph Lameter] Link: http://lkml.kernel.org/r/54e80303-b814-4232-66d4-95b34d3eb9d0@suse.cz Link: http://lkml.kernel.org/r/20170203181008.24898-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Matthew Wilcox Cc: Joonsoo Kim Cc: David Rientjes Cc: Pekka Enberg Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 ++- mm/slab.h | 6 ++++++ mm/slab_common.c | 5 +---- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 4f2ec6bb46eb..be977ef6e718 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1288,7 +1288,8 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", + kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); diff --git a/mm/slab.h b/mm/slab.h index de6579dc362c..2fa824335a50 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,12 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name; + unsigned long size; +} kmalloc_info[]; + unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size); diff --git a/mm/slab_common.c b/mm/slab_common.c index 36a263e9c81a..f266b0de1e92 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -915,10 +915,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is * kmalloc-67108864. */ -static struct { - const char *name; - unsigned long size; -} const kmalloc_info[] __initconst = { +const struct kmalloc_info_struct kmalloc_info[] __initconst = { {NULL, 0}, {"kmalloc-96", 96}, {"kmalloc-192", 192}, {"kmalloc-8", 8}, {"kmalloc-16", 16}, {"kmalloc-32", 32}, -- cgit From 290b6a58b78be709e734d7fbeb1aa0416d9d41bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:08 -0800 Subject: Revert "slub: move synchronize_sched out of slab_mutex on shrink" Patch series "slab: make memcg slab destruction scalable", v3. With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. 
I've seen machines which end up with hundred thousands of caches and many millions of kernfs_nodes. The current code is O(N^2) on the total number of caches and has synchronous rcu_barrier() and synchronize_sched() in cgroup offline / release path which is executed while holding cgroup_mutex. Combined, this leads to very expensive and slow cache destruction operations which can easily keep running for half a day. This also messes up /proc/slabinfo along with other cache iterating operations. seq_file operates on 4k chunks and on each 4k boundary tries to seek to the last position in the list. With a huge number of caches on the list, this becomes very slow and very prone to the list content changing underneath it leading to a lot of missing and/or duplicate entries. This patchset addresses the scalability problem. * Add root and per-memcg lists. Update each user to use the appropriate list. * Make rcu_barrier() for SLAB_DESTROY_BY_RCU caches globally batched and asynchronous. * For dying empty slub caches, remove the sysfs files after deactivation so that we don't end up with millions of sysfs files without any useful information on them. This patchset contains the following nine patches. 0001-Revert-slub-move-synchronize_sched-out-of-slab_mutex.patch 0002-slub-separate-out-sysfs_slab_release-from-sysfs_slab.patch 0003-slab-remove-synchronous-rcu_barrier-call-in-memcg-ca.patch 0004-slab-reorganize-memcg_cache_params.patch 0005-slab-link-memcg-kmem_caches-on-their-associated-memo.patch 0006-slab-implement-slab_root_caches-list.patch 0007-slab-introduce-__kmemcg_cache_deactivate.patch 0008-slab-remove-synchronous-synchronize_sched-from-memcg.patch 0009-slab-remove-slub-sysfs-interface-files-early-for-emp.patch 0010-slab-use-memcg_kmem_cache_wq-for-slab-destruction-op.patch 0001 reverts an existing optimization to prepare for the following changes. 0002 is a prep patch. 0003 makes rcu_barrier() in release path batched and asynchronous. 0004-0006 separate out the lists. 0007-0008 replace synchronize_sched() in slub destruction path with call_rcu_sched(). 0009 removes sysfs files early for empty dying caches. 0010 makes destruction work items use a workqueue with limited concurrency. This patch (of 10): Revert 89e364db71fb5e ("slub: move synchronize_sched out of slab_mutex on shrink"). With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. Moving synchronize_sched() out of slab_mutex isn't enough as it's still inside cgroup_mutex. The whole deactivation / release path will be updated to avoid all synchronous RCU operations. Revert this insufficient optimization in preparation to ease future changes. 
Link: http://lkml.kernel.org/r/20170117235411.9408-2-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Cc: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- mm/slab.h | 2 +- mm/slab_common.c | 27 ++------------------------- mm/slob.c | 2 +- mm/slub.c | 19 +++++++++++++++++-- 5 files changed, 23 insertions(+), 31 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index be977ef6e718..8a0e3392f181 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2315,7 +2315,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) { int ret = 0; int node; @@ -2335,7 +2335,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep) int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep); + return __kmem_cache_shrink(cachep, false); } void __kmem_cache_release(struct kmem_cache *cachep) diff --git a/mm/slab.h b/mm/slab.h index 2fa824335a50..d07563f37f33 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -167,7 +167,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *, bool); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index f266b0de1e92..4a999d749d2b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,29 +582,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_cpus(); get_online_mems(); -#ifdef CONFIG_SLUB - /* - * In case of SLUB, we need to disable empty slab caching to - * avoid pinning the offline memory cgroup by freeable kmem - * pages charged to it. SLAB doesn't need this, as it - * periodically purges unused slabs. - */ - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL; - if (c) { - c->cpu_partial = 0; - c->min_partial = 0; - } - } - mutex_unlock(&slab_mutex); - /* - * kmem_cache->cpu_partial is checked locklessly (see - * put_cpu_partial()). Make sure the change is visible. - */ - synchronize_sched(); -#endif - mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) @@ -616,7 +593,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmem_cache_shrink(c); + __kmem_cache_shrink(c, true); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -787,7 +764,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep); + ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); return ret; diff --git a/mm/slob.c b/mm/slob.c index eac04d4357ec..5ec158054ffe 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 1e5ef312f146..6de08005d9cd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3891,7 +3891,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. 
*/ -int __kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) { int node; int i; @@ -3903,6 +3903,21 @@ int __kmem_cache_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; + if (deactivate) { + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), + * so we have to make sure the change is visible. + */ + synchronize_sched(); + } + flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3959,7 +3974,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + __kmem_cache_shrink(s, false); mutex_unlock(&slab_mutex); return 0; -- cgit From bf5eb3de3847ebcfd1fea7bc14072ef9f21d4e8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:11 -0800 Subject: slub: separate out sysfs_slab_release() from sysfs_slab_remove() Separate out slub sysfs removal and release, and call the former earlier from __kmem_cache_shutdown(). There's no reason to defer sysfs removal through RCU and this will later allow us to remove sysfs files way earlier during memory cgroup offline instead of release. Link: http://lkml.kernel.org/r/20170117235411.9408-3-tj@kernel.org Signed-off-by: Tejun Heo Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 4 ++-- mm/slab_common.c | 2 +- mm/slub.c | 12 ++++++++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 75f56c2ef2d4..07ef550c6627 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -113,9 +113,9 @@ struct kmem_cache { #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS -void sysfs_slab_remove(struct kmem_cache *); +void sysfs_slab_release(struct kmem_cache *); #else -static inline void sysfs_slab_remove(struct kmem_cache *s) +static inline void sysfs_slab_release(struct kmem_cache *s) { } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 4a999d749d2b..e6311b2dbba6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -483,7 +483,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) list_for_each_entry_safe(s, s2, release, list) { #ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); + sysfs_slab_release(s); #else slab_kmem_cache_release(s); #endif diff --git a/mm/slub.c b/mm/slub.c index 6de08005d9cd..caac5456f0ec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); +static void sysfs_slab_remove(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -3687,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + 
sysfs_slab_remove(s); return 0; } @@ -5637,7 +5640,7 @@ out_del_kobj: goto out; } -void sysfs_slab_remove(struct kmem_cache *s) +static void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5651,7 +5654,12 @@ void sysfs_slab_remove(struct kmem_cache *s) #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); - kobject_put(&s->kobj); +} + +void sysfs_slab_release(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_put(&s->kobj); } /* -- cgit From 657dc2f9722092e951de95a8109428994541440b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:14 -0800 Subject: slab: remove synchronous rcu_barrier() call in memcg cache release path With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. SLAB_DESTORY_BY_RCU caches need to flush all RCU operations before destruction because slab pages are freed through RCU and they need to be able to dereference the associated kmem_cache. Currently, it's done synchronously with rcu_barrier(). As rcu_barrier() is expensive time-wise, slab implements a batching mechanism so that rcu_barrier() can be done for multiple caches at the same time. Unfortunately, the rcu_barrier() is in synchronous path which is called while holding cgroup_mutex and the batching is too limited to be actually helpful. This patch updates the cache release path so that the batching is asynchronous and global. All SLAB_DESTORY_BY_RCU caches are queued globally and a work item consumes the list. The work item calls rcu_barrier() only once for all caches that are currently queued. * release_caches() is removed and shutdown_cache() now either directly release the cache or schedules a RCU callback to do that. This makes the cache inaccessible once shutdown_cache() is called and makes it impossible for shutdown_memcg_caches() to do memcg-specific cleanups afterwards. Move memcg-specific part into a helper, unlink_memcg_cache(), and make shutdown_cache() call it directly. 
Link: http://lkml.kernel.org/r/20170117235411.9408-4-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 102 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 42 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index e6311b2dbba6..ac469c815587 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -30,6 +30,11 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; +static LIST_HEAD(slab_caches_to_rcu_destroy); +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + slab_caches_to_rcu_destroy_workfn); + /* * Set of flags that will prevent slab merging */ @@ -215,6 +220,11 @@ int memcg_update_all_caches(int num_memcgs) mutex_unlock(&slab_mutex); return ret; } + +static void unlink_memcg_cache(struct kmem_cache *s) +{ + list_del(&s->memcg_params.list); +} #else static inline int init_memcg_params(struct kmem_cache *s, struct mem_cgroup *memcg, struct kmem_cache *root_cache) @@ -225,6 +235,10 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } + +static inline void unlink_memcg_cache(struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* @@ -461,33 +475,59 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int shutdown_cache(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { - if (__kmem_cache_shutdown(s) != 0) - return -EBUSY; + LIST_HEAD(to_destroy); + struct kmem_cache *s, *s2; - if (s->flags & SLAB_DESTROY_BY_RCU) - *need_rcu_barrier = true; + /* + * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * @slab_caches_to_rcu_destroy list. The slab pages are freed + * through RCU and and the associated kmem_cache are dereferenced + * while freeing the pages, so the kmem_caches should be freed only + * after the pending RCU operations are finished. As rcu_barrier() + * is a pretty slow operation, we batch all pending destructions + * asynchronously. 
+ */ + mutex_lock(&slab_mutex); + list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); + mutex_unlock(&slab_mutex); - list_move(&s->list, release); - return 0; + if (list_empty(&to_destroy)) + return; + + rcu_barrier(); + + list_for_each_entry_safe(s, s2, &to_destroy, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_release(s); +#else + slab_kmem_cache_release(s); +#endif + } } -static void release_caches(struct list_head *release, bool need_rcu_barrier) +static int shutdown_cache(struct kmem_cache *s) { - struct kmem_cache *s, *s2; + if (__kmem_cache_shutdown(s) != 0) + return -EBUSY; - if (need_rcu_barrier) - rcu_barrier(); + list_del(&s->list); + if (!is_root_cache(s)) + unlink_memcg_cache(s); - list_for_each_entry_safe(s, s2, release, list) { + if (s->flags & SLAB_DESTROY_BY_RCU) { + list_add_tail(&s->list, &slab_caches_to_rcu_destroy); + schedule_work(&slab_caches_to_rcu_destroy_work); + } else { #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_release(s); #else slab_kmem_cache_release(s); #endif } + + return 0; } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) @@ -602,22 +642,8 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } -static int __shutdown_memcg_cache(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) -{ - BUG_ON(is_root_cache(s)); - - if (shutdown_cache(s, release, need_rcu_barrier)) - return -EBUSY; - - list_del(&s->memcg_params.list); - return 0; -} - void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - LIST_HEAD(release); - bool need_rcu_barrier = false; struct kmem_cache *s, *s2; get_online_cpus(); @@ -631,18 +657,15 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); + BUG_ON(shutdown_cache(s)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - release_caches(&release, need_rcu_barrier); } -static int shutdown_memcg_caches(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static int shutdown_memcg_caches(struct kmem_cache *s) { struct memcg_cache_array *arr; struct kmem_cache *c, *c2; @@ -661,7 +684,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s, c = arr->entries[i]; if (!c) continue; - if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + if (shutdown_cache(c)) /* * The cache still has objects. 
Move it to a temporary * list so as not to try to destroy it for a second @@ -684,7 +707,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s, */ list_for_each_entry_safe(c, c2, &s->memcg_params.list, memcg_params.list) - __shutdown_memcg_cache(c, release, need_rcu_barrier); + shutdown_cache(c); list_splice(&busy, &s->memcg_params.list); @@ -697,8 +720,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s, return 0; } #else -static inline int shutdown_memcg_caches(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static inline int shutdown_memcg_caches(struct kmem_cache *s) { return 0; } @@ -714,8 +736,6 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - LIST_HEAD(release); - bool need_rcu_barrier = false; int err; if (unlikely(!s)) @@ -731,9 +751,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + err = shutdown_memcg_caches(s); if (!err) - err = shutdown_cache(s, &release, &need_rcu_barrier); + err = shutdown_cache(s); if (err) { pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", @@ -745,8 +765,6 @@ out_unlock: put_online_mems(); put_online_cpus(); - - release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit From 9eeadc8b6e0e31f9aea1f8886ef472f62c2b7f55 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:17 -0800 Subject: slab: reorganize memcg_cache_params We're going to change how memcg caches are iterated. In preparation, clean up and reorganize memcg_cache_params. * The shared ->list is replaced by ->children in root and ->children_node in children. * ->is_root_cache is removed. Instead ->root_cache is moved out of the child union and now used by both root and children. NULL indicates root cache. Non-NULL a memcg one. This patch doesn't cause any observable behavior changes. Link: http://lkml.kernel.org/r/20170117235411.9408-5-tj@kernel.org Signed-off-by: Tejun Heo Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 33 ++++++++++++++++++++++++--------- mm/slab.h | 6 +++--- mm/slab_common.c | 25 +++++++++++++------------ 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 4c5363566815..1f611ba00f1d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -545,22 +545,37 @@ struct memcg_cache_array { * array to be accessed without taking any locks, on relocation we free the old * version only after a grace period. * - * Child caches will hold extra metadata needed for its operation. Fields are: + * Root and child caches hold different metadata. * - * @memcg: pointer to the memcg this cache belongs to - * @root_cache: pointer to the global, root cache, this cache was derived from + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. * - * Both root and child caches of the same kind are linked into a list chained - * through @list. + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @children: List of all child caches. 
While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. */ struct memcg_cache_params { - bool is_root_cache; - struct list_head list; + struct kmem_cache *root_cache; union { - struct memcg_cache_array __rcu *memcg_caches; + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head children; + }; struct { struct mem_cgroup *memcg; - struct kmem_cache *root_cache; + struct list_head children_node; }; }; }; diff --git a/mm/slab.h b/mm/slab.h index d07563f37f33..3ed3336883ed 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -206,12 +206,12 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); * slab_mutex. */ #define for_each_memcg_cache(iter, root) \ - list_for_each_entry(iter, &(root)->memcg_params.list, \ - memcg_params.list) + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) static inline bool is_root_cache(struct kmem_cache *s) { - return s->memcg_params.is_root_cache; + return !s->memcg_params.root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, diff --git a/mm/slab_common.c b/mm/slab_common.c index ac469c815587..c3885032dbce 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -140,9 +140,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { - s->memcg_params.is_root_cache = true; - INIT_LIST_HEAD(&s->memcg_params.list); + s->memcg_params.root_cache = NULL; RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); + INIT_LIST_HEAD(&s->memcg_params.children); } static int init_memcg_params(struct kmem_cache *s, @@ -150,10 +150,10 @@ static int init_memcg_params(struct kmem_cache *s, { struct memcg_cache_array *arr; - if (memcg) { - s->memcg_params.is_root_cache = false; - s->memcg_params.memcg = memcg; + if (root_cache) { s->memcg_params.root_cache = root_cache; + s->memcg_params.memcg = memcg; + INIT_LIST_HEAD(&s->memcg_params.children_node); return 0; } @@ -223,7 +223,7 @@ int memcg_update_all_caches(int num_memcgs) static void unlink_memcg_cache(struct kmem_cache *s) { - list_del(&s->memcg_params.list); + list_del(&s->memcg_params.children_node); } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -594,7 +594,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } - list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + list_add(&s->memcg_params.children_node, + &root_cache->memcg_params.children); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -690,7 +691,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s) * list so as not to try to destroy it for a second * time while iterating over inactive caches below. */ - list_move(&c->memcg_params.list, &busy); + list_move(&c->memcg_params.children_node, &busy); else /* * The cache is empty and will be destroyed soon. Clear @@ -705,17 +706,17 @@ static int shutdown_memcg_caches(struct kmem_cache *s) * Second, shutdown all caches left from memory cgroups that are now * offline. 
*/ - list_for_each_entry_safe(c, c2, &s->memcg_params.list, - memcg_params.list) + list_for_each_entry_safe(c, c2, &s->memcg_params.children, + memcg_params.children_node) shutdown_cache(c); - list_splice(&busy, &s->memcg_params.list); + list_splice(&busy, &s->memcg_params.children); /* * A cache being destroyed must be empty. In particular, this means * that all per memcg caches attached to it must be empty too. */ - if (!list_empty(&s->memcg_params.list)) + if (!list_empty(&s->memcg_params.children)) return -EBUSY; return 0; } -- cgit From bc2791f857e1984b7548d2a2de2ffb1a913dee62 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:21 -0800 Subject: slab: link memcg kmem_caches on their associated memory cgroup With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. While a memcg kmem_cache is listed on its root cache's ->children list, there is no direct way to iterate all kmem_caches which are assocaited with a memory cgroup. The only way to iterate them is walking all caches while filtering out caches which don't match, which would be most of them. This makes memcg destruction operations O(N^2) where N is the total number of slab caches which can be huge. This combined with the synchronous RCU operations can tie up a CPU and affect the whole machine for many hours when memory reclaim triggers offlining and destruction of the stale memcgs. This patch adds mem_cgroup->kmem_caches list which goes through memcg_cache_params->kmem_caches_node of all kmem_caches which are associated with the memcg. All memcg specific iterations, including stat file access, are updated to use the new list instead. Link: http://lkml.kernel.org/r/20170117235411.9408-6-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + include/linux/slab.h | 3 +++ mm/memcontrol.c | 7 ++++--- mm/slab.h | 3 +++ mm/slab_common.c | 36 +++++++++++++++++++++++++++++------- 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 254698856b8f..9fcece9be85d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -253,6 +253,7 @@ struct mem_cgroup { /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; + struct list_head kmem_caches; #endif int last_scanned_node; diff --git a/include/linux/slab.h b/include/linux/slab.h index 1f611ba00f1d..a0cc7a77cda2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -565,6 +565,8 @@ struct memcg_cache_array { * @memcg: Pointer to the memcg this cache belongs to. * * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. 
*/ struct memcg_cache_params { struct kmem_cache *root_cache; @@ -576,6 +578,7 @@ struct memcg_cache_params { struct { struct mem_cgroup *memcg; struct list_head children_node; + struct list_head kmem_caches_node; }; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b822e158b319..834d641dfa8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; + INIT_LIST_HEAD(&memcg->kmem_caches); return 0; } @@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_start = slab_start, - .seq_next = slab_next, - .seq_stop = slab_stop, + .seq_start = memcg_slab_start, + .seq_next = memcg_slab_next, + .seq_stop = memcg_slab_stop, .seq_show = memcg_slab_show, }, #endif diff --git a/mm/slab.h b/mm/slab.h index 3ed3336883ed..a08f01016a3f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -494,6 +494,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); diff --git a/mm/slab_common.c b/mm/slab_common.c index c3885032dbce..c3bbeddaeaaf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -154,6 +154,7 @@ static int init_memcg_params(struct kmem_cache *s, s->memcg_params.root_cache = root_cache; s->memcg_params.memcg = memcg; INIT_LIST_HEAD(&s->memcg_params.children_node); + INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; } @@ -224,6 +225,7 @@ int memcg_update_all_caches(int num_memcgs) static void unlink_memcg_cache(struct kmem_cache *s) { list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -596,6 +598,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, list_add(&s->memcg_params.children_node, &root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -651,9 +654,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params.memcg != memcg) - continue; + list_for_each_entry_safe(s, s2, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { /* * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. 
@@ -1201,15 +1203,35 @@ static int slab_show(struct seq_file *m, void *p) } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void *memcg_slab_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mutex_lock(&slab_mutex); + return seq_list_start(&memcg->kmem_caches, *pos); +} + +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + return seq_list_next(p, &memcg->kmem_caches, pos); +} + +void memcg_slab_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + int memcg_slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, + memcg_params.kmem_caches_node); struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (p == slab_caches.next) + if (p == memcg->kmem_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params.memcg == memcg) - cache_show(s, m); + cache_show(s, m); return 0; } #endif -- cgit From 510ded33e075c2bd662b1efab0110f4240325fc9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:24 -0800 Subject: slab: implement slab_root_caches list With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. slab_caches currently lists all caches including root and memcg ones. This is the only data structure which lists the root caches and iterating root caches can only be done by walking the list while skipping over memcg caches. As there can be a huge number of memcg caches, this can become very expensive. This also can make /proc/slabinfo behave very badly. seq_file processes reads in 4k chunks and seeks to the previous Nth position on slab_caches list to resume after each chunk. With a lot of memcg cache churns on the list, reading /proc/slabinfo can become very slow and its content often ends up with duplicate and/or missing entries. This patch adds a new list slab_root_caches which lists only the root caches. When memcg is not enabled, it becomes just an alias of slab_caches. memcg specific list operations are collected into memcg_[un]link_cache(). Link: http://lkml.kernel.org/r/20170117235411.9408-7-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +++ mm/slab.h | 15 +++++++++++++ mm/slab_common.c | 59 ++++++++++++++++++++++++++++++---------------------- mm/slub.c | 1 + 4 files changed, 53 insertions(+), 25 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index a0cc7a77cda2..af1a5bef80f4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -556,6 +556,8 @@ struct memcg_cache_array { * used to index child cachces during allocation and cleared * early during shutdown. * + * @root_caches_node: List node for slab_root_caches list. + * * @children: List of all child caches. 
While the child caches are also * reachable through @memcg_caches, a child cache remains on * this list until it is actually destroyed. @@ -573,6 +575,7 @@ struct memcg_cache_params { union { struct { struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; struct list_head children; }; struct { diff --git a/mm/slab.h b/mm/slab.h index a08f01016a3f..9631bb27c772 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -201,6 +201,11 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. @@ -300,9 +305,14 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, } extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s); #else /* CONFIG_MEMCG && !CONFIG_SLOB */ +/* If !memcg, all caches are root. */ +#define slab_root_caches slab_caches +#define root_caches_node list + #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -347,6 +357,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } + +static inline void memcg_link_cache(struct kmem_cache *s) +{ +} + #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index c3bbeddaeaaf..274697e1a42a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -138,6 +138,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +LIST_HEAD(slab_root_caches); + void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.root_cache = NULL; @@ -183,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size) { struct memcg_cache_array *old, *new; - if (!is_root_cache(s)) - return 0; - new = kzalloc(sizeof(struct memcg_cache_array) + new_array_size * sizeof(void *), GFP_KERNEL); if (!new) @@ -209,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs) int ret = 0; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { + list_for_each_entry(s, &slab_root_caches, root_caches_node) { ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches @@ -222,10 +222,26 @@ int memcg_update_all_caches(int num_memcgs) return ret; } -static void unlink_memcg_cache(struct kmem_cache *s) +void memcg_link_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) { + list_add(&s->root_caches_node, &slab_root_caches); + } else { + list_add(&s->memcg_params.children_node, + &s->memcg_params.root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, + &s->memcg_params.memcg->kmem_caches); + } +} + +static void memcg_unlink_cache(struct kmem_cache *s) { - list_del(&s->memcg_params.children_node); - list_del(&s->memcg_params.kmem_caches_node); + if (is_root_cache(s)) { + list_del(&s->root_caches_node); + } else { + list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); + } } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -238,7 +254,7 @@ static inline void 
destroy_memcg_params(struct kmem_cache *s) { } -static inline void unlink_memcg_cache(struct kmem_cache *s) +static inline void memcg_unlink_cache(struct kmem_cache *s) { } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ @@ -285,7 +301,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, if (flags & SLAB_NEVER_MERGE) return NULL; - list_for_each_entry_reverse(s, &slab_caches, list) { + list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) { if (slab_unmergeable(s)) continue; @@ -369,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name, s->refcount = 1; list_add(&s->list, &slab_caches); + memcg_link_cache(s); out: if (err) return ERR_PTR(err); @@ -514,9 +531,8 @@ static int shutdown_cache(struct kmem_cache *s) if (__kmem_cache_shutdown(s) != 0) return -EBUSY; + memcg_unlink_cache(s); list_del(&s->list); - if (!is_root_cache(s)) - unlink_memcg_cache(s); if (s->flags & SLAB_DESTROY_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); @@ -596,10 +612,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } - list_add(&s->memcg_params.children_node, - &root_cache->memcg_params.children); - list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches); - /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -627,10 +639,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - + list_for_each_entry(s, &slab_root_caches, root_caches_node) { arr = rcu_dereference_protected(s->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); c = arr->entries[idx]; @@ -829,6 +838,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, create_boot_cache(s, name, size, flags); list_add(&s->list, &slab_caches); + memcg_link_cache(s); s->refcount = 1; return s; } @@ -1136,12 +1146,12 @@ static void print_slabinfo_header(struct seq_file *m) void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); - return seq_list_start(&slab_caches, *pos); + return seq_list_start(&slab_root_caches, *pos); } void *slab_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &slab_caches, pos); + return seq_list_next(p, &slab_root_caches, pos); } void slab_stop(struct seq_file *m, void *p) @@ -1193,12 +1203,11 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) static int slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node); - if (p == slab_caches.next) + if (p == slab_root_caches.next) print_slabinfo_header(m); - if (is_root_cache(s)) - cache_show(s, m); + cache_show(s, m); return 0; } diff --git a/mm/slub.c b/mm/slub.c index caac5456f0ec..03b012bcb5fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4127,6 +4127,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) } slab_init_memcg_params(s); list_add(&s->list, &slab_caches); + memcg_link_cache(s); return s; } -- cgit From c9fc586403e7c85eee06b2d5dea14ce71c00fcd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:27 -0800 Subject: slab: introduce __kmemcg_cache_deactivate() __kmem_cache_shrink() is called with %true @deactivate only for memcg caches. 
Remove @deactivate from __kmem_cache_shrink() and introduce __kmemcg_cache_deactivate() instead. Each memcg-supporting allocator should implement it and it should deactivate and drain the cache. This is to allow memcg cache deactivation behavior to further deviate from simple shrinking without messing up __kmem_cache_shrink(). This is pure reorganization and doesn't introduce any observable behavior changes. v2: Dropped unnecessary ifdef in mm/slab.h as suggested by Vladimir. Link: http://lkml.kernel.org/r/20170117235411.9408-8-tj@kernel.org Signed-off-by: Tejun Heo Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 11 +++++++++-- mm/slab.h | 3 ++- mm/slab_common.c | 4 ++-- mm/slob.c | 2 +- mm/slub.c | 39 ++++++++++++++++++++++----------------- 5 files changed, 36 insertions(+), 23 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 8a0e3392f181..bd63450a9b16 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2315,7 +2315,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; int node; @@ -2333,9 +2333,16 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) return (ret ? 1 : 0); } +#ifdef CONFIG_MEMCG +void __kmemcg_cache_deactivate(struct kmem_cache *cachep) +{ + __kmem_cache_shrink(cachep); +} +#endif + int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep, false); + return __kmem_cache_shrink(cachep); } void __kmem_cache_release(struct kmem_cache *cachep) diff --git a/mm/slab.h b/mm/slab.h index 9631bb27c772..7bff1ee513c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -167,7 +167,8 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *, bool); +int __kmem_cache_shrink(struct kmem_cache *); +void __kmemcg_cache_deactivate(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 274697e1a42a..59e41bb81575 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -646,7 +646,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmem_cache_shrink(c, true); + __kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -794,7 +794,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep, false); + ret = __kmem_cache_shrink(cachep); put_online_mems(); put_online_cpus(); return ret; diff --git a/mm/slob.c b/mm/slob.c index 5ec158054ffe..eac04d4357ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 03b012bcb5fa..8a4591526f37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3894,7 +3894,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. 
*/ -int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3906,21 +3906,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) unsigned long flags; int ret = 0; - if (deactivate) { - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - s->cpu_partial = 0; - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), - * so we have to make sure the change is visible. - */ - synchronize_sched(); - } - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3971,13 +3956,33 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) return ret; } +#ifdef CONFIG_MEMCG +void __kmemcg_cache_deactivate(struct kmem_cache *s) +{ + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), so + * we have to make sure the change is visible. + */ + synchronize_sched(); + + __kmem_cache_shrink(s); +} +#endif + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s, false); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; -- cgit From 01fb58bcba63f8fba37581c24c99e9a515dd0335 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:30 -0800 Subject: slab: remove synchronous synchronize_sched() from memcg cache deactivation path With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. slub uses synchronize_sched() to deactivate a memcg cache. synchronize_sched() is an expensive and slow operation and doesn't scale when a huge number of caches are destroyed back-to-back. While there used to be a simple batching mechanism, the batching was too restricted to be helpful. This patch implements slab_deactivate_memcg_cache_rcu_sched() which slub can use to schedule sched RCU callback instead of performing synchronize_sched() synchronously while holding cgroup_mutex. While this adds online cpus, mems and slab_mutex operations, operating on these locks back-to-back from the same kworker, which is what's gonna happen when there are many to deactivate, isn't expensive at all and this gets rid of the scalability problem completely. 
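As an illustration of the deferral mechanism described above, the following is a simplified, hypothetical sketch of the "sched RCU callback bounces to a work item" pattern. The identifiers (deferred_deact, deact_rcufn, ...) are made up for the example; the real implementation is slab_deactivate_memcg_cache_rcu_sched() in the diff below, which also uses call_rcu_sched().

/*
 * Hypothetical sketch: defer a blocking deactivation until after a sched
 * RCU grace period, without calling synchronize_sched() in the caller.
 */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

struct deferred_deact {
	void (*deact_fn)(struct deferred_deact *);
	union {			/* rcu_head and work are never live at once */
		struct rcu_head rcu;
		struct work_struct work;
	};
};

static void deact_workfn(struct work_struct *work)
{
	struct deferred_deact *d = container_of(work, struct deferred_deact, work);

	/* Process context: safe to take blocking locks (e.g. slab_mutex) here. */
	d->deact_fn(d);
}

static void deact_rcufn(struct rcu_head *head)
{
	struct deferred_deact *d = container_of(head, struct deferred_deact, rcu);

	/*
	 * RCU callbacks cannot block, so reuse the shared storage as a work
	 * item and punt the blocking part to a workqueue.
	 */
	INIT_WORK(&d->work, deact_workfn);
	schedule_work(&d->work);
}

/* Replaces a synchronous synchronize_sched() on the caller's side. */
static void schedule_deactivation(struct deferred_deact *d,
				  void (*fn)(struct deferred_deact *))
{
	d->deact_fn = fn;
	call_rcu_sched(&d->rcu, deact_rcufn);
}

The caller returns immediately after call_rcu_sched(); the grace period and the deactivation function both run later, off the hot path that used to stall on synchronize_sched().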
Link: http://lkml.kernel.org/r/20170117235411.9408-9-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 6 ++++++ mm/slab.h | 2 ++ mm/slab_common.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 12 +++++++---- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index af1a5bef80f4..3c37a8c51921 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -582,6 +582,12 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head children_node; struct list_head kmem_caches_node; + + void (*deact_fn)(struct kmem_cache *); + union { + struct rcu_head deact_rcu_head; + struct work_struct deact_work; + }; }; }; }; diff --git a/mm/slab.h b/mm/slab.h index 7bff1ee513c2..65e7c3fcac72 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -307,6 +307,8 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s); +extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG && !CONFIG_SLOB */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 59e41bb81575..c549296c7981 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -627,6 +627,66 @@ out_unlock: put_online_cpus(); } +static void kmemcg_deactivate_workfn(struct work_struct *work) +{ + struct kmem_cache *s = container_of(work, struct kmem_cache, + memcg_params.deact_work); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + s->memcg_params.deact_fn(s); + + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ + css_put(&s->memcg_params.memcg->css); +} + +static void kmemcg_deactivate_rcufn(struct rcu_head *head) +{ + struct kmem_cache *s = container_of(head, struct kmem_cache, + memcg_params.deact_rcu_head); + + /* + * We need to grab blocking locks. Bounce to ->deact_work. The + * work item shares the space with the RCU head and can't be + * initialized eariler. + */ + INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); + schedule_work(&s->memcg_params.deact_work); +} + +/** + * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a + * sched RCU grace period + * @s: target kmem_cache + * @deact_fn: deactivation function to call + * + * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex + * held after a sched RCU grace period. The slab is guaranteed to stay + * alive until @deact_fn is finished. This is to be used from + * __kmemcg_cache_deactivate(). 
+ */ +void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)) +{ + if (WARN_ON_ONCE(is_root_cache(s)) || + WARN_ON_ONCE(s->memcg_params.deact_fn)) + return; + + /* pin memcg so that @s doesn't get destroyed in the middle */ + css_get(&s->memcg_params.memcg->css); + + s->memcg_params.deact_fn = deact_fn; + call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); +} + void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) { int idx; diff --git a/mm/slub.c b/mm/slub.c index 8a4591526f37..62d0b557a596 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3957,6 +3957,12 @@ int __kmem_cache_shrink(struct kmem_cache *s) } #ifdef CONFIG_MEMCG +static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) +{ + /* called with all the locks held after a sched RCU grace period */ + __kmem_cache_shrink(s); +} + void __kmemcg_cache_deactivate(struct kmem_cache *s) { /* @@ -3968,11 +3974,9 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) /* * s->cpu_partial is checked locklessly (see put_cpu_partial), so - * we have to make sure the change is visible. + * we have to make sure the change is visible before shrinking. */ - synchronize_sched(); - - __kmem_cache_shrink(s); + slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } #endif -- cgit From 50862ce711b3e9cf8511df7a356892e128b037d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:33 -0800 Subject: slab: remove slub sysfs interface files early for empty memcg caches With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. Each cache has a number of sysfs interface files under /sys/kernel/slab. On a system with a lot of memory and transient memcgs, the number of interface files which have to be removed once memory reclaim kicks in can reach millions. Link: http://lkml.kernel.org/r/20170117235411.9408-10-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 62d0b557a596..af38aaad34b0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3959,8 +3959,20 @@ int __kmem_cache_shrink(struct kmem_cache *s) #ifdef CONFIG_MEMCG static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) { - /* called with all the locks held after a sched RCU grace period */ - __kmem_cache_shrink(s); + /* + * Called with all the locks held after a sched RCU grace period. + * Even if @s becomes empty after shrinking, we can't know that @s + * doesn't have allocations already in-flight and thus can't + * destroy @s until the associated memcg is released. + * + * However, let's remove the sysfs files for empty caches here. 
+ * Each cache has a lot of interface files which aren't + * particularly useful for empty draining caches; otherwise, we can + * easily end up with millions of unnecessary sysfs files on + * systems which have a lot of memory and transient cgroups. + */ + if (!__kmem_cache_shrink(s)) + sysfs_slab_remove(s); } void __kmemcg_cache_deactivate(struct kmem_cache *s) @@ -5659,6 +5671,15 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; + if (!s->kobj.state_in_sysfs) + /* + * For a memcg cache, this may be called during + * deactivation and again on shutdown. Remove only once. + * A cache is never shut down before deactivation is + * complete, so no need to worry about synchronization. + */ + return; + #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif -- cgit From 17cc4dfeda97636d67e83de8cd41940b65a93bc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:36 -0800 Subject: slab: use memcg_kmem_cache_wq for slab destruction operations If there's contention on slab_mutex, queueing the per-cache destruction work item on the system_wq can unnecessarily create and tie up a lot of kworkers. Rename memcg_kmem_cache_create_wq to memcg_kmem_cache_wq and make it global and use that workqueue for the destruction work items too. While at it, convert the workqueue from an unbound workqueue to a per-cpu one with concurrency limited to 1. It's generally preferable to use per-cpu workqueues and concurrency limit of 1 is safe enough. This is suggested by Joonsoo Kim. Link: http://lkml.kernel.org/r/20170117235411.9408-11-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + mm/memcontrol.c | 16 ++++++++-------- mm/slab_common.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9fcece9be85d..5af377303880 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -830,6 +830,7 @@ void memcg_kmem_uncharge(struct page *page, int order); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; +extern struct workqueue_struct *memcg_kmem_cache_wq; extern int memcg_nr_cache_ids; void memcg_get_cache_ids(void); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 834d641dfa8c..1fd6affcdde7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -317,6 +317,8 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); +struct workqueue_struct *memcg_kmem_cache_wq; + #endif /* !CONFIG_SLOB */ /** @@ -2143,8 +2145,6 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; -static struct workqueue_struct *memcg_kmem_cache_create_wq; - static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2176,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - queue_work(memcg_kmem_cache_create_wq, &cw->work); + queue_work(memcg_kmem_cache_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -5778,12 +5778,12 @@ static int __init mem_cgroup_init(void) #ifndef CONFIG_SLOB /* * Kmem cache creation is mostly done with the slab_mutex held, - * so use a special workqueue to avoid 
stalling all worker - * threads in case lots of cgroups are created simultaneously. + * so use a workqueue with limited concurrency to avoid stalling + * all worker threads in case lots of cgroups are created and + * destroyed simultaneously. */ - memcg_kmem_cache_create_wq = - alloc_ordered_workqueue("memcg_kmem_cache_create", 0); - BUG_ON(!memcg_kmem_cache_create_wq); + memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); + BUG_ON(!memcg_kmem_cache_wq); #endif cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, diff --git a/mm/slab_common.c b/mm/slab_common.c index c549296c7981..23ff74e61838 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -659,7 +659,7 @@ static void kmemcg_deactivate_rcufn(struct rcu_head *head) * initialized eariler. */ INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); - schedule_work(&s->memcg_params.deact_work); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); } /** -- cgit From 1663f26df3df7df3720306ca67f5ea8296d68fa1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:39 -0800 Subject: slub: make sysfs directories for memcg sub-caches optional SLUB creates a per-cache directory under /sys/kernel/slab which hosts a bunch of debug files. Usually, there aren't that many caches on a system and this doesn't really matter; however, if memcg is in use, each cache can have per-cgroup sub-caches. SLUB creates the same directories for these sub-caches under /sys/kernel/slab/$CACHE/cgroup. Unfortunately, because there can be a lot of cgroups, active or draining, the product of the numbers of caches, cgroups and files in each directory can reach a very high number - hundreds of thousands is commonplace. Millions and beyond aren't difficult to reach either. What's under /sys/kernel/slab is primarily for debugging and the information and control on the a root cache already cover its sub-caches. While having a separate directory for each sub-cache can be helpful for development, it doesn't make much sense to pay this amount of overhead by default. This patch introduces a boot parameter slub_memcg_sysfs which determines whether to create sysfs directories for per-memcg sub-caches. It also adds CONFIG_SLUB_MEMCG_SYSFS_ON which determines the boot parameter's default value and defaults to 0. [akpm@linux-foundation.org: kset_unregister(NULL) is legal] Link: http://lkml.kernel.org/r/20170204145203.GB26958@mtj.duckdns.org Signed-off-by: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ init/Kconfig | 14 +++++++++++++ mm/slub.c | 26 +++++++++++++++++++++++-- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d05533e6120b..986e44387dad 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3694,6 +3694,14 @@ last alloc / free. For more information see Documentation/vm/slub.txt. + slub_memcg_sysfs= [MM, SLUB] + Determines whether to enable sysfs directories for + memory cgroup sub-caches. 1 to enable, 0 to disable. + The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON. + Enabling this can lead to a very high number of debug + directories and files being created under + /sys/kernel/slub. 
+ slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory diff --git a/init/Kconfig b/init/Kconfig index 55bb6fbc294e..22f437ff65e5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1779,6 +1779,20 @@ config SLUB_DEBUG SLUB sysfs support. /sys/slab will not exist and there will be no support for cache validation etc. +config SLUB_MEMCG_SYSFS_ON + default n + bool "Enable memcg SLUB sysfs support by default" if EXPERT + depends on SLUB && SYSFS && MEMCG + help + SLUB creates a directory under /sys/kernel/slab for each + allocation cache to host info and debug files. If memory + cgroup is enabled, each cache can have per memory cgroup + caches. SLUB can create the same sysfs directories for these + caches under /sys/kernel/slab/CACHE/cgroup but it can lead + to a very high number of debug files being created. This is + controlled by slub_memcg_sysfs boot parameter and this + config option determines the parameter's default value. + config COMPAT_BRK bool "Disable heap randomization" default y diff --git a/mm/slub.c b/mm/slub.c index af38aaad34b0..7f4bc7027ed5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4708,6 +4708,22 @@ enum slab_stat_type { #define SO_OBJECTS (1 << SL_OBJECTS) #define SO_TOTAL (1 << SL_TOTAL) +#ifdef CONFIG_MEMCG +static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + +static int __init setup_slub_memcg_sysfs(char *str) +{ + int v; + + if (get_option(&str, &v) > 0) + memcg_sysfs_enabled = v; + + return 1; +} + +__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); +#endif + static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { @@ -5611,8 +5627,14 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; + struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + if (!kset) { + kobject_init(&s->kobj, &slab_ktype); + return 0; + } + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5629,7 +5651,7 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = cache_kset(s); + s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) goto out; @@ -5639,7 +5661,7 @@ static int sysfs_slab_add(struct kmem_cache *s) goto out_del_kobj; #ifdef CONFIG_MEMCG - if (is_root_cache(s)) { + if (is_root_cache(s) && memcg_sysfs_enabled) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { err = -ENOMEM; -- cgit From f8005451d75f4879a93b12c14b162dd60f81ec56 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 22 Feb 2017 15:41:42 -0800 Subject: tmpfs: change shmem_mapping() to test shmem_aops Callers of shmem_mapping() are interested in whether the mapping is swap backed - except for uprobes, which is interested in whether it should use shmem_read_mapping_page(). All these callers are better served by a shmem_mapping() which checks for shmem_aops, than the current version which goes through several indirections to find where the inode lives - and has the surprising effect that a private mmap of /dev/zero satisfies both vma_is_anonymous() and shmem_mapping(), when that device node is on devtmpfs. I don't think anything in the tree suffers from that surprise, but it caught me out, and is better fixed. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1612052148530.13021@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Oleg Nesterov Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3a7587a0314d..7d52cd4b504d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2175,10 +2175,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - if (!mapping->host) - return false; - - return mapping->host->i_sb->s_op == &shmem_ops; + return mapping->a_ops == &shmem_aops; } #ifdef CONFIG_TMPFS -- cgit From aa187507ef8bb3178a3312d851e8485bd81913c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:41:45 -0800 Subject: mm: throttle show_mem() from warn_alloc() Tetsuo has been stressing OOM killer path with many parallel allocation requests when he has noticed that it is not all that hard to swamp kernel logs with warn_alloc messages caused by allocation stalls. Even though the allocation stall message is triggered only once in 10s there might be many different tasks hitting it roughly around the same time. A big part of the output is show_mem() which can generate a lot of output even on a small machines. There is no reason to show the state of memory counter for each allocation stall, especially when multiple of them are reported in a short time period. Chances are that not much has changed since the last report. This patch simply rate limits show_mem called from warn_alloc to only dump something once per second. This should be enough to give us a clue why an allocation might be stalling while burst of warnings will not swamp log with too much data. While we are at it, extract all the show_mem related handling (filters) into a separate function warn_alloc_show_mem. This will make the code cleaner and as a bonus point we can distinguish which part of warn_alloc got throttled due to rate limiting as ___ratelimit dumps the caller. [akpm@linux-foundation.org: reduce scope of the ratelimit_states] Link: http://lkml.kernel.org/r/20161215101510.9030-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3e0c69a97b7..3c790ae4cb52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3007,18 +3007,12 @@ static inline bool should_suppress_show_mem(void) return ret; } -static DEFINE_RATELIMIT_STATE(nopage_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +static void warn_alloc_show_mem(gfp_t gfp_mask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - struct va_format vaf; - va_list args; + static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) return; /* @@ -3033,6 +3027,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; + show_mem(filter); +} + +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + pr_warn("%s: ", current->comm); va_start(args, fmt); @@ -3044,8 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); + warn_alloc_show_mem(gfp_mask); } static inline struct page * -- cgit From 76741e776a37973a3e398d504069b3e55c5cc866 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 22 Feb 2017 15:41:48 -0800 Subject: mm, page_alloc: don't convert pfn to idx when merging In __free_one_page() we do the buddy merging arithmetics on "page/buddy index", which is just the lower MAX_ORDER bits of pfn. The operations we do that affect the higher bits are bitwise AND and subtraction (in that order), where the final result will be the same with the higher bits left unmasked, as long as these bits are equal for both buddies - which must be true by the definition of a buddy. We can therefore use pfn's directly instead of "index" and skip the zeroing of >MAX_ORDER bits. This can help a bit by itself, although compiler might be smart enough already. It also helps the next patch to avoid page_to_pfn() for memory hole checks. Link: http://lkml.kernel.org/r/20161216120009.20064-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++-- mm/page_alloc.c | 31 ++++++++++++++----------------- mm/page_isolation.c | 8 ++++---- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 7aa2ea0a8623..bfad3b5d2665 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -133,9 +133,9 @@ struct alloc_context { * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) +__find_buddy_pfn(unsigned long page_pfn, unsigned int order) { - return page_idx ^ (1 << order); + return page_pfn ^ (1 << order); } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c790ae4cb52..49d40261f8c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -787,9 +787,8 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long page_idx; - unsigned long combined_idx; - unsigned long uninitialized_var(buddy_idx); + unsigned long combined_pfn; + unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; @@ -802,15 +801,13 @@ static inline void __free_one_page(struct page *page, if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - page_idx = pfn & ((1 << MAX_ORDER) - 1); - - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -824,9 +821,9 @@ continue_merging: 
zone->free_area[order].nr_free--; rmv_page_order(buddy); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; order++; } if (max_order < MAX_ORDER) { @@ -841,8 +838,8 @@ continue_merging: if (unlikely(has_isolate_pageblock(zone))) { int buddy_mt; - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -867,10 +864,10 @@ done_merging: */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a5594bfcc5ed..dadb7e74d7d6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) unsigned long flags, nr_pages; bool isolated_page = false; unsigned int order; - unsigned long page_idx, buddy_idx; + unsigned long pfn, buddy_pfn; struct page *buddy; zone = page_zone(page); @@ -102,9 +102,9 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (PageBuddy(page)) { order = page_order(page); if (order >= pageblock_order) { - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + pfn = page_to_pfn(page); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); if (pfn_valid_within(page_to_pfn(buddy)) && !is_migrate_isolate_page(buddy)) { -- cgit From 13ad59df67f19788f6c22985b1a33e466eceb643 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 22 Feb 2017 15:41:51 -0800 Subject: mm, page_alloc: avoid page_to_pfn() when merging buddies On architectures that allow memory holes, page_is_buddy() has to perform page_to_pfn() to check for the memory hole. After the previous patch, we have the pfn already available in __free_one_page(), which is the only caller of page_is_buddy(), so move the check there and avoid page_to_pfn(). Link: http://lkml.kernel.org/r/20161216120009.20064-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- mm/page_isolation.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 49d40261f8c4..af65c4eedc79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -714,7 +714,7 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && + * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. @@ -729,9 +729,6 @@ static inline void rmv_page_order(struct page *page) static inline int page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (!pfn_valid_within(page_to_pfn(buddy))) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -808,6 +805,9 @@ continue_merging: while (order < max_order - 1) { buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); + + if (!pfn_valid_within(buddy_pfn)) + goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -862,7 +862,7 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { struct page *higher_page, *higher_buddy; combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index dadb7e74d7d6..f4e17a57926a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -106,7 +106,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(page_to_pfn(buddy)) && + if (pfn_valid_within(buddy_pfn) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; -- cgit From 4583e77310a2edf39ee91099fba3cd153a22a77b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 22 Feb 2017 15:41:54 -0800 Subject: mm/vmalloc.c: use rb_entry_safe Use rb_entry_safe() instead of open-coding it. Link: http://lkml.kernel.org/r/81bb9820e5b9e4a1c596b3e76f88abf8c4a76cb0.1482221947.git.geliangtang@gmail.com Signed-off-by: Geliang Tang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ca82d44edd3..5f5b09e9dccd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2309,7 +2309,7 @@ EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { - return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; + return rb_entry_safe(n, struct vmap_area, rb_node); } /** -- cgit From aff28015fe5396b7f2bf7f55863949d391eb75fb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:41:57 -0800 Subject: mm, trace: extract COMPACTION_STATUS and ZONE_TYPE to a common header COMPACTION_STATUS resp. ZONE_TYPE are currently used to translate enum compact_result resp. struct zone index into their symbolic names for an easier post processing. The follow up patch would like to reuse this as well. 
The code involves some preprocessor black magic which is better not duplicated elsewhere so move it to a common mm tracing relate header. Link: http://lkml.kernel.org/r/20161220130135.15719-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 60 ++---------------------------------- include/trace/events/mmflags.h | 64 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 57 deletions(-) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index cbdb90b6b308..0a18ab6483ff 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -9,62 +9,6 @@ #include #include -#define COMPACTION_STATUS \ - EM( COMPACT_SKIPPED, "skipped") \ - EM( COMPACT_DEFERRED, "deferred") \ - EM( COMPACT_CONTINUE, "continue") \ - EM( COMPACT_SUCCESS, "success") \ - EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ - EM( COMPACT_COMPLETE, "complete") \ - EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ - EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ - EMe(COMPACT_CONTENDED, "contended") - -#ifdef CONFIG_ZONE_DMA -#define IFDEF_ZONE_DMA(X) X -#else -#define IFDEF_ZONE_DMA(X) -#endif - -#ifdef CONFIG_ZONE_DMA32 -#define IFDEF_ZONE_DMA32(X) X -#else -#define IFDEF_ZONE_DMA32(X) -#endif - -#ifdef CONFIG_HIGHMEM -#define IFDEF_ZONE_HIGHMEM(X) X -#else -#define IFDEF_ZONE_HIGHMEM(X) -#endif - -#define ZONE_TYPE \ - IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ - IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ - EM (ZONE_NORMAL, "Normal") \ - IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ - EMe(ZONE_MOVABLE,"Movable") - -/* - * First define the enums in the above macros to be exported to userspace - * via TRACE_DEFINE_ENUM(). - */ -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -COMPACTION_STATUS -ZONE_TYPE - -/* - * Now redefine the EM() and EMe() macros to map the enums to the strings - * that will be printed in the output. - */ -#undef EM -#undef EMe -#define EM(a, b) {a, b}, -#define EMe(a, b) {a, b} DECLARE_EVENT_CLASS(mm_compaction_isolate_template, @@ -187,6 +131,7 @@ TRACE_EVENT(mm_compaction_begin, __entry->sync ? "sync" : "async") ); +#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_end, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync, @@ -220,6 +165,7 @@ TRACE_EVENT(mm_compaction_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); +#endif TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -248,6 +194,7 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, __entry->prio) ); +#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, @@ -295,7 +242,6 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); -#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 15bf875d0e4a..75ed3220ede2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -1,3 +1,6 @@ +#include +#include +#include /* * The order of these masks is important. Matching masks will be seen * first and the left over flags will end up showing by themselves. 
@@ -171,3 +174,64 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ (flags) ? __print_flags(flags, "|", \ __def_vmaflag_names \ ) : "none" + +#ifdef CONFIG_COMPACTION +#define COMPACTION_STATUS \ + EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ + EM( COMPACT_CONTINUE, "continue") \ + EM( COMPACT_SUCCESS, "success") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ + EM( COMPACT_COMPLETE, "complete") \ + EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ + EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ + EMe(COMPACT_CONTENDED, "contended") +#else +#define COMPACTION_STATUS +#endif + +#ifdef CONFIG_ZONE_DMA +#define IFDEF_ZONE_DMA(X) X +#else +#define IFDEF_ZONE_DMA(X) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define IFDEF_ZONE_DMA32(X) X +#else +#define IFDEF_ZONE_DMA32(X) +#endif + +#ifdef CONFIG_HIGHMEM +#define IFDEF_ZONE_HIGHMEM(X) X +#else +#define IFDEF_ZONE_HIGHMEM(X) +#endif + +#define ZONE_TYPE \ + IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ + IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ + EM (ZONE_NORMAL, "Normal") \ + IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ + EMe(ZONE_MOVABLE,"Movable") + +/* + * First define the enums in the above macros to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +COMPACTION_STATUS +ZONE_TYPE + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} -- cgit From d379f01de09570e06d84b4b09e5f4951821a1dc8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:42:00 -0800 Subject: oom, trace: add oom detection tracepoints should_reclaim_retry is the central decision point for declaring the OOM. It might be really useful to expose data used for this decision making when debugging an unexpected oom situations. Say we have an OOM report: [ 52.264001] mem_eater invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_score_adj=0 [ 52.267549] CPU: 3 PID: 3148 Comm: mem_eater Tainted: G W 4.8.0-oomtrace3-00006-gb21338b386d2 #1024 Now we can check the tracepoint data to see how we have ended up in this situation: mem_eater-3148 [003] .... 52.432801: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11134 min_wmark=11084 no_progress_loops=1 wmark_check=1 mem_eater-3148 [003] .... 52.433269: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11103 min_wmark=11084 no_progress_loops=1 wmark_check=1 mem_eater-3148 [003] .... 52.433712: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11100 min_wmark=11084 no_progress_loops=2 wmark_check=1 mem_eater-3148 [003] .... 52.434067: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11097 min_wmark=11084 no_progress_loops=3 wmark_check=1 mem_eater-3148 [003] .... 52.434414: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11094 min_wmark=11084 no_progress_loops=4 wmark_check=1 mem_eater-3148 [003] .... 52.434761: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11091 min_wmark=11084 no_progress_loops=5 wmark_check=1 mem_eater-3148 [003] .... 52.435108: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11087 min_wmark=11084 no_progress_loops=6 wmark_check=1 mem_eater-3148 [003] .... 
52.435478: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11084 min_wmark=11084 no_progress_loops=7 wmark_check=0 mem_eater-3148 [003] .... 52.435478: reclaim_retry_zone: node=0 zone=DMA order=0 reclaimable=0 available=1126 min_wmark=179 no_progress_loops=7 wmark_check=0 The above shows that we can quickly deduce that the reclaim stopped making any progress (see no_progress_loops increased in each round) and while there were still some 51 reclaimable pages they couldn't be dropped for some reason (vmscan trace points would tell us more about that part). available will represent reclaimable + free_pages scaled down per no_progress_loops factor. This is essentially an optimistic estimate of how much memory we would have when reclaiming everything. This can be compared to min_wmark to get a rought idea but the wmark_check tells the result of the watermark check which is more precise (includes lowmem reserves, considers the order etc.). As we can see no zone is eligible in the end and that is why we have triggered the oom in this situation. Please note that higher order requests might fail on the wmark_check even when there is much more memory available than min_wmark - e.g. when the memory is fragmented. A follow up tracepoint will help to debug those situations. Link: http://lkml.kernel.org/r/20161220130135.15719-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/oom.h | 42 ++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 10 ++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 1e974983757e..9160da7a26a0 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -4,6 +4,7 @@ #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include +#include TRACE_EVENT(oom_score_adj_update, @@ -27,6 +28,47 @@ TRACE_EVENT(oom_score_adj_update, __entry->pid, __entry->comm, __entry->oom_score_adj) ); +TRACE_EVENT(reclaim_retry_zone, + + TP_PROTO(struct zoneref *zoneref, + int order, + unsigned long reclaimable, + unsigned long available, + unsigned long min_wmark, + int no_progress_loops, + bool wmark_check), + + TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), + + TP_STRUCT__entry( + __field( int, node) + __field( int, zone_idx) + __field( int, order) + __field( unsigned long, reclaimable) + __field( unsigned long, available) + __field( unsigned long, min_wmark) + __field( int, no_progress_loops) + __field( bool, wmark_check) + ), + + TP_fast_assign( + __entry->node = zone_to_nid(zoneref->zone); + __entry->zone_idx = zoneref->zone_idx; + __entry->order = order; + __entry->reclaimable = reclaimable; + __entry->available = available; + __entry->min_wmark = min_wmark; + __entry->no_progress_loops = no_progress_loops; + __entry->wmark_check = wmark_check; + ), + + TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", + __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), + __entry->order, + __entry->reclaimable, __entry->available, __entry->min_wmark, + __entry->no_progress_loops, + __entry->wmark_check) +); #endif /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af65c4eedc79..d20f8c3139bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c 
@@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -3468,6 +3469,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3478,8 +3481,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for -- cgit From 65190cff3cc108b72e42cce67ed8b73dbad6b731 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:42:03 -0800 Subject: oom, trace: add compaction retry tracepoint Higher order requests oom debugging is currently quite hard. We do have some compaction points which can tell us how the compaction is operating but there is no trace point to tell us about compaction retry logic. This patch adds a one which will have the following format bash-3126 [001] .... 1498.220001: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=withdrawn retries=0 max_retries=16 should_retry=0 we can see that the order 9 request is not retried even though we are in the highest compaction priority mode becase the last compaction attempt was withdrawn. This means that compaction_zonelist_suitable must have returned false and there is no suitable zone to compact for this request and so no need to retry further. another example would be <...>-3137 [001] .... 81.501689: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=failed retries=0 max_retries=16 should_retry=0 in this case the order-9 compaction failed to find any suitable block. We do not retry anymore because this is a costly request and those do not go below COMPACT_PRIO_SYNC_LIGHT priority. Link: http://lkml.kernel.org/r/20161220130135.15719-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmflags.h | 26 ++++++++++++++++++++++++++ include/trace/events/oom.h | 39 +++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 22 ++++++++++++++++------ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 75ed3220ede2..91554faed17e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -186,8 +186,32 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ EMe(COMPACT_CONTENDED, "contended") + +/* High-level compaction status feedback */ +#define COMPACTION_FAILED 1 +#define COMPACTION_WITHDRAWN 2 +#define COMPACTION_PROGRESS 3 + +#define compact_result_to_feedback(result) \ +({ \ + enum compact_result __result = result; \ + (compaction_failed(__result)) ? COMPACTION_FAILED : \ + (compaction_withdrawn(__result)) ? 
COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \ +}) + +#define COMPACTION_FEEDBACK \ + EM(COMPACTION_FAILED, "failed") \ + EM(COMPACTION_WITHDRAWN, "withdrawn") \ + EMe(COMPACTION_PROGRESS, "progress") + +#define COMPACTION_PRIORITY \ + EM(COMPACT_PRIO_SYNC_FULL, "COMPACT_PRIO_SYNC_FULL") \ + EM(COMPACT_PRIO_SYNC_LIGHT, "COMPACT_PRIO_SYNC_LIGHT") \ + EMe(COMPACT_PRIO_ASYNC, "COMPACT_PRIO_ASYNC") #else #define COMPACTION_STATUS +#define COMPACTION_PRIORITY +#define COMPACTION_FEEDBACK #endif #ifdef CONFIG_ZONE_DMA @@ -225,6 +249,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ #define EMe(a, b) TRACE_DEFINE_ENUM(a); COMPACTION_STATUS +COMPACTION_PRIORITY +COMPACTION_FEEDBACK ZONE_TYPE /* diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 9160da7a26a0..38baeb27221a 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -69,6 +69,45 @@ TRACE_EVENT(reclaim_retry_zone, __entry->no_progress_loops, __entry->wmark_check) ); + +#ifdef CONFIG_COMPACTION +TRACE_EVENT(compact_retry, + + TP_PROTO(int order, + enum compact_priority priority, + enum compact_result result, + int retries, + int max_retries, + bool ret), + + TP_ARGS(order, priority, result, retries, max_retries, ret), + + TP_STRUCT__entry( + __field( int, order) + __field( int, priority) + __field( int, result) + __field( int, retries) + __field( int, max_retries) + __field( bool, ret) + ), + + TP_fast_assign( + __entry->order = order; + __entry->priority = priority; + __entry->result = compact_result_to_feedback(result); + __entry->retries = retries; + __entry->max_retries = max_retries; + __entry->ret = ret; + ), + + TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", + __entry->order, + __print_symbolic(__entry->priority, COMPACTION_PRIORITY), + __print_symbolic(__entry->result, COMPACTION_FEEDBACK), + __entry->retries, __entry->max_retries, + __entry->ret) +); +#endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d20f8c3139bb..05c0a59323bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3197,6 +3197,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3218,8 +3221,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3231,8 +3236,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3241,12 +3248,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * -- cgit From e067eba5871c6922539dc1728699c14e6b22590f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:06 -0800 Subject: userfaultfd: document _IOR/_IOW Patch series "userfaultfd tmpfs/hugetlbfs/non-cooperative", v2 These userfaultfd features are finished and are ready for larger exposure in -mm and upstream merging. 1) tmpfs non present userfault 2) hugetlbfs non present userfault 3) non cooperative userfault for fork/madvise/mremap qemu development code is already exercising 2) and container postcopy live migration needs 3). 1) is not currently used but there's a self test and we know some qemu user for various reasons uses tmpfs as backing for KVM so it'll need it too to use postcopy live migration with tmpfs memory. All review feedback from the previous submit has been handled and the fixes are included. There's no outstanding issue AFIK. Upstream code just did a s/fe/vmf/ conversion in the page faults and this has been converted as well incrementally. In addition to the previous submits, this also wakes up stuck userfaults during UFFDIO_UNREGISTER. The non cooperative testcase actually reproduced this problem by getting stuck instead of quitting clean in some rare case as it could call UFFDIO_UNREGISTER while some userfault could be still in flight. The other option would have been to keep leaving it up to userland to serialize itself and to patch the testcase instead but the wakeup during unregister I think is preferable. David also asked the UFFD_FEATURE_MISSING_HUGETLBFS and UFFD_FEATURE_MISSING_SHMEM feature flags to be added so QEMU can avoid to probe if the hugetlbfs/shmem missing support is available by calling UFFDIO_REGISTER. QEMU already checks HUGETLBFS_MAGIC with fstatfs so if UFFD_FEATURE_MISSING_HUGETLBFS is also set, it knows UFFDIO_REGISTER will succeed (or if it fails, it's for some other more concerning reason). There's no reason to worry about adding too many feature flags. There are 64 available and worst case we've to bump the API if someday we're really going to run out of them. The round-trip network latency of hugetlbfs userfaults during postcopy live migration is still of the order of dozen milliseconds on 10GBit if at 2MB hugepage granularity so it's working perfectly and it should provide for higher bandwidth or lower CPU usage (which makes it interesting to add an option in the future to support THP granularity too for anonymous memory, UFFDIO_COPY would then have to create THP if alignment/len allows for it). 1GB hugetlbfs granularity will require big changes in hugetlbfs to work so it's deferred for later. This patch (of 42): This adds proper documentation (inline) to avoid the risk of further misunderstandings about the semantics of _IOW/_IOR and it also reminds whoever will bump the UFFDIO_API in the future, to change the two ioctl to _IOW. This was found while implementing strace support for those ioctl, otherwise we could have never found it by just reviewing kernel code and testing it. _IOC_READ or _IOC_WRITE alters nothing but the ioctl number itself, so it's only worth fixing if the UFFDIO_API is bumped someday. 
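For illustration only (not from this patch; the FOO_* names below are hypothetical), choosing the direction macro for a new ioctl follows directly from who fills in the payload:

    #include <linux/ioctl.h>
    #include <linux/types.h>

    struct foo_range {              /* hypothetical payload */
            __u64 start;
            __u64 len;
    };

    /* userland fills the range, the kernel only reads it -> _IOW */
    #define FOO_SET_RANGE   _IOW('F', 0x01, struct foo_range)

    /* the kernel fills the range, userland only reads it -> _IOR */
    #define FOO_GET_RANGE   _IOR('F', 0x02, struct foo_range)

With that convention, UFFDIO_UNREGISTER and UFFDIO_WAKE (where userland hands a range to the kernel) are clearly _IOW territory, which is exactly what the added comments in the hunk below ask to be fixed on the next API bump.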
Link: http://lkml.kernel.org/r/20161216144821.5183-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: "Dmitry V. Levin" Cc: Michael Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/ioctl.h | 10 +++++++++- include/uapi/linux/userfaultfd.h | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/uapi/asm-generic/ioctl.h b/include/uapi/asm-generic/ioctl.h index 7e7c11b52143..749b32fe5623 100644 --- a/include/uapi/asm-generic/ioctl.h +++ b/include/uapi/asm-generic/ioctl.h @@ -48,6 +48,9 @@ /* * Direction bits, which any architecture can choose to override * before including this file. + * + * NOTE: _IOC_WRITE means userland is writing and kernel is + * reading. _IOC_READ means userland is reading and kernel is writing. */ #ifndef _IOC_NONE @@ -72,7 +75,12 @@ #define _IOC_TYPECHECK(t) (sizeof(t)) #endif -/* used to create numbers */ +/* + * Used to create numbers. + * + * NOTE: _IOW means userland is writing and kernel is reading. _IOR + * means userland is reading and kernel is writing. + */ #define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) #define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) #define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9057d7af3ae1..94046b8aa6ad 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,6 +11,12 @@ #include +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ #define UFFD_API ((__u64)0xAA) /* * After implementing the respective features it will become: -- cgit From a4605a61d642fe9b9de050d356432ecac82cc701 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:09 -0800 Subject: userfaultfd: correct comment about UFFD_FEATURE_PAGEFAULT_FLAG_WP Minor comment correction. Link: http://lkml.kernel.org/r/20161216144821.5183-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 43953e03c356..cf1856cb25d2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -169,7 +169,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.address = address; if (flags & FAULT_FLAG_WRITE) /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE * was not set in a UFFD_EVENT_PAGEFAULT, it means it * was a read fault, otherwise if set it means it's -- cgit From 8474901a33d8a27958bfa99e78736863abd67874 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:12 -0800 Subject: userfaultfd: convert BUG() to WARN_ON_ONCE() Avoid BUG_ON()s and only WARN instead. This is just a cleanup, it can't make any runtime difference. This BUG_ON has never triggered and cannot trigger. 
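A minimal, self-contained sketch of the same pattern with made-up names (frob_*), purely to illustrate warning once and failing soft instead of calling BUG() on an "impossible" state:

    #include <linux/bug.h>
    #include <linux/errno.h>

    enum frob_state { FROB_IDLE, FROB_BUSY };

    struct frob {
            enum frob_state state;
    };

    static int frob_status(struct frob *f)
    {
            switch (f->state) {
            case FROB_IDLE:
                    return 0;
            case FROB_BUSY:
                    return -EBUSY;
            default:
                    WARN_ON_ONCE(1);        /* was: BUG() */
                    return -EINVAL;
            }
    }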
Link: http://lkml.kernel.org/r/20161216144821.5183-4-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cf1856cb25d2..71e917387ec3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -574,7 +574,8 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) ret = POLLIN; return ret; default: - BUG(); + WARN_ON_ONCE(1); + return POLLERR; } } -- cgit From a94720bf821dd63e72176da5f423ba7935dde67d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:15 -0800 Subject: userfaultfd: use vma_is_anonymous Cleanup the vma->vm_ops usage. Side note: it would be more robust if vma_is_anonymous() would also check that vm_flags hasn't VM_PFNMAP set. Link: http://lkml.kernel.org/r/20161216144821.5183-5-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 8 ++++---- mm/userfaultfd.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 71e917387ec3..7e8f0d60718d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -830,7 +830,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (cur->vm_ops) + if (!vma_is_anonymous(cur)) goto out_unlock; /* @@ -855,7 +855,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(vma->vm_ops); + BUG_ON(!vma_is_anonymous(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -981,7 +981,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (cur->vm_ops) + if (!vma_is_anonymous(cur)) goto out_unlock; found = true; @@ -995,7 +995,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(vma->vm_ops); + BUG_ON(!vma_is_anonymous(vma)); /* * Nothing to do: this vma is already registered into this diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af817e5060fb..9c2ed70ac78d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -197,7 +197,7 @@ retry: * FIXME: only allow copying on anonymous vmas, tmpfs should * be added. */ - if (dst_vma->vm_ops) + if (!vma_is_anonymous(dst_vma)) goto out_unlock; /* -- cgit From 6dcc27fd39437f77057322cf314cc545bf0e4863 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:18 -0800 Subject: userfaultfd: non-cooperative: Split the find_userfault() routine I will need one to lookup for userfaultfd_wait_queue-s in different wait queue Link: http://lkml.kernel.org/r/20161216144821.5183-6-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7e8f0d60718d..a209588043d3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -522,25 +522,30 @@ wakeup: } /* fault_pending_wqh.lock must be hold by the caller */ -static inline struct userfaultfd_wait_queue *find_userfault( - struct userfaultfd_ctx *ctx) +static inline struct userfaultfd_wait_queue *find_userfault_in( + wait_queue_head_t *wqh) { wait_queue_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(!spin_is_locked(&wqh->lock)); uwq = NULL; - if (!waitqueue_active(&ctx->fault_pending_wqh)) + if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&ctx->fault_pending_wqh.task_list, - typeof(*wq), task_list); + wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; } +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->fault_pending_wqh); +} + static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; -- cgit From 9cd75c3cd4c3d06aa0c4ed8ef5327d811a8b6cff Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:21 -0800 Subject: userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor The custom events are queued in ctx->event_wqh not to disturb the fast-path-ed PF queue-wait-wakeup functions. The events to be generated (other than PF-s) are requested in UFFD_API ioctl with the uffd_api.features bits. Those, known by the kernel, are then turned on and reported back to the user-space. Link: http://lkml.kernel.org/r/20161216144821.5183-7-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a209588043d3..b5074a344635 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -12,6 +12,7 @@ * mm/ksm.c (mm hashing). 
*/ +#include #include #include #include @@ -45,12 +46,16 @@ struct userfaultfd_ctx { wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ struct seqcount refile_seq; /* pseudo fd refcounting */ atomic_t refcount; /* userfaultfd syscall flags */ unsigned int flags; + /* features requested from the userspace */ + unsigned int features; /* state machine */ enum userfaultfd_state state; /* released */ @@ -142,6 +147,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); @@ -458,6 +465,59 @@ out: return ret; } +static int __maybe_unused userfaultfd_event_wait_completion( + struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + int ret = 0; + + ewq->ctx = ctx; + init_waitqueue_entry(&ewq->wq, current); + + spin_lock(&ctx->event_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->event_wqh, &ewq->wq); + for (;;) { + set_current_state(TASK_KILLABLE); + if (ewq->msg.event == 0) + break; + if (ACCESS_ONCE(ctx->released) || + fatal_signal_pending(current)) { + ret = -1; + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); + break; + } + + spin_unlock(&ctx->event_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + + spin_lock(&ctx->event_wqh.lock); + } + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->event_wqh.lock); + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. 
+ */ + + userfaultfd_ctx_put(ctx); + return ret; +} + +static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + ewq->msg.event = 0; + wake_up_locked(&ctx->event_wqh); + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -546,6 +606,12 @@ static inline struct userfaultfd_wait_queue *find_userfault( return find_userfault_in(&ctx->fault_pending_wqh); } +static inline struct userfaultfd_wait_queue *find_userfault_evt( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->event_wqh); +} + static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) { struct userfaultfd_ctx *ctx = file->private_data; @@ -577,6 +643,9 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) smp_mb(); if (waitqueue_active(&ctx->fault_pending_wqh)) ret = POLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = POLLIN; + return ret; default: WARN_ON_ONCE(1); @@ -641,6 +710,19 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, break; } spin_unlock(&ctx->fault_pending_wqh.lock); + + spin_lock(&ctx->event_wqh.lock); + uwq = find_userfault_evt(ctx); + if (uwq) { + *msg = uwq->msg; + + userfaultfd_event_complete(ctx, uwq); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->event_wqh.lock); + if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -1184,6 +1266,14 @@ out: return ret; } +static inline unsigned int uffd_ctx_features(__u64 user_features) +{ + /* + * For the current set of features the bits just coincide + */ + return (unsigned int)user_features; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -1202,19 +1292,21 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API || uffdio_api.features) { + if (uffdio_api.api != UFFD_API || + (uffdio_api.features & ~UFFD_API_FEATURES)) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = -EINVAL; goto out; } - uffdio_api.features = UFFD_API_FEATURES; + uffdio_api.features &= UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ctx->state = UFFD_STATE_RUNNING; + ctx->features = uffd_ctx_features(uffdio_api.features); ret = 0; out: return ret; @@ -1301,6 +1393,7 @@ static void init_once_userfaultfd_ctx(void *mem) init_waitqueue_head(&ctx->fault_pending_wqh); init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->event_wqh); init_waitqueue_head(&ctx->fd_wqh); seqcount_init(&ctx->refile_seq); } @@ -1341,6 +1434,7 @@ static struct file *userfaultfd_file_create(int flags) atomic_set(&ctx->refcount, 1); ctx->flags = flags; + ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; ctx->mm = current->mm; -- cgit From 656031445d5a855e1c13b291dedae32579d0f3f2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:24 -0800 Subject: userfaultfd: non-cooperative: report all available features to userland This will allow userland to probe all features available in the kernel. It will however only enable the requested features in the open userfaultfd context. 
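For context, a rough userspace sketch of the resulting probe flow (not part of the patch; plain libc plus the uapi header): request no features, read back everything the kernel advertises, and only the requested subset ends up enabled on this context.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    int main(void)
    {
            struct uffdio_api api;
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

            if (uffd < 0)
                    return 1;

            memset(&api, 0, sizeof(api));
            api.api = UFFD_API;
            api.features = 0;       /* request nothing, just probe */

            if (ioctl(uffd, UFFDIO_API, &api))
                    return 1;

            /* the kernel reports every feature it supports ... */
            printf("available features: 0x%llx\n",
                   (unsigned long long)api.features);
            /* ... but this context has only the requested (none) enabled */
            close(uffd);
            return 0;
    }

Since UFFDIO_API can only be issued once per context, a user that probed with features == 0 would open a fresh descriptor to actually enable the subset it wants.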
Link: http://lkml.kernel.org/r/20161216144821.5183-8-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index b5074a344635..87d31921b66c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1285,6 +1285,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; int ret; + __u64 features; ret = -EINVAL; if (ctx->state != UFFD_STATE_WAIT_API) @@ -1292,21 +1293,23 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API || - (uffdio_api.features & ~UFFD_API_FEATURES)) { + features = uffdio_api.features; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = -EINVAL; goto out; } - uffdio_api.features &= UFFD_API_FEATURES; + /* report all available features and ioctls to userland */ + uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ctx->state = UFFD_STATE_RUNNING; - ctx->features = uffd_ctx_features(uffdio_api.features); + /* only enable the requested features for this uffd context */ + ctx->features = uffd_ctx_features(features); ret = 0; out: return ret; -- cgit From 893e26e61d04eac974ded0c11e1647b335c8cb7b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:27 -0800 Subject: userfaultfd: non-cooperative: Add fork() event When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. Note, that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object is created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. [aarcange@redhat.com: build warning fix] Link: http://lkml.kernel.org/r/20161216144821.5183-10-aarcange@redhat.com Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 148 ++++++++++++++++++++++++++++++++++++++- include/linux/userfaultfd_k.h | 13 ++++ include/uapi/linux/userfaultfd.h | 15 ++-- kernel/fork.c | 10 ++- 4 files changed, 170 insertions(+), 16 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 87d31921b66c..6046e0b552b2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -64,6 +64,12 @@ struct userfaultfd_ctx { struct mm_struct *mm; }; +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -465,9 +471,8 @@ out: return ret; } -static int __maybe_unused userfaultfd_event_wait_completion( - struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) +static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) { int ret = 0; @@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, __remove_wait_queue(&ctx->event_wqh, &ewq->wq); } +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + atomic_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->state = UFFD_STATE_RUNNING; + ctx->features = octx->features; + ctx->released = false; + ctx->mm = vma->vm_mm; + atomic_inc(&ctx->mm->mm_users); + + userfaultfd_ctx_get(octx); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + return userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + int ret = 0; + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + if (!ret) + ret = dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } } +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, + struct userfaultfd_ctx *new, + struct uffd_msg *msg) +{ + int fd; + struct file *file; + unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; + + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | flags); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + + 
fd_install(fd, file); + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + + return 0; +} + static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. + */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); @@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (uwq) { *msg = uwq->msg; + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.task_list, &fork_event); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + userfaultfd_event_complete(ctx, uwq); spin_unlock(&ctx->event_wqh.lock); ret = 0; @@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->fd_wqh.lock); + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(ctx, fork_nctx, msg); + + if (!ret) { + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.task_list); + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + userfaultfd_event_complete(ctx, uwq); + } + spin_unlock(&ctx->event_wqh.lock); + } + } + return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 11b92b047a1e..79002bca1f43 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); } +extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); +extern void dup_userfaultfd_complete(struct list_head *); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return false; } +static inline int dup_userfaultfd(struct vm_area_struct *vma, + struct list_head *l) +{ + return 0; +} + +static inline void dup_userfaultfd_complete(struct list_head *l) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 94046b8aa6ad..c8953c84fdcc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,12 +18,7 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -/* - * After implementing the respective features it will become: - * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ - * UFFD_FEATURE_EVENT_FORK) - */ -#define UFFD_API_FEATURES (0) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -77,6 +72,10 @@ struct uffd_msg { __u64 address; } pagefault; + struct { + __u32 ufd; + } fork; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,9 +89,7 @@ struct uffd_msg { * Start at 0x12 and not at 0 to be more strict against bugs. 
*/ #define UFFD_EVENT_PAGEFAULT 0x12 -#if 0 /* not available yet */ #define UFFD_EVENT_FORK 0x13 -#endif /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -111,10 +108,8 @@ struct uffdio_api { * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. */ -#if 0 /* not available yet */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) -#endif __u64 features; __u64 ioctls; diff --git a/kernel/fork.c b/kernel/fork.c index ff82e24573b6..d12fcc4db8a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -678,6 +681,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -- cgit From d3aadc8ed4cb447981ecf34f9af71cddc6cf907d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:42:30 -0800 Subject: userfaultfd: non-cooperative: dup_userfaultfd: use mm_count instead of mm_users Since commit d2005e3f41d4 ("userfaultfd: don't pin the user memory in userfaultfd_file_create()") userfaultfd uses mm_count rather than mm_users to pin mm_struct. Make dup_userfaultfd consistent with this behaviour Link: http://lkml.kernel.org/r/20161216144821.5183-11-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 6046e0b552b2..27978f249016 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -558,7 +558,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->features = octx->features; ctx->released = false; ctx->mm = vma->vm_mm; - atomic_inc(&ctx->mm->mm_users); + atomic_inc(&ctx->mm->mm_count); userfaultfd_ctx_get(octx); fctx->orig = octx; -- cgit From 72f87654c69690ff4721bd9b4a39983f971de9a5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:34 -0800 Subject: userfaultfd: non-cooperative: add mremap() event The event denotes that an area [start:end] moves to different location. Length change isn't reported as "new" addresses, if they appear on the uffd reader side they will not contain any data and the latter can just zeromap them. Waiting for the event ACK is also done outside of mmap sem, as for fork event. 
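A rough sketch of the monitor side (not part of the patch; track_remap() is a hypothetical bookkeeping helper, the message layout is the one added to the uapi header below):

    #include <unistd.h>
    #include <linux/userfaultfd.h>

    /* hypothetical bookkeeping kept by the uffd monitor */
    extern void track_remap(__u64 from, __u64 to, __u64 len);

    static void handle_uffd_event(int uffd)
    {
            struct uffd_msg msg;

            if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
                    return;         /* spurious wakeup or error */

            if (msg.event == UFFD_EVENT_REMAP)
                    /*
                     * [from, from + len) moved to [to, to + len); any extra
                     * length is not reported and simply shows up later as
                     * missing-page faults that can be zeromapped.
                     */
                    track_remap(msg.arg.remap.from,
                                msg.arg.remap.to,
                                msg.arg.remap.len);
    }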
Link: http://lkml.kernel.org/r/20161216144821.5183-12-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 17 +++++++++++++++++ include/uapi/linux/userfaultfd.h | 11 ++++++++++- mm/mremap.c | 17 ++++++++++++----- 4 files changed, 76 insertions(+), 6 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 27978f249016..68f978beefac 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -596,6 +596,43 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx.ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + if (to & ~PAGE_MASK) { + userfaultfd_ctx_put(ctx); + return; + } + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 79002bca1f43..7f318a46044b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -55,6 +55,12 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); +extern void mremap_userfaultfd_prep(struct vm_area_struct *, + struct vm_userfaultfd_ctx *); +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx, + unsigned long from, unsigned long to, + unsigned long len); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -89,6 +95,17 @@ static inline void dup_userfaultfd_complete(struct list_head *l) { } +static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *ctx) +{ +} + +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx, + unsigned long from, + unsigned long to, + unsigned long len) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index c8953c84fdcc..79a85e5bd388 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -76,6 +77,12 @@ struct uffd_msg { __u32 ufd; } fork; + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,6 +97,7 @@ struct uffd_msg { */ #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -110,6 +118,7 @@ struct uffdio_api { */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) __u64 features; __u64 ioctls; diff --git a/mm/mremap.c b/mm/mremap.c index 30d7d2482eea..504b560c013c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, bool *locked) + unsigned long new_len, unsigned long new_addr, + bool *locked, struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_addr = new_addr; new_addr = err; } else { + mremap_userfaultfd_prep(new_vma, uf); arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked) + unsigned long new_addr, unsigned long new_len, bool *locked, + struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); if (!(offset_in_page(ret))) goto out; out1: @@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + &locked, &uf); goto out; } @@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, + &locked, &uf); } out: if (offset_in_page(ret)) { @@ -602,5 +608,6 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); + mremap_userfaultfd_complete(uf, addr, new_addr, old_len); return ret; } -- cgit From 90794bf19dc19691a16b71bcd75c04094d9e392d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:37 -0800 Subject: userfaultfd: 
non-cooperative: optimize mremap_userfaultfd_complete() Optimize the mremap_userfaultfd_complete() interface to pass only the vm_userfaultfd_ctx pointer through the stack as a microoptimization. Link: http://lkml.kernel.org/r/20161216144821.5183-13-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Hillf Danton Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++-- include/linux/userfaultfd_k.h | 4 ++-- mm/mremap.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 68f978beefac..5d37c37854b0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -608,11 +608,11 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } } -void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx, +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, unsigned long from, unsigned long to, unsigned long len) { - struct userfaultfd_ctx *ctx = vm_ctx.ctx; + struct userfaultfd_ctx *ctx = vm_ctx->ctx; struct userfaultfd_wait_queue ewq; if (!ctx) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 7f318a46044b..78ec197e8b47 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -57,7 +57,7 @@ extern void dup_userfaultfd_complete(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); -extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx, +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); @@ -100,7 +100,7 @@ static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, { } -static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx, +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long from, unsigned long to, unsigned long len) diff --git a/mm/mremap.c b/mm/mremap.c index 504b560c013c..8779928d6a70 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -608,6 +608,6 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); - mremap_userfaultfd_complete(uf, addr, new_addr, old_len); + mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); return ret; } -- cgit From 05ce77249d5068b057082d24ec22d3824f4816ac Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:40 -0800 Subject: userfaultfd: non-cooperative: add madvise() event for MADV_DONTNEED request If the page is punched out of the address space the uffd reader should know this and zeromap the respective area in case of the #PF event. Link: http://lkml.kernel.org/r/20161216144821.5183-14-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 28 ++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 12 ++++++++++++ include/uapi/linux/userfaultfd.h | 10 +++++++++- mm/madvise.c | 2 ++ 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5d37c37854b0..ea9008254df4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -633,6 +633,34 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } +void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + return; + + userfaultfd_ctx_get(ctx); + up_read(&mm->mmap_sem); + + *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_MADVDONTNEED; + ewq.msg.arg.madv_dn.start = start; + ewq.msg.arg.madv_dn.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + down_read(&mm->mmap_sem); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 78ec197e8b47..f431861f22f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,6 +61,11 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); +extern void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -106,6 +111,13 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long len) { } + +static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 79a85e5bd388..2bbf32319cf5 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,7 +19,8 @@ */ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP) + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -83,6 +84,11 @@ struct uffd_msg { __u64 len; } remap; + struct { + __u64 start; + __u64 end; + } madv_dn; + struct { /* unused reserved fields */ __u64 reserved1; @@ -98,6 +104,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_MADVDONTNEED 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -119,6 +126,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) __u64 features; __u64 ioctls; diff --git a/mm/madvise.c b/mm/madvise.c 
index 0e3828eae9f8..06ffb5a170de 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -477,6 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, return -EINVAL; zap_page_range(vma, start, end - start, NULL); + madvise_userfault_dontneed(vma, prev, start, end); return 0; } -- cgit From 0594f58dbd954f7747553c041d7cbbf9b6ef1947 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:43 -0800 Subject: userfaultfd: non-cooperative: avoid MADV_DONTNEED race condition MADV_DONTNEED must be notified to userland before the pages are zapped. This allows userland to immediately stop adding pages to the userfaultfd ranges before the pages are actually zapped or there could be non-zeropage leftovers as result of concurrent UFFDIO_COPY run in between zap_page_range and madvise_userfault_dontneed (both MADV_DONTNEED and UFFDIO_COPY runs under the mmap_sem for reading, so they can run concurrently). Link: http://lkml.kernel.org/r/20161216144821.5183-15-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 06ffb5a170de..ca75b8a01ba0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -477,8 +477,8 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - zap_page_range(vma, start, end - start, NULL); madvise_userfault_dontneed(vma, prev, start, end); + zap_page_range(vma, start, end - start, NULL); return 0; } -- cgit From 09fa5296a40d01e014b4851def20b975feb98fd5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:46 -0800 Subject: userfaultfd: non-cooperative: wake userfaults after UFFDIO_UNREGISTER Userfaults may still happen after the userfaultfd monitor thread received a UFFD_EVENT_MADVDONTNEED until UFFDIO_UNREGISTER is run. Wake any pending userfault within UFFDIO_UNREGISTER protected by the mmap_sem for writing, so they will not be reported to userland leading to UFFDIO_COPY returning -EINVAL (as the range was already unregistered) and they will not hang permanently either. Link: http://lkml.kernel.org/r/20161216144821.5183-16-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ea9008254df4..26e1ef00b63c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1302,6 +1302,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); + if (userfaultfd_missing(vma)) { + /* + * Wake any concurrent pending userfault while + * we unregister, so they will not hang + * permanently and it avoids userland to call + * UFFDIO_WAKE explicitly. 
+ */ + struct userfaultfd_wake_range range; + range.start = start; + range.len = vma_end - start; + wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, -- cgit From fa4d75c1de13299c61b5e18a1ae46bc00888b599 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:42:49 -0800 Subject: userfaultfd: hugetlbfs: add copy_huge_page_from_user for hugetlb userfaultfd support userfaultfd UFFDIO_COPY allows user level code to copy data to a page at fault time. The data is copied from user space to a newly allocated huge page. The new routine copy_huge_page_from_user performs this copy. Link: http://lkml.kernel.org/r/20161216144821.5183-17-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ mm/memory.c | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3787f047a098..6bc7f2c1a6d1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2424,6 +2424,9 @@ extern void clear_huge_page(struct page *page, extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, unsigned int pages_per_huge_page); +extern long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; diff --git a/mm/memory.c b/mm/memory.c index ececdc4a2892..4ade940d105c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4152,6 +4152,31 @@ void copy_user_huge_page(struct page *dst, struct page *src, copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } + +long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page) +{ + void *src = (void *)usr_src; + void *page_kaddr; + unsigned long i, rc = 0; + unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + + for (i = 0; i < pages_per_huge_page; i++) { + page_kaddr = kmap_atomic(dst_page + i); + rc = copy_from_user(page_kaddr, + (const void __user *)(src + i * PAGE_SIZE), + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + ret_val -= (PAGE_SIZE - rc); + if (rc) + break; + + cond_resched(); + } + return ret_val; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS -- cgit From 8fb5debc5fcd450470cdd789c2d80ef95ebb8cf4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:42:52 -0800 Subject: userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd support hugetlb_mcopy_atomic_pte is the low level routine that implements the userfaultfd UFFDIO_COPY command. It is based on the existing mcopy_atomic_pte routine with modifications for huge pages. Link: http://lkml.kernel.org/r/20161216144821.5183-18-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 7 +++++ mm/hugetlb.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 48c76d612d40..aab2fff3e269 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -81,6 +81,11 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -149,6 +154,8 @@ static inline void hugetlb_show_meminfo(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) +#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 static inline int dequeue_hwpoisoned_huge_page(struct page *page) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c7025c132670..dec628b26f59 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3948,6 +3948,87 @@ out_mutex: return ret; } +/* + * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with + * modifications for huge pages. + */ +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct hstate *h = hstate_vma(dst_vma); + pte_t _dst_pte; + spinlock_t *ptl; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) + goto out; + + ret = copy_huge_page_from_user(page, + (const void __user *) src_addr, + pages_per_huge_page(h)); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + set_page_huge_active(page); + + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); + spin_lock(ptl); + + ret = -EEXIST; + if (!huge_pte_none(huge_ptep_get(dst_pte))) + goto out_release_unlock; + + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + + _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = huge_pte_mkdirty(_dst_pte); + _dst_pte = pte_mkyoung(_dst_pte); + + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, + dst_vma->vm_flags & VM_WRITE); + hugetlb_count_add(pages_per_huge_page(h), dst_mm); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + ret = 0; +out: + return ret; +out_release_unlock: + spin_unlock(ptl); + put_page(page); + goto out; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, -- cgit From 60d4d2d2b40e44cd36bfb6049e8d9e2055a24f8a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:42:55 -0800 Subject: userfaultfd: hugetlbfs: add __mcopy_atomic_hugetlb for huge page UFFDIO_COPY __mcopy_atomic_hugetlb performs the UFFDIO_COPY operation for huge pages. It is based on the existing __mcopy_atomic routine for normal pages. Unlike normal pages, there is no huge page support for the UFFDIO_ZEROPAGE operation. Link: http://lkml.kernel.org/r/20161216144821.5183-19-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9c2ed70ac78d..ef0495bfd17a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -139,6 +141,183 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) return pmd; } +#ifdef CONFIG_HUGETLB_PAGE +/* + * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is + * called with mmap_sem held, it will release mmap_sem before returning. + */ +static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + ssize_t err; + pte_t *dst_pte; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + struct hstate *h; + unsigned long vma_hpagesize; + pgoff_t idx; + u32 hash; + struct address_space *mapping; + + /* + * There is no default zero huge page for all huge page sizes as + * supported by hugetlb. A PMD_SIZE huge pages may exist as used + * by THP. Since we can not reliably insert a zero page, this + * feature is not supported. 
+ */ + if (zeropage) { + up_read(&dst_mm->mmap_sem); + return -EINVAL; + } + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; + vma_hpagesize = vma_kernel_pagesize(dst_vma); + + /* + * Validate alignment based on huge page size + */ + err = -EINVAL; + if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + goto out_unlock; + +retry: + /* + * On routine entry dst_vma is set. If we had to drop mmap_sem and + * retry, dst_vma will be set to NULL and we must lookup again. + */ + if (!dst_vma) { + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) + goto out_unlock; + + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) + goto out_unlock; + + /* + * Make sure the vma is not shared, that the remaining dst + * range is both valid and fully within a single existing vma. + */ + if (dst_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + } + + if (WARN_ON(dst_addr & (vma_hpagesize - 1) || + (len - copied) & (vma_hpagesize - 1))) + goto out_unlock; + + /* + * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + /* + * Ensure the dst_vma has a anon_vma. + */ + err = -ENOMEM; + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + + h = hstate_vma(dst_vma); + + while (src_addr < src_start + len) { + pte_t dst_pteval; + + BUG_ON(dst_addr >= dst_start + len); + VM_BUG_ON(dst_addr & ~huge_page_mask(h)); + + /* + * Serialize via hugetlb_fault_mutex + */ + idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; + hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, + idx, dst_addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + err = -ENOMEM; + dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); + if (!dst_pte) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = -EEXIST; + dst_pteval = huge_ptep_get(dst_pte); + if (!huge_pte_none(dst_pteval)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, + dst_addr, src_addr, &page); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + err = copy_huge_page_from_user(page, + (const void __user *)src_addr, + pages_per_huge_page(h)); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + down_read(&dst_mm->mmap_sem); + + dst_vma = NULL; + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += vma_hpagesize; + src_addr += vma_hpagesize; + copied += vma_hpagesize; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) + put_page(page); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? 
copied : err; +} +#else /* !CONFIG_HUGETLB_PAGE */ +/* fail at build time if gcc attempts to use this */ +extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage); +#endif /* CONFIG_HUGETLB_PAGE */ + static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, @@ -181,6 +360,13 @@ retry: dst_start + len > dst_vma->vm_end) goto out_unlock; + /* + * If this is a HUGETLB vma, pass off to appropriate routine + */ + if (is_vm_hugetlb_page(dst_vma)) + return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, + src_start, len, zeropage); + /* * Be strict and only allow __mcopy_atomic on userfaultfd * registered ranges to prevent userland errors going -- cgit From 810a56b943e265bbabfcd5a8e54cb8d3b16cd6e4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:42:58 -0800 Subject: userfaultfd: hugetlbfs: fix __mcopy_atomic_hugetlb retry/error processing The new routine copy_huge_page_from_user() uses kmap_atomic() to map PAGE_SIZE pages. However, this prevents page faults in the subsequent call to copy_from_user(). This is OK in the case where the routine is called with mmap_sem held. However, in another case we want to allow page faults. So, add a new argument allow_pagefault to indicate if the routine should allow page faults. [dan.carpenter@oracle.com: unmap the correct pointer] Link: http://lkml.kernel.org/r/20170113082608.GA3548@mwanda [akpm@linux-foundation.org: kunmap() takes a page*, per Hugh] Link: http://lkml.kernel.org/r/20161216144821.5183-20-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Signed-off-by: Dan Carpenter Cc: "Dr.
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Hugh Dickins Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- mm/hugetlb.c | 2 +- mm/memory.c | 13 ++++++++++--- mm/userfaultfd.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bc7f2c1a6d1..c3e2be2b3296 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2426,7 +2426,8 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); extern long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, - unsigned int pages_per_huge_page); + unsigned int pages_per_huge_page, + bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dec628b26f59..5d20af921a30 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3973,7 +3973,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = copy_huge_page_from_user(page, (const void __user *) src_addr, - pages_per_huge_page(h)); + pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { diff --git a/mm/memory.c b/mm/memory.c index 4ade940d105c..d7676a68c80a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4155,7 +4155,8 @@ void copy_user_huge_page(struct page *dst, struct page *src, long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, - unsigned int pages_per_huge_page) + unsigned int pages_per_huge_page, + bool allow_pagefault) { void *src = (void *)usr_src; void *page_kaddr; @@ -4163,11 +4164,17 @@ long copy_huge_page_from_user(struct page *dst_page, unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; for (i = 0; i < pages_per_huge_page; i++) { - page_kaddr = kmap_atomic(dst_page + i); + if (allow_pagefault) + page_kaddr = kmap(dst_page + i); + else + page_kaddr = kmap_atomic(dst_page + i); rc = copy_from_user(page_kaddr, (const void __user *)(src + i * PAGE_SIZE), PAGE_SIZE); - kunmap_atomic(page_kaddr); + if (allow_pagefault) + kunmap(dst_page + i); + else + kunmap_atomic(page_kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ef0495bfd17a..09976745be23 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -274,7 +274,7 @@ retry: err = copy_huge_page_from_user(page, (const void __user *)src_addr, - pages_per_huge_page(h)); + pages_per_huge_page(h), true); if (unlikely(err)) { err = -EFAULT; goto out; -- cgit From 1a1aad8a9b7bd34f60cdf98cd7915f00ae892c45 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:01 -0800 Subject: userfaultfd: hugetlbfs: add userfaultfd hugetlb hook When processing a hugetlb fault for no page present, check the vma to determine if faults are to be handled via userfaultfd. If so, drop the hugetlb_fault_mutex and call handle_userfault(). Link: http://lkml.kernel.org/r/20161216144821.5183-21-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Acked-by: Hillf Danton Cc: "Dr. 
David Alan Gilbert" Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d20af921a30..a4b29054cc3f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int hugepages_treat_as_movable; @@ -3680,6 +3681,38 @@ retry: size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; + + /* + * Check for page in userfault range + */ + if (userfaultfd_missing(vma)) { + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = address, + .flags = flags, + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, + idx, address); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + ret = handle_userfault(&vmf, VM_UFFD_MISSING); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); -- cgit From cab350afcbc9c8a744e0d164d1c26560568f770b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:04 -0800 Subject: userfaultfd: hugetlbfs: allow registration of ranges containing huge pages Expand the userfaultfd_register/unregister routines to allow VM_HUGETLB vmas. huge page alignment checking is performed after a VM_HUGETLB vma is encountered. Also, since there is no UFFDIO_ZEROPAGE support for huge pages do not return that as a valid ioctl method for huge page ranges. Link: http://lkml.kernel.org/r/20161216144821.5183-22-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 55 ++++++++++++++++++++++++++++++++++++---- include/uapi/linux/userfaultfd.h | 3 +++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 26e1ef00b63c..5139d05f80e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -1058,6 +1059,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; + bool huge_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1108,6 +1110,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (vma->vm_start >= end) goto out_unlock; + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + /* * Search for not compatible vmas. * @@ -1116,6 +1129,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * on anonymous vmas). 
*/ found = false; + huge_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1124,8 +1138,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_is_anonymous(cur)) + if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) goto out_unlock; + /* + * If this vma contains ending address, and huge pages + * check alignment. + */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } /* * Check that this vma isn't already owned by a @@ -1138,6 +1165,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur)) + huge_pages = true; + found = true; } BUG_ON(!found); @@ -1149,7 +1182,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma)); + BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -1207,7 +1240,8 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(UFFD_API_RANGE_IOCTLS, + if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE : + UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; } @@ -1253,6 +1287,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (vma->vm_start >= end) goto out_unlock; + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + /* * Search for not compatible vmas. * @@ -1275,7 +1320,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_is_anonymous(cur)) + if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) goto out_unlock; found = true; @@ -1289,7 +1334,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma)); + BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); /* * Nothing to do: this vma is already registered into this diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 2bbf32319cf5..a3828a9bc16e 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -29,6 +29,9 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_HPAGE \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to -- cgit From 9903bd7b73ef0dec429e951009b36643eb0fe239 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:07 -0800 Subject: userfaultfd: hugetlbfs: add userfaultfd_hugetlb test Test userfaultfd hugetlb functionality by using the existing testing method (in userfaultfd.c). Instead of an anonymous memeory, a hugetlbfs file is mmap'ed private. In this way fallocate hole punch can be used to release pages. This is because madvise(MADV_DONTNEED) is not supported for huge pages. 
Use the same file, but create wrappers for allocating ranges and releasing pages. Compile userfaultfd.c with HUGETLB_TEST defined to produce an executable to test userfaultfd hugetlb functionality. Link: http://lkml.kernel.org/r/20161216144821.5183-23-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 4 + tools/testing/selftests/vm/run_vmtests | 13 +++ tools/testing/selftests/vm/userfaultfd.c | 161 +++++++++++++++++++++++++++---- 3 files changed, 161 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index bbab7f4664ac..0114aac78e1f 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -10,6 +10,7 @@ BINARIES += on-fault-limit BINARIES += thuge-gen BINARIES += transhuge-stress BINARIES += userfaultfd +BINARIES += userfaultfd_hugetlb BINARIES += mlock-random-test all: $(BINARIES) @@ -18,6 +19,9 @@ all: $(BINARIES) userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h $(CC) $(CFLAGS) -O2 -o $@ $< -lpthread +userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h + $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread + mlock-random-test: mlock-random-test.c $(CC) $(CFLAGS) -o $@ $< -lcap diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index e11968b3677e..14d697ee00c4 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -103,6 +103,19 @@ else echo "[PASS]" fi +echo "----------------------------" +echo "running userfaultfd_hugetlb" +echo "----------------------------" +# 258MB total huge pages == 128MB src and 128MB dst +./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi +rm -f $mnt/ufd_test_file + #cleanup umount $mnt rm -rf $mnt diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d77ed41b2094..3011711212ca 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -76,6 +76,10 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; #define BOUNCE_POLL (1<<3) static int bounces; +#ifdef HUGETLB_TEST +static int huge_fd; +static char *huge_fd_off0; +#endif static unsigned long long *count_verify; static int uffd, finished, *pipefd; static char *area_src, *area_dst; @@ -97,6 +101,69 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) +#ifndef HUGETLB_TEST + +#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE)) + +static int release_pages(char *rel_area) +{ + int ret = 0; + + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise"); + ret = 1; + } + + return ret; +} + +static void allocate_area(void **alloc_area) +{ + if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + *alloc_area = NULL; + } +} + +#else /* HUGETLB_TEST */ + +#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_HPAGE + +static int release_pages(char *rel_area) +{ + int ret = 0; + + if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + rel_area == huge_fd_off0 ? 
0 : + nr_pages * page_size, + nr_pages * page_size)) { + perror("fallocate"); + ret = 1; + } + + return ret; +} + + +static void allocate_area(void **alloc_area) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_HUGETLB, huge_fd, + *alloc_area == area_src ? 0 : + nr_pages * page_size); + if (*alloc_area == MAP_FAILED) { + fprintf(stderr, "mmap of hugetlbfs file failed\n"); + *alloc_area = NULL; + } + + if (*alloc_area == area_src) + huge_fd_off0 = *alloc_area; +} + +#endif /* HUGETLB_TEST */ + static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -384,10 +451,8 @@ static int stress(unsigned long *userfaults) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). */ - if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) { - perror("madvise"); + if (release_pages(area_src)) return 1; - } for (cpu = 0; cpu < nr_cpus; cpu++) { char c; @@ -425,16 +490,12 @@ static int userfaultfd_stress(void) int uffd_flags, err; unsigned long userfaults[nr_cpus]; - if (posix_memalign(&area, page_size, nr_pages * page_size)) { - fprintf(stderr, "out of memory\n"); + allocate_area((void **)&area_src); + if (!area_src) return 1; - } - area_src = area; - if (posix_memalign(&area, page_size, nr_pages * page_size)) { - fprintf(stderr, "out of memory\n"); + allocate_area((void **)&area_dst); + if (!area_dst) return 1; - } - area_dst = area; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); if (uffd < 0) { @@ -528,9 +589,7 @@ static int userfaultfd_stress(void) fprintf(stderr, "register failure\n"); return 1; } - expected_ioctls = (1 << _UFFDIO_WAKE) | - (1 << _UFFDIO_COPY) | - (1 << _UFFDIO_ZEROPAGE); + expected_ioctls = EXPECTED_IOCTLS; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { fprintf(stderr, @@ -562,10 +621,8 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. 
*/ - if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) { - perror("madvise 2"); + if (release_pages(area_dst)) return 1; - } /* bounce pass */ if (stress(userfaults)) @@ -606,6 +663,8 @@ static int userfaultfd_stress(void) return err; } +#ifndef HUGETLB_TEST + int main(int argc, char **argv) { if (argc < 3) @@ -632,6 +691,74 @@ int main(int argc, char **argv) return userfaultfd_stress(); } +#else /* HUGETLB_TEST */ + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +int main(int argc, char **argv) +{ + if (argc < 4) + fprintf(stderr, "Usage: \n"), + exit(1); + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + page_size = default_huge_page_size(); + if (!page_size) + fprintf(stderr, "Unable to determine huge page size\n"), + exit(2); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + fprintf(stderr, "Impossible to run this test\n"), exit(2); + nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / + nr_cpus; + if (!nr_pages_per_cpu) { + fprintf(stderr, "invalid MiB\n"); + fprintf(stderr, "Usage: \n"), exit(1); + } + bounces = atoi(argv[2]); + if (bounces <= 0) { + fprintf(stderr, "invalid bounces\n"); + fprintf(stderr, "Usage: \n"), exit(1); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755); + if (huge_fd < 0) { + fprintf(stderr, "Open of %s failed", argv[3]); + perror("open"); + exit(1); + } + if (ftruncate(huge_fd, 0)) { + fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); + perror("ftruncate"); + exit(1); + } + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#endif #else /* __NR_userfaultfd */ #warning "missing __NR_userfaultfd definition" -- cgit From 369cd2121be440543280b91056de187f625d0dbb Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:10 -0800 Subject: userfaultfd: hugetlbfs: userfaultfd_huge_must_wait for hugepmd ranges Add routine userfaultfd_huge_must_wait which has the same functionality as the existing userfaultfd_must_wait routine. Only difference is that new routine must handle page table structure for hugepmd vmas. Link: http://lkml.kernel.org/r/20161216144821.5183-24-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5139d05f80e6..11b5e65e8fba 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -202,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address, return msg; } +#ifdef CONFIG_HUGETLB_PAGE +/* + * Same functionality as userfaultfd_must_wait below with modifications for + * hugepmd ranges. 
+ */ +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pte = huge_pte_offset(mm, address); + if (!pte) + goto out; + + ret = false; + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + if (huge_pte_none(*pte)) + ret = true; + if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; +out: + return ret; +} +#else +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + return false; /* should never get here */ +} +#endif /* CONFIG_HUGETLB_PAGE */ + /* * Verify the pagetables are still not ok after having reigstered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any @@ -378,8 +421,12 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + if (!is_vm_hugetlb_page(vmf->vma)) + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); + else + must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + vmf->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && -- cgit From 87ffc118b54dcd4cc642723603d944673248152f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:13 -0800 Subject: userfaultfd: hugetlbfs: gup: support VM_FAULT_RETRY Add support for VM_FAULT_RETRY to follow_hugetlb_page() so that get_user_pages_unlocked/locked and "nonblocking/FOLL_NOWAIT" features will work on hugetlbfs. This is required for fully functional userfaultfd non-present support on hugetlbfs. Link: http://lkml.kernel.org/r/20161216144821.5183-25-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Mike Kravetz Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- mm/gup.c | 2 +- mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aab2fff3e269..503099d8aada 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -65,7 +65,8 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, - unsigned long *, unsigned long *, long, unsigned int); + unsigned long *, unsigned long *, long, unsigned int, + int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); void __unmap_hugepage_range_final(struct mmu_gather *tlb, @@ -136,7 +137,7 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } -#define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) +#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) static inline void hugetlb_report_meminfo(struct seq_file *m) diff --git a/mm/gup.c b/mm/gup.c index 55315555489d..40abe4c90383 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags); + gup_flags, nonblocking); continue; } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a4b29054cc3f..f6c7ff316daf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4065,7 +4065,7 @@ out_release_unlock: long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags) + long i, unsigned int flags, int *nonblocking) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -4128,16 +4128,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { int ret; + unsigned int fault_flags = 0; if (pte) spin_unlock(ptl); - ret = hugetlb_fault(mm, vma, vaddr, - (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - if (!(ret & VM_FAULT_ERROR)) - continue; - - remainder = 0; - break; + if (flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT; + if (flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & + FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } + ret = hugetlb_fault(mm, vma, vaddr, fault_flags); + if (ret & VM_FAULT_ERROR) { + remainder = 0; + break; + } + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + *nr_pages = 0; + /* + * VM_FAULT_RETRY must not return an + * error, it will return zero + * instead. + * + * No need to update "position" as the + * caller will not check it after + * *nr_pages is set to 0. 
+ */ + return i; + } + continue; } pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; @@ -4166,6 +4193,11 @@ same_page: spin_unlock(ptl); } *nr_pages = remainder; + /* + * setting position is actually required only if remainder is + * not zero but it's faster not to add a "if (remainder)" + * branch. + */ *position = vaddr; return i ? i : -EFAULT; -- cgit From 21205bf8f77b23484966cb057ceaec860cc400b3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:16 -0800 Subject: userfaultfd: hugetlbfs: reserve count on error in __mcopy_atomic_hugetlb If __mcopy_atomic_hugetlb exits with an error, put_page will be called if a huge page was allocated and needs to be freed. If a reservation was associated with the huge page, the PagePrivate flag will be set. Clear PagePrivate before calling put_page/free_huge_page so that the global reservation count is not incremented. Link: http://lkml.kernel.org/r/20161216144821.5183-26-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 09976745be23..31207b47ea92 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -301,8 +301,23 @@ retry: out_unlock: up_read(&dst_mm->mmap_sem); out: - if (page) + if (page) { + /* + * We encountered an error and are about to free a newly + * allocated huge page. It is possible that there was a + * reservation associated with the page that has been + * consumed. See the routine restore_reserve_on_error + * for details. Unfortunately, we can not call + * restore_reserve_on_error now as it would require holding + * mmap_sem. Clear the PagePrivate flag so that the global + * reserve count will not be incremented in free_huge_page. + * The reservation map will still indicate the reservation + * was consumed and possibly prevent later page allocation. + * This is better than leaking a global reservation. + */ + ClearPagePrivate(page); put_page(page); + } BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); -- cgit From 163e11bc4f6ebbfcfdf751c108bd212a26e492ee Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:19 -0800 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_HUGETLBFS Userland developers asked to be notified immediately by the UFFDIO_API ioctl if hugetlbfs missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a hugetlbfs virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-27-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index a3828a9bc16e..7293321abdfb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,9 +18,10 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_MISSING_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -125,11 +126,32 @@ struct uffdio_api { * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) __u64 features; __u64 ioctls; -- cgit From ba6907db6de17cf90c4fc67f3eb1d1d37dd0b7ac Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:22 -0800 Subject: userfaultfd: introduce vma_can_userfault Check whether a VMA can be used with userfault in more compact way Link: http://lkml.kernel.org/r/20161216144821.5183-28-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Acked-by: Hillf Danton Cc: "Dr. 
David Alan Gilbert" Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 11b5e65e8fba..593135c296bc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1096,6 +1096,11 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } +static inline bool vma_can_userfault(struct vm_area_struct *vma) +{ + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1185,7 +1190,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) + if (!vma_can_userfault(cur)) goto out_unlock; /* * If this vma contains ending address, and huge pages @@ -1229,7 +1234,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); + BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -1367,7 +1372,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) + if (!vma_can_userfault(cur)) goto out_unlock; found = true; @@ -1381,7 +1386,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); + BUG_ON(!vma_can_userfault(vma)); /* * Nothing to do: this vma is already registered into this -- cgit From 4c27fe4c4c84f3afd504ecff2420cc1ad420d38e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:25 -0800 Subject: userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support shmem_mcopy_atomic_pte is the low level routine that implements the userfaultfd UFFDIO_COPY command. It is based on the existing mcopy_atomic_pte routine with modifications for shared memory pages. Link: http://lkml.kernel.org/r/20161216144821.5183-29-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 11 +++++ mm/shmem.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff078e7043b6..fdaac9d4d46d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -124,4 +124,15 @@ static inline bool shmem_huge_enabled(struct vm_area_struct *vma) } #endif +#ifdef CONFIG_SHMEM +extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep); +#else +#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) +#endif + #endif diff --git a/mm/shmem.c b/mm/shmem.c index 7d52cd4b504d..14de2a9e5083 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -70,6 +70,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include @@ -2178,6 +2179,115 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct mem_cgroup *memcg; + spinlock_t *ptl; + void *page_kaddr; + struct page *page; + pte_t _dst_pte, *dst_pte; + int ret; + + if (!*pagep) { + ret = -ENOMEM; + if (shmem_acct_block(info->flags, 1)) + goto out; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) + goto out_unacct_blocks; + percpu_counter_inc(&sbinfo->used_blocks); + } + + page = shmem_alloc_page(gfp, info, pgoff); + if (!page) + goto out_dec_used_blocks; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + /* don't free the page */ + return -EFAULT; + } + } else { + page = *pagep; + *pagep = NULL; + } + + ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + if (ret) + goto out_release; + + ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + if (!ret) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); + radix_tree_preload_end(); + } + if (ret) + goto out_release_uncharge; + + mem_cgroup_commit_charge(page, memcg, false, false); + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + __SetPageUptodate(page); + + lru_cache_add_anon(page); + + spin_lock(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + inc_mm_counter(dst_mm, mm_counter_file(page)); + page_add_file_rmap(page, false); + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate 
- it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + unlock_page(page); + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); +out_release_uncharge: + mem_cgroup_cancel_charge(page, memcg, false); +out_release: + put_page(page); +out_dec_used_blocks: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +out_unacct_blocks: + shmem_unacct_blocks(info->flags, 1); + goto out; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -- cgit From b0506e488da5cf2f07f3a4f6d7acaa8f459ad714 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:28 -0800 Subject: userfaultfd: shmem: introduce vma_is_shmem Currently userfault relies on vma_is_anonymous and vma_is_hugetlb to ensure compatibility of a VMA with userfault. Introduction of vma_is_shmem allows detection if tmpfs backed VMAs, so that they may be used with userfaultfd. Current implementation presumes usage of vma_is_shmem only by slow path routines in userfaultfd, therefore the vma_is_shmem is not made inline to leave the few remaining free bits in vm_flags. Link: http://lkml.kernel.org/r/20161216144821.5183-30-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 ++++++++++ mm/shmem.c | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c3e2be2b3296..bb997493e15d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1383,6 +1383,16 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +#ifdef CONFIG_SHMEM +/* + * The vma_is_shmem is not inline because it is used only by slow + * paths in userfault. + */ +bool vma_is_shmem(struct vm_area_struct *vma); +#else +static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +#endif + static inline int stack_guard_page_start(struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/shmem.c b/mm/shmem.c index 14de2a9e5083..8412baec17cd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -191,6 +191,11 @@ static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_vm_ops; +} + static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -- cgit From 95cc09d66f523895635eca8aa1f9bc7419e60e32 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:31 -0800 Subject: userfaultfd: shmem: add tlbflush.h header for microblaze It resolves this build error: All errors (new ones prefixed by >>): mm/shmem.c: In function 'shmem_mcopy_atomic_pte': >> mm/shmem.c:2228:2: error: implicit declaration of function 'update_mmu_cache' [-Werror=implicit-function-declaration] update_mmu_cache(dst_vma, dst_addr, dst_pte); microblaze may have to be also updated to define it in asm/pgtable.h like the other archs, then this header inclusion can be removed. Link: http://lkml.kernel.org/r/20161216144821.5183-31-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 8412baec17cd..26c332c84e14 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,6 +34,8 @@ #include #include +#include /* for arch/microblaze update_mmu_cache() */ + static struct vfsmount *shm_mnt; #ifdef CONFIG_SHMEM -- cgit From 26071cedc519b822f69cc42dba9be969d2cdeb19 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:34 -0800 Subject: userfaultfd: shmem: use shmem_mcopy_atomic_pte for shared memory The shmem_mcopy_atomic_pte implements low lever part of UFFDIO_COPY operation for shared memory VMAs. It's based on mcopy_atomic_pte with adjustments necessary for shared memory pages. Link: http://lkml.kernel.org/r/20161216144821.5183-32-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 31207b47ea92..a0817cc470b0 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "internal.h" @@ -369,7 +370,9 @@ retry: */ err = -EINVAL; dst_vma = find_vma(dst_mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + if (!dst_vma) + goto out_unlock; + if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED) goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) @@ -394,11 +397,7 @@ retry: if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; - /* - * FIXME: only allow copying on anonymous vmas, tmpfs should - * be added. - */ - if (!vma_is_anonymous(dst_vma)) + if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; /* @@ -407,7 +406,7 @@ retry: * dst_vma. */ err = -ENOMEM; - if (unlikely(anon_vma_prepare(dst_vma))) + if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { @@ -444,12 +443,21 @@ retry: BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - if (!zeropage) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, &page); - else - err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, - dst_addr); + if (vma_is_anonymous(dst_vma)) { + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, + &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } else { + err = -EINVAL; /* if zeropage is true return -EINVAL */ + if (likely(!zeropage)) + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, + dst_vma, dst_addr, + src_addr, &page); + } cond_resched(); -- cgit From cfda05267f7bd02b5ae5ac6a37fbbdf3b9c41b57 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:37 -0800 Subject: userfaultfd: shmem: add userfaultfd hook for shared memory faults When processing a page fault in shared memory area for not present page, check the VMA determine if faults are to be handled by userfaultfd. If so, delegate the page fault to handle_userfault. 
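For context, once shmem_getpage_gfp() delegates a missing-page fault to handle_userfault(), the faulting thread sleeps until the monitoring process resolves the page. A rough monitor-side sketch follows (not part of this patch; the function name is made up, and uffd, src_page and page_size are assumed to be the userfaultfd descriptor, a prepared source buffer and the page size from the caller's setup, much like in the selftest further below):

#include <errno.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/*
 * Sketch only: read one event from the userfaultfd and resolve a
 * missing-page fault on the registered shmem area with UFFDIO_COPY.
 * src_page is a prepared, page_size-sized buffer holding the contents
 * the faulting address should see.
 */
static int resolve_one_fault(int uffd, char *src_page,
			     unsigned long page_size)
{
	struct uffd_msg msg;
	struct uffdio_copy copy;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;			/* short read or error */
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return 0;			/* not a fault, nothing to copy */

	/* align the faulting address down to the page boundary */
	copy.dst = msg.arg.pagefault.address & ~((__u64)page_size - 1);
	copy.src = (unsigned long) src_page;
	copy.len = page_size;
	copy.mode = 0;
	copy.copy = 0;
	/* -EEXIST only means another thread already filled the page */
	if (ioctl(uffd, UFFDIO_COPY, &copy) && copy.copy != -EEXIST)
		return -1;
	return 0;
}
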
Link: http://lkml.kernel.org/r/20161216144821.5183-33-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 26c332c84e14..ab6644194fee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -72,6 +72,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include @@ -118,13 +119,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); + gfp_t gfp, struct vm_area_struct *vma, + struct vm_fault *vmf, int *fault_type); int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL); + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -1578,7 +1580,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, int *fault_type) + struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1632,7 +1634,7 @@ repeat: * bring it back from swap or allocate. */ sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = fault_mm ? : current->mm; + charge_mm = vma ? vma->vm_mm : current->mm; if (swap.val) { /* Look it up and read it in.. */ @@ -1642,7 +1644,8 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(charge_mm, + PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); @@ -1711,6 +1714,11 @@ repeat: swap_free(swap); } else { + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } + /* shmem_symlink() */ if (mapping->a_ops != &shmem_aops) goto alloc_nohuge; @@ -1973,7 +1981,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) sgp = SGP_NOHUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, - gfp, vma->vm_mm, &ret); + gfp, vma, vmf, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ret; @@ -4254,7 +4262,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, BUG_ON(mapping->a_ops != &shmem_aops); error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, - gfp, NULL, NULL); + gfp, NULL, NULL, NULL); if (error) page = ERR_PTR(error); else -- cgit From cac673292b9b39493bb0ff526b96c83ace6fdcd0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:40 -0800 Subject: userfaultfd: shmem: allow registration of shared memory ranges Expand the userfaultfd_register/unregister routines to allow shared memory VMAs. Currently, there is no UFFDIO_ZEROPAGE and write-protection support for shared memory VMAs, which is reflected in ioctl methods supported by uffdio_register. 
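In practice userland registers the shmem range as before and learns from the returned ioctls bitmap that only the basic operations are offered for it. A minimal sketch, assuming uffd is an already initialised userfaultfd descriptor (the helper name is made up):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/*
 * Sketch only: register a shmem-backed range in missing mode and check
 * which range ioctls the kernel advertises for it.  After this patch a
 * shmem range reports UFFD_API_RANGE_IOCTLS_BASIC, i.e. _UFFDIO_WAKE
 * and _UFFDIO_COPY but not _UFFDIO_ZEROPAGE.
 */
static int register_shmem_range(int uffd, void *area, unsigned long len)
{
	struct uffdio_register reg;

	reg.range.start = (unsigned long) area;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return -1;

	/* shmem ranges are not expected to offer zeropage support */
	if (!(reg.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)))
		printf("UFFDIO_ZEROPAGE not offered for this range\n");
	return 0;
}
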
Link: http://lkml.kernel.org/r/20161216144821.5183-34-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 21 +++++++-------------- include/uapi/linux/userfaultfd.h | 2 +- tools/testing/selftests/vm/userfaultfd.c | 2 +- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 593135c296bc..18406158e13f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1098,7 +1098,8 @@ static __always_inline int validate_range(struct mm_struct *mm, static inline bool vma_can_userfault(struct vm_area_struct *vma) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma); + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1111,7 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; - bool huge_pages; + bool non_anon_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1175,13 +1176,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; - huge_pages = false; + non_anon_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1220,8 +1217,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Note vmas containing huge pages */ - if (is_vm_hugetlb_page(cur)) - huge_pages = true; + if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) + non_anon_pages = true; found = true; } @@ -1292,7 +1289,7 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE : + if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; @@ -1352,10 +1349,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). 
*/ found = false; ret = -EINVAL; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7293321abdfb..10631a4cdb24 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -30,7 +30,7 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_HPAGE \ +#define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 3011711212ca..d753a9161411 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -129,7 +129,7 @@ static void allocate_area(void **alloc_area) #else /* HUGETLB_TEST */ -#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_HPAGE +#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC static int release_pages(char *rel_area) { -- cgit From 1c9e8def43a3452e7af658b340f5f4f4ecde5c38 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:43 -0800 Subject: userfaultfd: hugetlbfs: add UFFDIO_COPY support for shared mappings When userfaultfd hugetlbfs support was originally added, it followed the pattern of anon mappings and did not support any vmas marked VM_SHARED. As such, support was only added for private mappings. Remove this limitation and support shared mappings. The primary functional change required is adding pages to the page cache. More subtle changes are required for huge page reservation handling in error paths. A lengthy comment in the code describes the reservation handling. [mike.kravetz@oracle.com: update] Link: http://lkml.kernel.org/r/c9c8cafe-baa7-05b4-34ea-1dfa5523a85f@oracle.com Link: http://lkml.kernel.org/r/1487195210-12839-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Andrea Arcangeli Cc: Andrew Morton Cc: Mike Rapoport Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 ++++++++++++++++++-- mm/userfaultfd.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f6c7ff316daf..30e7709a5121 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3992,6 +3992,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, unsigned long src_addr, struct page **pagep) { + int vm_shared = dst_vma->vm_flags & VM_SHARED; struct hstate *h = hstate_vma(dst_vma); pte_t _dst_pte; spinlock_t *ptl; @@ -4028,6 +4029,18 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); set_page_huge_active(page); + /* + * If shared, add to page cache + */ + if (vm_shared) { + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + ret = huge_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; + } + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); spin_lock(ptl); @@ -4035,8 +4048,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none(huge_ptep_get(dst_pte))) goto out_release_unlock; - ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + if (vm_shared) { + page_dup_rmap(page, true); + } else { + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + } _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); if (dst_vma->vm_flags & VM_WRITE) @@ -4053,11 +4070,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + if (vm_shared) + unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); +out_release_nounlock: + if (vm_shared) + unlock_page(page); put_page(page); goto out; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a0817cc470b0..1e5c2f94e8a3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -154,6 +154,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long len, bool zeropage) { + int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; + int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; @@ -204,14 +206,14 @@ retry: goto out_unlock; /* - * Make sure the vma is not shared, that the remaining dst - * range is both valid and fully within a single existing vma. + * Make sure the remaining dst range is both valid and + * fully within a single existing vma. */ - if (dst_vma->vm_flags & VM_SHARED) - goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; + + vm_shared = dst_vma->vm_flags & VM_SHARED; } if (WARN_ON(dst_addr & (vma_hpagesize - 1) || @@ -225,11 +227,13 @@ retry: goto out_unlock; /* - * Ensure the dst_vma has a anon_vma. + * If not shared, ensure the dst_vma has a anon_vma. */ err = -ENOMEM; - if (unlikely(anon_vma_prepare(dst_vma))) - goto out_unlock; + if (!vm_shared) { + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + } h = hstate_vma(dst_vma); @@ -266,6 +270,7 @@ retry: dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + vm_alloc_shared = vm_shared; cond_resched(); @@ -305,18 +310,49 @@ out: if (page) { /* * We encountered an error and are about to free a newly - * allocated huge page. 
It is possible that there was a - * reservation associated with the page that has been - * consumed. See the routine restore_reserve_on_error - * for details. Unfortunately, we can not call - * restore_reserve_on_error now as it would require holding - * mmap_sem. Clear the PagePrivate flag so that the global + * allocated huge page. + * + * Reservation handling is very subtle, and is different for + * private and shared mappings. See the routine + * restore_reserve_on_error for details. Unfortunately, we + * can not call restore_reserve_on_error now as it would + * require holding mmap_sem. + * + * If a reservation for the page existed in the reservation + * map of a private mapping, the map was modified to indicate + * the reservation was consumed when the page was allocated. + * We clear the PagePrivate flag now so that the global * reserve count will not be incremented in free_huge_page. * The reservation map will still indicate the reservation * was consumed and possibly prevent later page allocation. - * This is better than leaking a global reservation. + * This is better than leaking a global reservation. If no + * reservation existed, it is still safe to clear PagePrivate + * as no adjustments to reservation counts were made during + * allocation. + * + * The reservation map for shared mappings indicates which + * pages have reservations. When a huge page is allocated + * for an address with a reservation, no change is made to + * the reserve map. In this case PagePrivate will be set + * to indicate that the global reservation count should be + * incremented when the page is freed. This is the desired + * behavior. However, when a huge page is allocated for an + * address without a reservation a reservation entry is added + * to the reservation map, and PagePrivate will not be set. + * When the page is freed, the global reserve count will NOT + * be incremented and it will appear as though we have leaked + * reserved page. In this case, set PagePrivate so that the + * global reserve count will be incremented to match the + * reservation map entry which was created. + * + * Note that vm_alloc_shared is based on the flags of the vma + * for which the page was originally allocated. dst_vma could + * be different or NULL on error. */ - ClearPagePrivate(page); + if (vm_alloc_shared) + SetPagePrivate(page); + else + ClearPagePrivate(page); put_page(page); } BUG_ON(copied < 0); @@ -372,8 +408,14 @@ retry: dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma) goto out_unlock; - if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED) + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) goto out_unlock; + if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; -- cgit From 419624daf0e827452837177c4b983dc0f1b6429f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:46 -0800 Subject: userfaultfd: shmem: add userfaultfd_shmem test The test verifies that anonymous shared mapping can be used with userfault using the existing testing method. The shared memory area is allocated using mmap(..., MAP_SHARED | MAP_ANONYMOUS, ...) and released using madvise(MADV_REMOVE) Link: http://lkml.kernel.org/r/20161216144821.5183-35-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 4 ++++ tools/testing/selftests/vm/run_vmtests | 11 ++++++++++ tools/testing/selftests/vm/userfaultfd.c | 37 ++++++++++++++++++++++++++++++-- 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 0114aac78e1f..900dfaf81051 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -11,6 +11,7 @@ BINARIES += thuge-gen BINARIES += transhuge-stress BINARIES += userfaultfd BINARIES += userfaultfd_hugetlb +BINARIES += userfaultfd_shmem BINARIES += mlock-random-test all: $(BINARIES) @@ -22,6 +23,9 @@ userfaultfd: userfaultfd.c ../../../../usr/include/linux/kernel.h userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread +userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h + $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread + mlock-random-test: mlock-random-test.c $(CC) $(CFLAGS) -o $@ $< -lcap diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 14d697ee00c4..c92f6cf31d0a 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -116,6 +116,17 @@ else fi rm -f $mnt/ufd_test_file +echo "----------------------------" +echo "running userfaultfd_shmem" +echo "----------------------------" +./userfaultfd_shmem 128 32 +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + #cleanup umount $mnt rm -rf $mnt diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d753a9161411..a5e5808c86cd 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -101,8 +101,9 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) -#ifndef HUGETLB_TEST +#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) +/* Anonymous memory */ #define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ (1 << _UFFDIO_COPY) | \ (1 << _UFFDIO_ZEROPAGE)) @@ -127,10 +128,13 @@ static void allocate_area(void **alloc_area) } } -#else /* HUGETLB_TEST */ +#else /* HUGETLB_TEST or SHMEM_TEST */ #define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC +#ifdef HUGETLB_TEST + +/* HugeTLB memory */ static int release_pages(char *rel_area) { int ret = 0; @@ -162,8 +166,37 @@ static void allocate_area(void **alloc_area) huge_fd_off0 = *alloc_area; } +#elif defined(SHMEM_TEST) + +/* Shared memory */ +static int release_pages(char *rel_area) +{ + int ret = 0; + + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) { + perror("madvise"); + ret = 1; + } + + return ret; +} + +static void allocate_area(void **alloc_area) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (*alloc_area == MAP_FAILED) { + fprintf(stderr, "shared memory mmap failed\n"); + *alloc_area = NULL; + } +} + +#else /* SHMEM_TEST */ +#error "Undefined test type" #endif /* HUGETLB_TEST */ +#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */ + static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; -- cgit From 9cc90c664a65f9b6b9f3ce1c719f1308539427bd Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:49 -0800 Subject: userfaultfd: shmem: lock the page 
before adding it to pagecache A VM_BUG_ON triggered on the shmem selftest. Link: http://lkml.kernel.org/r/20161216144821.5183-36-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index ab6644194fee..4e5e7a57e5b4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2245,6 +2245,10 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; } + VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; @@ -2294,6 +2298,7 @@ out_release_uncharge_unlock: out_release_uncharge: mem_cgroup_cancel_charge(page, memcg, false); out_release: + unlock_page(page); put_page(page); out_dec_used_blocks: if (sbinfo->max_blocks) -- cgit From a425d3584e7e69587aa441e91c7ffce7f47004d7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:52 -0800 Subject: userfaultfd: shmem: avoid a lockup resulting from corrupted page->flags Use the non atomic version of __SetPageUptodate while the page is still private and not visible to lookup operations. Using the non atomic version after the page is already visible to lookups is unsafe as there would be concurrent lock_page operation modifying the page->flags while it runs. This solves a lockup in find_lock_entry with the userfaultfd_shmem selftest. userfaultfd_shm D14296 691 1 0x00000004 Call Trace: schedule+0x3d/0x90 schedule_timeout+0x228/0x420 io_schedule_timeout+0xa4/0x110 __lock_page+0x12d/0x170 find_lock_entry+0xa4/0x190 shmem_getpage_gfp+0xb9/0xc30 shmem_fault+0x70/0x1c0 __do_fault+0x21/0x150 handle_mm_fault+0xec9/0x1490 __do_page_fault+0x20d/0x520 trace_do_page_fault+0x61/0x270 do_async_page_fault+0x19/0x80 async_page_fault+0x25/0x30 Link: http://lkml.kernel.org/r/20170116180408.12184-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Mike Rapoport Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4e5e7a57e5b4..8d7d80cf8708 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2248,6 +2248,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); __SetPageLocked(page); __SetPageSwapBacked(page); + __SetPageUptodate(page); ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); if (ret) @@ -2272,8 +2273,6 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; - __SetPageUptodate(page); - lru_cache_add_anon(page); spin_lock(&info->lock); -- cgit From cb658a453b9327ce96ce5222c24d162b5b65b564 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:55 -0800 Subject: userfaultfd: shmem: avoid leaking blocks and used blocks in UFFDIO_COPY If the atomic copy_user fails because of a real dangling userland pointer, we won't go back into the shmem method, so when the method returns it must not leave anything charged up, except the page itself. Link: http://lkml.kernel.org/r/20161216144821.5183-37-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8d7d80cf8708..9c6d22ff44e2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2214,17 +2214,17 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte, *dst_pte; int ret; - if (!*pagep) { - ret = -ENOMEM; - if (shmem_acct_block(info->flags, 1)) - goto out; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) - goto out_unacct_blocks; - percpu_counter_inc(&sbinfo->used_blocks); - } + ret = -ENOMEM; + if (shmem_acct_block(info->flags, 1)) + goto out; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) + goto out_unacct_blocks; + percpu_counter_inc(&sbinfo->used_blocks); + } + if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_dec_used_blocks; @@ -2237,6 +2237,9 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { *pagep = page; + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); + shmem_unacct_blocks(info->flags, 1); /* don't free the page */ return -EFAULT; } -- cgit From 47dd924508f5fb10480afc69de04539fa3d14034 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:58 -0800 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_SHMEM Userland developers asked to be notified immediately by the UFFDIO_API ioctl if shmem missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a shmem virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-38-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 10631a4cdb24..9ac4b68c54d1 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,7 +21,8 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_MADVDONTNEED | \ - UFFD_FEATURE_MISSING_HUGETLBFS) + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -146,12 +147,17 @@ struct uffdio_api { * it, so userland can later check if the feature flag is * present in uffdio_api.features after UFFDIO_API * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). 
*/ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; __u64 ioctls; -- cgit From 6228b8f2d15bc9a9b76d6b209a8b760a642fa996 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:44:01 -0800 Subject: userfaultfd: non-cooperative: selftest: introduce userfaultfd_open userfaultfd_open will be needed by the non cooperative selftest. Link: http://lkml.kernel.org/r/20161216144821.5183-39-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 41 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index a5e5808c86cd..75540e770b82 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -81,7 +81,7 @@ static int huge_fd; static char *huge_fd_off0; #endif static unsigned long long *count_verify; -static int uffd, finished, *pipefd; +static int uffd, uffd_flags, finished, *pipefd; static char *area_src, *area_dst; static char *zeropage; pthread_attr_t attr; @@ -512,23 +512,9 @@ static int stress(unsigned long *userfaults) return 0; } -static int userfaultfd_stress(void) +static int userfaultfd_open(void) { - void *area; - char *tmp_area; - unsigned long nr; - struct uffdio_register uffdio_register; struct uffdio_api uffdio_api; - unsigned long cpu; - int uffd_flags, err; - unsigned long userfaults[nr_cpus]; - - allocate_area((void **)&area_src); - if (!area_src) - return 1; - allocate_area((void **)&area_dst); - if (!area_dst) - return 1; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); if (uffd < 0) { @@ -549,6 +535,29 @@ static int userfaultfd_stress(void) return 1; } + return 0; +} + +static int userfaultfd_stress(void) +{ + void *area; + char *tmp_area; + unsigned long nr; + struct uffdio_register uffdio_register; + unsigned long cpu; + int err; + unsigned long userfaults[nr_cpus]; + + allocate_area((void **)&area_src); + if (!area_src) + return 1; + allocate_area((void **)&area_dst); + if (!area_dst) + return 1; + + if (userfaultfd_open() < 0) + return 1; + count_verify = malloc(nr_pages * sizeof(unsigned long long)); if (!count_verify) { perror("count_verify"); -- cgit From aa0d27217477acbc1cfcc4fdaa4de4f3ce545b4e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:44:04 -0800 Subject: userfaultfd: non-cooperative: selftest: add ufd parameter to copy_page With future addition of event tests, copy_page will be called with different userfault file descriptors Link: http://lkml.kernel.org/r/20161216144821.5183-40-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 75540e770b82..c79c372db2da 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -317,7 +317,7 @@ static void *locking_thread(void *arg) return NULL; } -static int copy_page(unsigned long offset) +static int copy_page(int ufd, unsigned long offset) { struct uffdio_copy uffdio_copy; @@ -329,7 +329,7 @@ static int copy_page(unsigned long offset) uffdio_copy.len = page_size; uffdio_copy.mode = 0; uffdio_copy.copy = 0; - if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ if (uffdio_copy.copy != -EEXIST) fprintf(stderr, "UFFDIO_COPY error %Ld\n", @@ -386,7 +386,7 @@ static void *uffd_poll_thread(void *arg) offset = (char *)(unsigned long)msg.arg.pagefault.address - area_dst; offset &= ~(page_size-1); - if (copy_page(offset)) + if (copy_page(uffd, offset)) userfaults++; } return (void *)userfaults; @@ -424,7 +424,7 @@ static void *uffd_read_thread(void *arg) offset = (char *)(unsigned long)msg.arg.pagefault.address - area_dst; offset &= ~(page_size-1); - if (copy_page(offset)) + if (copy_page(uffd, offset)) (*this_cpu_userfaults)++; } return (void *)NULL; @@ -438,7 +438,7 @@ static void *background_thread(void *arg) for (page_nr = cpu * nr_pages_per_cpu; page_nr < (cpu+1) * nr_pages_per_cpu; page_nr++) - copy_page(page_nr * page_size); + copy_page(uffd, page_nr * page_size); return NULL; } -- cgit From da5502c0a39b7ba28f403a4b87d1dd690e7829bf Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:44:06 -0800 Subject: userfaultfd: non-cooperative: selftest: add test for FORK, MADVDONTNEED and REMAP events Add test for userfaultfd events used in non-cooperative scenario when the process that monitors the userfaultfd and handles user faults is not the same process that causes the page faults. Link: http://lkml.kernel.org/r/20161216144821.5183-41-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 175 ++++++++++++++++++++++++++++--- 1 file changed, 163 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index c79c372db2da..71b4d820b011 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ static void *uffd_poll_thread(void *arg) unsigned long cpu = (unsigned long) arg; struct pollfd pollfd[2]; struct uffd_msg msg; + struct uffdio_register uffd_reg; int ret; unsigned long offset; char tmp_chr; @@ -378,16 +380,35 @@ static void *uffd_poll_thread(void *arg) continue; perror("nonblocking read error"), exit(1); } - if (msg.event != UFFD_EVENT_PAGEFAULT) + switch (msg.event) { + default: fprintf(stderr, "unexpected msg event %u\n", msg.event), exit(1); - if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg.arg.pagefault.address - - area_dst; - offset &= ~(page_size-1); - if (copy_page(uffd, offset)) - userfaults++; + break; + case UFFD_EVENT_PAGEFAULT: + if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)(unsigned long)msg.arg.pagefault.address - + area_dst; + offset &= ~(page_size-1); + if (copy_page(uffd, offset)) + userfaults++; + break; + case UFFD_EVENT_FORK: + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_MADVDONTNEED: + uffd_reg.range.start = msg.arg.madv_dn.start; + uffd_reg.range.len = msg.arg.madv_dn.end - + msg.arg.madv_dn.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + fprintf(stderr, "madv_dn failure\n"), exit(1); + break; + case UFFD_EVENT_REMAP: + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } } return (void *)userfaults; } @@ -512,7 +533,7 @@ static int stress(unsigned long *userfaults) return 0; } -static int userfaultfd_open(void) +static int userfaultfd_open(int features) { struct uffdio_api uffdio_api; @@ -525,7 +546,7 @@ static int userfaultfd_open(void) uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; - uffdio_api.features = 0; + uffdio_api.features = features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { fprintf(stderr, "UFFDIO_API\n"); return 1; @@ -538,6 +559,132 @@ static int userfaultfd_open(void) return 0; } +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event only for + * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked + * for hugetlb and shmem. 
+ */ +static int faulting_process(void) +{ + unsigned long nr; + unsigned long long count; + +#ifndef HUGETLB_TEST + unsigned long split_nr_pages = (nr_pages + 1) / 2; +#else + unsigned long split_nr_pages = nr_pages; +#endif + + for (nr = 0; nr < split_nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + fprintf(stderr, + "nr %lu memory corruption %Lu %Lu\n", + nr, count, + count_verify[nr]), exit(1); + } + } + +#ifndef HUGETLB_TEST + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + perror("mremap"), exit(1); + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + fprintf(stderr, + "nr %lu memory corruption %Lu %Lu\n", + nr, count, + count_verify[nr]), exit(1); + } + } + +#ifndef SHMEM_TEST + if (release_pages(area_dst)) + return 1; + + for (nr = 0; nr < nr_pages; nr++) { + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + fprintf(stderr, "nr %lu is not zero\n", nr), exit(1); + } +#endif /* SHMEM_TEST */ + +#endif /* HUGETLB_TEST */ + + return 0; +} + +static int userfaultfd_events_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + unsigned long userfaults; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + + printf("testing events (fork, remap, madv_dn): "); + fflush(stdout); + + if (release_pages(area_dst)) + return 1; + + features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | + UFFD_FEATURE_EVENT_MADVDONTNEED; + if (userfaultfd_open(features) < 0) + return 1; + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + fprintf(stderr, "register failure\n"), exit(1); + + expected_ioctls = EXPECTED_IOCTLS; + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"), + exit(1); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) + perror("uffd_poll_thread create"), exit(1); + + pid = fork(); + if (pid < 0) + perror("fork"), exit(1); + + if (!pid) + return faulting_process(); + + waitpid(pid, &err, 0); + if (err) + fprintf(stderr, "faulting process failed\n"), exit(1); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + perror("pipe write"), exit(1); + if (pthread_join(uffd_mon, (void **)&userfaults)) + return 1; + + close(uffd); + printf("userfaults: %ld\n", userfaults); + + return userfaults != nr_pages; +} + static int userfaultfd_stress(void) { void *area; @@ -555,7 +702,7 @@ static int userfaultfd_stress(void) if (!area_dst) return 1; - if (userfaultfd_open() < 0) + if (userfaultfd_open(0) < 0) return 1; count_verify = malloc(nr_pages * sizeof(unsigned long long)); @@ -702,7 +849,11 @@ static int userfaultfd_stress(void) printf("\n"); } - return err; + if (err) + return err; + + close(uffd); + return userfaultfd_events_test(); } #ifndef HUGETLB_TEST -- cgit From 7a0c4cf85b856430af62a907dd65dfc51438d24f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:44:10 -0800 Subject: userfaultfd: selftest: test UFFDIO_ZEROPAGE on all memory types This will verify -EINVAL is returned with hugetlbfs/shmem and it'll do a functional test of UFFDIO_ZEROPAGE on anonymous memory. 
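Condensed, the behaviour being verified looks roughly like the sketch below (not the selftest itself; ufd, area and page_size are assumed to be the userfaultfd descriptor, the registered destination area and the page size):

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/*
 * Sketch only: ask for one zero page at the start of the registered
 * area.  On anonymous memory the ioctl succeeds and reports page_size
 * in zeropage.zeropage; on shmem and hugetlbfs ranges, where the ioctl
 * is not offered, the reported result is expected to be -EINVAL.
 */
static int try_zeropage(int ufd, void *area, unsigned long page_size)
{
	struct uffdio_zeropage zp;

	zp.range.start = (unsigned long) area;
	zp.range.len = page_size;
	zp.mode = 0;
	if (ioctl(ufd, UFFDIO_ZEROPAGE, &zp))
		return zp.zeropage == -EINVAL ? 0 : -1;	/* unsupported range */
	return zp.zeropage == (__s64) page_size ? 1 : -1; /* one page zeroed */
}
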
Link: http://lkml.kernel.org/r/20161216144821.5183-42-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 82 +++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 71b4d820b011..5a840a605a16 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -625,6 +625,86 @@ static int faulting_process(void) return 0; } +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + struct uffdio_zeropage uffdio_zeropage; + int ret; + unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE); + + if (offset >= nr_pages * page_size) + fprintf(stderr, "unexpected offset %lu\n", + offset), exit(1); + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) { + if (uffdio_zeropage.zeropage == -EEXIST) + fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"), + exit(1); + else + fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } else { + if (uffdio_zeropage.zeropage != -EINVAL) + fprintf(stderr, + "UFFDIO_ZEROPAGE not -EINVAL %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } + } else if (has_zeropage) { + if (uffdio_zeropage.zeropage != page_size) { + fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } else + return 1; + } else { + fprintf(stderr, + "UFFDIO_ZEROPAGE succeeded %Ld\n", + uffdio_zeropage.zeropage), exit(1); + } + + return 0; +} + +/* exercise UFFDIO_ZEROPAGE */ +static int userfaultfd_zeropage_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + + printf("testing UFFDIO_ZEROPAGE: "); + fflush(stdout); + + if (release_pages(area_dst)) + return 1; + + if (userfaultfd_open(0) < 0) + return 1; + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + fprintf(stderr, "register failure\n"), exit(1); + + expected_ioctls = EXPECTED_IOCTLS; + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"), + exit(1); + + if (uffdio_zeropage(uffd, 0)) { + if (my_bcmp(area_dst, zeropage, page_size)) + fprintf(stderr, "zeropage is not zero\n"), exit(1); + } + + close(uffd); + printf("done.\n"); + return 0; +} + static int userfaultfd_events_test(void) { struct uffdio_register uffdio_register; @@ -853,7 +933,7 @@ static int userfaultfd_stress(void) return err; close(uffd); - return userfaultfd_events_test(); + return userfaultfd_zeropage_test() || userfaultfd_events_test(); } #ifndef HUGETLB_TEST -- cgit From 175ad4f1e7a29c8f914254e2e6316c50671e027a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:44:12 -0800 Subject: mm: mprotect: use pmd_trans_unstable instead of taking the pmd_lock pmd_trans_unstable does an atomic read on the pmd so it doesn't require the pmd_lock for the same check. 
This also removes the special assumption that the mmap_sem is hold for writing if prot_numa is not set. userfaultfd will hold the mmap_sem only for reading in change_pte_range like prot_numa, but it will not set prot_numa. This is always a valid micro-optimization regardless of userfaultfd. [kirill@shutemov.name: drop unneeded pmd_trans_unstable(pmd) check after __split_huge_pmd()] Link: http://lkml.kernel.org/r/20170208120421.GE5578@node.shutemov.name Link: http://lkml.kernel.org/r/20161216144821.5183-43-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 46 +++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index f9c07f54dd62..a45b4dc6a7f5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -33,34 +33,6 @@ #include "internal.h" -/* - * For a prot_numa update we only hold mmap_sem for read so there is a - * potential race with faulting where a pmd was temporarily none. This - * function checks for a transhuge pmd under the appropriate lock. It - * returns a pte if it was successfully locked or NULL if it raced with - * a transhuge insertion. - */ -static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, int prot_numa, spinlock_t **ptl) -{ - pte_t *pte; - spinlock_t *pmdl; - - /* !prot_numa is protected by mmap_sem held for write */ - if (!prot_numa) - return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); - - pmdl = pmd_lock(vma->vm_mm, pmd); - if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { - spin_unlock(pmdl); - return NULL; - } - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); - spin_unlock(pmdl); - return pte; -} - static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -71,7 +43,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long pages = 0; int target_node = NUMA_NO_NODE; - pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + /* + * Can be called with only the mmap_sem for reading by + * prot_numa so we must check the pmd isn't constantly + * changing from under us from pmd_none to pmd_trans_huge + * and/or the other way around. + */ + if (pmd_trans_unstable(pmd)) + return 0; + + /* + * The pmd points to a regular pte so the pmd can't change + * from under us even if the mmap_sem is only hold for + * reading. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) return 0; @@ -177,8 +163,6 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); - if (pmd_trans_unstable(pmd)) - continue; } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); -- cgit From 30b9aed8cd576964bff71a6c5f022ca30ac4c3b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:15 -0800 Subject: mm, vmscan: remove unused mm_vmscan_memcg_isolate Patch series "vm, vmscan: enahance vmscan tracepoints", v2. While debugging [2] I've realized that there is some room for improvements in the tracepoints set we offer currently. 
I had hard times to make any conclusion from the existing ones. The resulting problem turned out to be active list aging [3] and we are missing at least two tracepoints to debug such a problem. Some existing tracepoints could export more information to see _why_ the reclaim progress cannot be made not only _how much_ we could reclaim. The later could be seen quite reasonably from the vmstat counters already. It can be argued that we are showing too many implementation details in those tracepoints but I consider them way too lowlevel already to be usable by any kernel independent userspace. I would be _really_ surprised if anything but debugging tools have used them. Any feedback is highly appreciated. [1] http://lkml.kernel.org/r/20161228153032.10821-1-mhocko@kernel.org [2] http://lkml.kernel.org/r/20161215225702.GA27944@boerne.fritz.box [3] http://lkml.kernel.org/r/20161223105157.GB23109@dhcp22.suse.cz This patch (of 8): The trace point is not used since 925b7673cce3 ("mm: make per-memcg LRU lists exclusive") so it can be removed. Link: http://lkml.kernel.org/r/20170104101942.4860-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c88fd0934e7e..39bad8921ca1 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -269,8 +269,7 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->retval) ); -DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, - +TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int classzone_idx, int order, unsigned long nr_requested, @@ -311,34 +310,6 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->file) ); -DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, - - TP_PROTO(int classzone_idx, - int order, - unsigned long nr_requested, - unsigned long nr_scanned, - unsigned long nr_taken, - isolate_mode_t isolate_mode, - int file), - - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) - -); - -DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, - - TP_PROTO(int classzone_idx, - int order, - unsigned long nr_requested, - unsigned long nr_scanned, - unsigned long nr_taken, - isolate_mode_t isolate_mode, - int file), - - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) - -); - TRACE_EVENT(mm_vmscan_writepage, TP_PROTO(struct page *page), -- cgit From 9d998b4f1e39abd69441d29a1ef3250514479267 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:18 -0800 Subject: mm, vmscan: add active list aging tracepoint Our reclaim process has several tracepoints to tell us more about how things are progressing. We are, however, missing a tracepoint to track active list aging. Introduce mm_vmscan_lru_shrink_active which reports the number of - nr_taken is number of isolated pages from the active list - nr_referenced pages which tells us that we are hitting referenced pages which are deactivated. If this is a large part of the reported nr_deactivated pages then we might be hitting into the active list too early because they might be still part of the working set. This might help to debug performance issues. - nr_active pages which tells us how many pages are kept on the active list - mostly exec file backed pages. 
A high number can indicate that we might be trashing on executables. [mhocko@suse.com: update] Link: http://lkml.kernel.org/r/20170104135244.GJ25453@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170104101942.4860-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Minchan Kim Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 36 ++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 18 ++++++++++++++---- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 39bad8921ca1..c295d8f1b67a 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -363,6 +363,42 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_lru_shrink_active, + + TP_PROTO(int nid, unsigned long nr_taken, + unsigned long nr_active, unsigned long nr_deactivated, + unsigned long nr_referenced, int priority, int file), + + TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_active) + __field(unsigned long, nr_deactivated) + __field(unsigned long, nr_referenced) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_active = nr_active; + __entry->nr_deactivated = nr_deactivated; + __entry->nr_referenced = nr_referenced; + __entry->priority = priority; + __entry->reclaim_flags = trace_shrink_flags(file); + ), + + TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", + __entry->nid, + __entry->nr_taken, + __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced, + __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 532a2a750952..a34bf51d68e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1855,9 +1855,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lru. 
*/ -static void move_active_pages_to_lru(struct lruvec *lruvec, +static unsigned move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, struct list_head *pages_to_free, enum lru_list lru) @@ -1866,6 +1868,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, unsigned long pgmoved = 0; struct page *page; int nr_pages; + int nr_moved = 0; while (!list_empty(list)) { page = lru_to_page(list); @@ -1891,11 +1894,15 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); + } else { + nr_moved += nr_pages; } } if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); + + return nr_moved; } static void shrink_active_list(unsigned long nr_to_scan, @@ -1911,7 +1918,8 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - unsigned long nr_rotated = 0; + unsigned nr_deactivate, nr_activate; + unsigned nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -1989,13 +1997,15 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); + trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, + nr_deactivate, nr_rotated, sc->priority, file); } /* -- cgit From 1265e3a69f1ea97357536773d48c92a409e01eaf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:21 -0800 Subject: mm, vmscan: show the number of skipped pages in mm_vmscan_lru_isolate mm_vmscan_lru_isolate shows the number of requested, scanned and taken pages. This is mostly OK but on 32b systems the number of scanned pages is quite misleading because it includes both the scanned and skipped pages. Moreover the skipped part is scaled based on the number of taken pages. Let's report the exact numbers without any additional logic and add the number of skipped pages. This should make the reported data much more easier to interpret. 
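For anyone consuming the event, reporting the exact numbers also makes the wasted effort directly computable: the pages that were looked at but only skipped are simply nr_skipped out of nr_scanned + nr_skipped. A minimal userspace sketch of that post-processing (not part of this series; only the field names are taken from the TP_printk format in the patch, the parsing helper itself is hypothetical):

	#include <stdio.h>

	/*
	 * Parse the key=value payload of one mm_vmscan_lru_isolate line and
	 * report how much of the scanning effort went into skipped pages.
	 */
	static void report_skip_ratio(const char *payload)
	{
		unsigned long requested, scanned, skipped, taken;

		if (sscanf(payload,
			   "isolate_mode=%*d classzone=%*d order=%*d "
			   "nr_requested=%lu nr_scanned=%lu nr_skipped=%lu "
			   "nr_taken=%lu",
			   &requested, &scanned, &skipped, &taken) != 4)
			return;

		printf("taken %lu of %lu requested, skipped %.1f%% of pages looked at\n",
		       taken, requested,
		       (scanned + skipped) ?
				100.0 * skipped / (scanned + skipped) : 0.0);
	}
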
Link: http://lkml.kernel.org/r/20170104101942.4860-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 8 ++++++-- mm/vmscan.c | 13 +++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c295d8f1b67a..340b3c4ef4a8 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -274,17 +274,19 @@ TRACE_EVENT(mm_vmscan_lru_isolate, int order, unsigned long nr_requested, unsigned long nr_scanned, + unsigned long nr_skipped, unsigned long nr_taken, isolate_mode_t isolate_mode, int file), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, file), TP_STRUCT__entry( __field(int, classzone_idx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) + __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(isolate_mode_t, isolate_mode) __field(int, file) @@ -295,17 +297,19 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; + __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->isolate_mode = isolate_mode; __entry->file = file; ), - TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", + TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu file=%d", __entry->isolate_mode, __entry->classzone_idx, __entry->order, __entry->nr_requested, __entry->nr_scanned, + __entry->nr_skipped, __entry->nr_taken, __entry->file) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index a34bf51d68e2..d2fc97c6f72c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1437,6 +1437,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long skipped = 0, total_skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); @@ -1488,14 +1489,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ if (!list_empty(&pages_skipped)) { int zid; - unsigned long total_skipped = 0; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); - total_skipped += nr_skipped[zid]; + skipped += nr_skipped[zid]; } /* @@ -1503,13 +1503,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * close to unreclaimable. If the LRU list is empty, account * skipped pages as a full scan. */ - scan += list_empty(src) ? total_skipped : total_skipped >> 2; + total_skipped = list_empty(src) ? 
skipped : skipped >> 2; list_splice(&pages_skipped, src); } - *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, - nr_taken, mode, is_file_lru(lru)); + *nr_scanned = scan + total_skipped; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, + scan, skipped, nr_taken, mode, + is_file_lru(lru)); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } -- cgit From 32b3f2974adca13f8a4a610c396e88c6f81eb10e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:24 -0800 Subject: mm, vmscan: show LRU name in mm_vmscan_lru_isolate tracepoint mm_vmscan_lru_isolate currently prints only whether the LRU we isolate from is file or anonymous but we do not know which LRU this is. It is useful to know whether the list is active or inactive, since we are using the same function to isolate pages from both of them and it's hard to distinguish otherwise. Link: http://lkml.kernel.org/r/20170104101942.4860-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Minchan Kim Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmflags.h | 8 ++++++++ include/trace/events/vmscan.h | 12 ++++++------ mm/vmscan.c | 3 +-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 91554faed17e..304ff94363b2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -239,6 +239,13 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ EMe(ZONE_MOVABLE,"Movable") +#define LRU_NAMES \ + EM (LRU_INACTIVE_ANON, "inactive_anon") \ + EM (LRU_ACTIVE_ANON, "active_anon") \ + EM (LRU_INACTIVE_FILE, "inactive_file") \ + EM (LRU_ACTIVE_FILE, "active_file") \ + EMe(LRU_UNEVICTABLE, "unevictable") + /* * First define the enums in the above macros to be exported to userspace * via TRACE_DEFINE_ENUM(). 
@@ -252,6 +259,7 @@ COMPACTION_STATUS COMPACTION_PRIORITY COMPACTION_FEEDBACK ZONE_TYPE +LRU_NAMES /* * Now redefine the EM() and EMe() macros to map the enums to the strings diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 340b3c4ef4a8..4af0bf70e07e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -277,9 +277,9 @@ TRACE_EVENT(mm_vmscan_lru_isolate, unsigned long nr_skipped, unsigned long nr_taken, isolate_mode_t isolate_mode, - int file), + int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, file), + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( __field(int, classzone_idx) @@ -289,7 +289,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(isolate_mode_t, isolate_mode) - __field(int, file) + __field(int, lru) ), TP_fast_assign( @@ -300,10 +300,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->isolate_mode = isolate_mode; - __entry->file = file; + __entry->lru = lru; ), - TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu file=%d", + TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, __entry->classzone_idx, __entry->order, @@ -311,7 +311,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->nr_scanned, __entry->nr_skipped, __entry->nr_taken, - __entry->file) + __print_symbolic(__entry->lru, LRU_NAMES)) ); TRACE_EVENT(mm_vmscan_writepage, diff --git a/mm/vmscan.c b/mm/vmscan.c index d2fc97c6f72c..daacfc1be867 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1509,8 +1509,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } *nr_scanned = scan + total_skipped; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - scan, skipped, nr_taken, mode, - is_file_lru(lru)); + scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } -- cgit From 3c710c1ad11b4a856a396b181911568f3851a5d8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:27 -0800 Subject: mm, vmscan: extract shrink_page_list reclaim counters into a struct shrink_page_list returns quite some counters back to its caller. Extract the existing 5 into struct reclaim_stat because this makes the code easier to follow and also allows further counters to be returned. While we are at it, make all of them unsigned rather than unsigned long as we do not really need full 64b for them (we never scan more than SWAP_CLUSTER_MAX pages at once). This should reduce some stack space. This patch shouldn't introduce any functional change. 
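The shape of the new calling convention is easy to see in isolation. A self-contained sketch (illustrative only; the struct mirrors the one added below and the scanning work is elided):

	struct reclaim_stat {
		unsigned nr_dirty;
		unsigned nr_unqueued_dirty;
		unsigned nr_congested;
		unsigned nr_writeback;
		unsigned nr_immediate;
	};

	static unsigned long reclaim_sketch(struct reclaim_stat *stat)
	{
		unsigned nr_dirty = 0, nr_writeback = 0;
		unsigned long nr_reclaimed = 0;

		/* ... the page scanning loop only bumps the local counters ... */

		if (stat) {		/* stats are an optional out-parameter */
			stat->nr_dirty = nr_dirty;
			stat->nr_writeback = nr_writeback;
		}
		return nr_reclaimed;
	}

Callers that do not need the breakdown, such as reclaim_clean_pages_from_list(), can then pass NULL instead of declaring a set of dummy locals.
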
Link: http://lkml.kernel.org/r/20170104101942.4860-6-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 61 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index daacfc1be867..3e5f33b78daf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -912,6 +912,14 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +struct reclaim_stat { + unsigned nr_dirty; + unsigned nr_unqueued_dirty; + unsigned nr_congested; + unsigned nr_writeback; + unsigned nr_immediate; +}; + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -919,22 +927,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, - unsigned long *ret_nr_unqueued_dirty, - unsigned long *ret_nr_congested, - unsigned long *ret_nr_writeback, - unsigned long *ret_nr_immediate, + struct reclaim_stat *stat, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_reclaimed = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; + unsigned nr_unqueued_dirty = 0; + unsigned nr_dirty = 0; + unsigned nr_congested = 0; + unsigned nr_reclaimed = 0; + unsigned nr_writeback = 0; + unsigned nr_immediate = 0; cond_resched(); @@ -1276,11 +1280,13 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - *ret_nr_dirty += nr_dirty; - *ret_nr_congested += nr_congested; - *ret_nr_unqueued_dirty += nr_unqueued_dirty; - *ret_nr_writeback += nr_writeback; - *ret_nr_immediate += nr_immediate; + if (stat) { + stat->nr_dirty = nr_dirty; + stat->nr_congested = nr_congested; + stat->nr_unqueued_dirty = nr_unqueued_dirty; + stat->nr_writeback = nr_writeback; + stat->nr_immediate = nr_immediate; + } return nr_reclaimed; } @@ -1292,7 +1298,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + unsigned long ret; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1305,8 +1311,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1705,11 +1710,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; + struct reclaim_stat stat = {}; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -1754,9 +1755,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, - &nr_dirty, 
&nr_unqueued_dirty, &nr_congested, - &nr_writeback, &nr_immediate, - false); + &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1790,7 +1789,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * of pages under pages flagged for immediate reclaim and stall if any * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback == nr_taken) + if (stat.nr_writeback && stat.nr_writeback == nr_taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* @@ -1802,7 +1801,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. */ - if (nr_dirty && nr_dirty == nr_congested) + if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) set_bit(PGDAT_CONGESTED, &pgdat->flags); /* @@ -1811,7 +1810,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * the pgdat PGDAT_DIRTY and kswapd will start writing pages from * reclaim context. */ - if (nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); /* @@ -1820,7 +1819,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_immediate && current_may_throttle()) + if (stat.nr_immediate && current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } -- cgit From 5bccd16657e893e52e96547e7c2b5729d78d4e45 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:30 -0800 Subject: mm, vmscan: enhance mm_vmscan_lru_shrink_inactive tracepoint mm_vmscan_lru_shrink_inactive will currently report the number of scanned and reclaimed pages. This doesn't give us an idea how the reclaim went except for the overall effectiveness though. Export and show other counters which will tell us why we couldn't reclaim some pages. - nr_dirty, nr_writeback, nr_congested and nr_immediate tells us how many pages are blocked due to IO - nr_activate tells us how many pages were moved to the active list - nr_ref_keep reports how many pages are kept on the LRU due to references (mostly for the file pages which are about to go for another round through the inactive list) - nr_unmap_fail - how many pages failed to unmap All these are rather low level so they might change in future but the tracepoint is already implementation specific so no tools should be depending on its stability. 
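With that caveat in mind, the counters are already enough for simple tooling. A tool-side sketch of folding one event into a rough verdict (hypothetical helper, arbitrary thresholds, shown only to illustrate how the new fields relate to each other):

	static const char *why_pages_were_kept(unsigned long nr_scanned,
					       unsigned long nr_reclaimed,
					       unsigned long nr_writeback,
					       unsigned long nr_dirty,
					       unsigned long nr_ref_keep,
					       unsigned long nr_unmap_fail)
	{
		unsigned long kept = nr_scanned > nr_reclaimed ?
				     nr_scanned - nr_reclaimed : 0;

		if (!kept)
			return "everything scanned was reclaimed";
		if (2 * nr_writeback >= kept)
			return "mostly pages under writeback";
		if (2 * nr_dirty >= kept)
			return "mostly dirty pages";
		if (2 * nr_ref_keep >= kept)
			return "mostly recently referenced pages";
		if (2 * nr_unmap_fail >= kept)
			return "mostly pages that failed to unmap";
		return "a mix of reasons";
	}
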
Link: http://lkml.kernel.org/r/20170104101942.4860-7-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 29 ++++++++++++++++++++++++++--- mm/vmscan.c | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 4af0bf70e07e..08c1cd5af0d6 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -340,14 +340,27 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, + unsigned long nr_dirty, unsigned long nr_writeback, + unsigned long nr_congested, unsigned long nr_immediate, + unsigned long nr_activate, unsigned long nr_ref_keep, + unsigned long nr_unmap_fail, int priority, int file), - TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback, + nr_congested, nr_immediate, nr_activate, nr_ref_keep, + nr_unmap_fail, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_congested) + __field(unsigned long, nr_immediate) + __field(unsigned long, nr_activate) + __field(unsigned long, nr_ref_keep) + __field(unsigned long, nr_unmap_fail) __field(int, priority) __field(int, reclaim_flags) ), @@ -356,14 +369,24 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_dirty = nr_dirty; + __entry->nr_writeback = nr_writeback; + __entry->nr_congested = nr_congested; + __entry->nr_immediate = nr_immediate; + __entry->nr_activate = nr_activate; + __entry->nr_ref_keep = nr_ref_keep; + __entry->nr_unmap_fail = nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, - __entry->priority, + __entry->nr_dirty, __entry->nr_writeback, + __entry->nr_congested, __entry->nr_immediate, + __entry->nr_activate, __entry->nr_ref_keep, + __entry->nr_unmap_fail, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3e5f33b78daf..8cc90bd8149d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -918,6 +918,9 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; }; /* @@ -939,6 +942,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned nr_reclaimed = 0; unsigned nr_writeback = 0; unsigned nr_immediate = 0; + unsigned nr_ref_keep = 0; + unsigned nr_unmap_fail = 0; cond_resched(); @@ -1077,6 +1082,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: + nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1114,6 +1120,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (ttu_flags | 
TTU_BATCH_FLUSH | TTU_LZFREE) : (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: + nr_unmap_fail++; goto activate_locked; case SWAP_AGAIN: goto keep_locked; @@ -1286,6 +1293,9 @@ keep: stat->nr_unqueued_dirty = nr_unqueued_dirty; stat->nr_writeback = nr_writeback; stat->nr_immediate = nr_immediate; + stat->nr_activate = pgactivate; + stat->nr_ref_keep = nr_ref_keep; + stat->nr_unmap_fail = nr_unmap_fail; } return nr_reclaimed; } @@ -1834,6 +1844,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, + stat.nr_dirty, stat.nr_writeback, + stat.nr_congested, stat.nr_immediate, + stat.nr_activate, stat.nr_ref_keep, + stat.nr_unmap_fail, sc->priority, file); return nr_reclaimed; } -- cgit From dcec0b60a8213aeb876823a15d834009fce3b36e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:33 -0800 Subject: mm, vmscan: add mm_vmscan_inactive_list_is_low tracepoint Currently we have tracepoints for both active and inactive LRU lists reclaim but we do not have any which would tell us why we we decided to age the active list. Without that it is quite hard to diagnose active/inactive lists balancing. Add mm_vmscan_inactive_list_is_low tracepoint to tell us this information. Link: http://lkml.kernel.org/r/20170104101942.4860-8-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 40 ++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 23 ++++++++++++++--------- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 08c1cd5af0d6..27e8a5c77579 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -15,6 +15,7 @@ #define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u /* Unused, all reclaim async */ #define RECLAIM_WB_ASYNC 0x0008u +#define RECLAIM_WB_LRU (RECLAIM_WB_ANON|RECLAIM_WB_FILE) #define show_reclaim_flags(flags) \ (flags) ? 
__print_flags(flags, "|", \ @@ -426,6 +427,45 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_inactive_list_is_low, + + TP_PROTO(int nid, int reclaim_idx, + unsigned long total_inactive, unsigned long inactive, + unsigned long total_active, unsigned long active, + unsigned long ratio, int file), + + TP_ARGS(nid, reclaim_idx, total_inactive, inactive, total_active, active, ratio, file), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, reclaim_idx) + __field(unsigned long, total_inactive) + __field(unsigned long, inactive) + __field(unsigned long, total_active) + __field(unsigned long, active) + __field(unsigned long, ratio) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->reclaim_idx = reclaim_idx; + __entry->total_inactive = total_inactive; + __entry->inactive = inactive; + __entry->total_active = total_active; + __entry->active = active; + __entry->ratio = ratio; + __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU; + ), + + TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s", + __entry->nid, + __entry->reclaim_idx, + __entry->total_inactive, __entry->inactive, + __entry->total_active, __entry->active, + __entry->ratio, + show_reclaim_flags(__entry->reclaim_flags)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 8cc90bd8149d..de400d1eac0e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2048,11 +2048,11 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc) + struct scan_control *sc, bool trace) { unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; + unsigned long total_inactive, inactive; + unsigned long total_active, active; unsigned long gb; struct pglist_data *pgdat = lruvec_pgdat(lruvec); int zid; @@ -2064,8 +2064,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); + total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE); + total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); /* * For zone-constrained allocations, it is necessary to check if @@ -2092,6 +2092,11 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, else inactive_ratio = 1; + if (trace) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, + sc->reclaim_idx, + total_inactive, inactive, + total_active, active, inactive_ratio, file); return inactive * inactive_ratio < active; } @@ -2099,7 +2104,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2230,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. 
*/ - if (!inactive_list_is_low(lruvec, true, sc) && + if (!inactive_list_is_low(lruvec, true, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2455,7 +2460,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -3105,7 +3110,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); -- cgit From 93607e5a554a6408a1b56cdd146edc91e5da9983 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:44:36 -0800 Subject: trace-vmscan-postprocess: sync with tracepoints updates Both mm_vmscan_lru_shrink_active and mm_vmscan_lru_isolate have changed so the script needs to be update to reflect those changes Link: http://lkml.kernel.org/r/20170105151737.GU21618@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 8f961ef2b457..ba976805853a 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -112,8 +112,8 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; -my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) file=([0-9]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -205,15 +205,15 @@ $regex_wakeup_kswapd = generate_traceevent_regex( $regex_lru_isolate = generate_traceevent_regex( "vmscan/mm_vmscan_lru_isolate", $regex_lru_isolate_default, - "isolate_mode", "order", - "nr_requested", "nr_scanned", "nr_taken", - "file"); + "isolate_mode", "classzone_idx", "order", + "nr_requested", "nr_scanned", "nr_skipped", "nr_taken", + "lru"); $regex_lru_shrink_inactive = generate_traceevent_regex( 
"vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, - "nid", "zid", - "nr_scanned", "nr_reclaimed", "priority", - "flags"); + "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", + "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, @@ -381,8 +381,8 @@ EVENT_PROCESS: next; } my $isolate_mode = $1; - my $nr_scanned = $4; - my $file = $6; + my $nr_scanned = $5; + my $file = $8; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation @@ -391,7 +391,7 @@ EVENT_PROCESS: # isolate_both == 3 if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; - if ($file == 1) { + if ($file =~ /_file/) { $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; } else { $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; @@ -406,8 +406,8 @@ EVENT_PROCESS: next; } - my $nr_reclaimed = $4; - my $flags = $6; + my $nr_reclaimed = $3; + my $flags = $12; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; -- cgit From d94e0c05eba9ed627ce467a7c3e92b365269f905 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 22 Feb 2017 15:44:39 -0800 Subject: nfs: no PG_private waiters remain, remove waker Since commit 4f52b6bb8c57 ("NFS: Don't call COMMIT in ->releasepage()"), no tasks wait on PagePrivate. Thus the wake introduced in commit 9590544694be ("NFS: avoid deadlocks with loop-back mounted NFS filesystems.") can be removed. Link: http://lkml.kernel.org/r/20170103182234.30141-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Trond Myklebust Cc: Anna Schumaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b00d53d13d47..006068526542 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); - smp_mb__after_atomic(); - wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->nrequests--; -- cgit From 74d81bfae8e3f52e956367f6ed764db269b87091 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 22 Feb 2017 15:44:41 -0800 Subject: mm: un-export wake_up_page functions These are no longer used outside mm/filemap.c, so un-export them and make them static where possible. These were exported specifically for NFS use in commit a4796e37c12e ("MM: export page_wakeup functions"). Link: http://lkml.kernel.org/r/20170103182234.30141-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Trond Myklebust Cc: Anna Schumaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++---------- mm/filemap.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 324c8dbad1e1..b572f5530392 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -482,19 +482,11 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback, - * and for filesystems which need to wait on PG_private. 
+ * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., + * and should not be used directly. */ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); -extern void wake_up_page_bit(struct page *page, int bit_nr); - -static inline void wake_up_page(struct page *page, int bit) -{ - if (!PageWaiters(page)) - return; - wake_up_page_bit(page, bit); -} /* * Wait for a page to be unlocked. diff --git a/mm/filemap.c b/mm/filemap.c index 3f9afded581b..a3dbe9eff213 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -788,7 +788,7 @@ static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void return autoremove_wake_function(wait, mode, sync, key); } -void wake_up_page_bit(struct page *page, int bit_nr) +static void wake_up_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); struct wait_page_key key; @@ -821,7 +821,13 @@ void wake_up_page_bit(struct page *page, int bit_nr) } spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(wake_up_page_bit); + +static void wake_up_page(struct page *page, int bit) +{ + if (!PageWaiters(page)) + return; + wake_up_page_bit(page, bit); +} static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) -- cgit From 870667553af5e18b2f3402c03f8c4f82381f0ec7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Feb 2017 15:44:44 -0800 Subject: mm: fix filemap.c kernel-doc warnings Fix kernel-doc warnings in mm/filemap.c: mm/filemap.c:993: warning: No description found for parameter '__page' mm/filemap.c:993: warning: Excess function parameter 'page' description in '__lock_page' Link: http://lkml.kernel.org/r/a66fe492-518c-ad6c-5f03-5e8b721fb451@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index a3dbe9eff213..416d563468a3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1019,7 +1019,7 @@ EXPORT_SYMBOL_GPL(page_endio); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @page: the page to lock + * @__page: the page to lock */ void __lock_page(struct page *__page) { -- cgit From e57b9d8c5ad08ed8cbca7e51fe252719a2845394 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Feb 2017 15:44:47 -0800 Subject: mm/mmzone.c: swap likely to unlikely as code logic is different for next_zones_zonelist() Commit 682a3385e773 ("mm, page_alloc: inline the fast path of the zonelist iterator") changed how next_zones_zonelist() is called, by adding a static inline function to do the fast path. This function adds: if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); Where __next_zones_zonelist() is only called when nodes is not NULL or zonelist_zone_idx(z) is less than highest_zoneidx. The original next_zone_zonelist() was converted to __next_zones_zonelist() but it still maintained: if (likely(nodes == NULL)) Which is now actually a very unlikely, as it is only called with nodes equal to NULL when zonelist_zone_idx(z) is greater than highest_zoneidx. 
Before this commit, this if had this statistic: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 837895 446078 34 next_zones_zonelist mmzone.c 63 After this commit, it has: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 10 173840 99 __next_zones_zonelist mmzone.c 63 Thus, the if statement is now much more unlikely than it ever was as a likely. Link: http://lkml.kernel.org/r/20170105200102.77989567@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmzone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmzone.c b/mm/mmzone.c index 5652be858e5e..a51c0a67ea3d 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -60,7 +60,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, * Find the next suitable zone to use for the allocation. * Only filter based on nodemask if it's set */ - if (likely(nodes == NULL)) + if (unlikely(nodes == NULL)) while (zonelist_zone_idx(z) > highest_zoneidx) z++; else -- cgit From 7f354a548d1cb6bb01b6ee74aee9264aa152f1ec Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Feb 2017 15:44:50 -0800 Subject: mm, compaction: add vmstats for kcompactd work A "compact_daemon_wake" vmstat exists that represents the number of times kcompactd has woken up. This doesn't represent how much work it actually did, though. It's useful to understand how much compaction work is being done by kcompactd versus other methods such as direct compaction and explicitly triggered per-node (or system) compaction. This adds two new vmstats: "compact_daemon_migrate_scanned" and "compact_daemon_free_scanned" to represent the number of pages kcompactd has scanned as part of its migration scanner and freeing scanner, respectively. These values are still accounted for in the general "compact_migrate_scanned" and "compact_free_scanned" for compatibility. It could be argued that explicitly triggered compaction could also be tracked separately, and that could be added if others find it useful. 
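Because the daemon counters are still folded into the general ones, the split is cheap to consume from userspace. A minimal sketch that answers "how much of the scanning is kcompactd doing" straight from /proc/vmstat (illustrative; only the counter names come from this patch):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		unsigned long long total = 0, daemon = 0, val;
		char name[64];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %llu", name, &val) == 2) {
			if (!strcmp(name, "compact_migrate_scanned"))
				total = val;
			else if (!strcmp(name, "compact_daemon_migrate_scanned"))
				daemon = val;
		}
		fclose(f);

		if (total)
			printf("kcompactd accounts for %.1f%% of compact_migrate_scanned\n",
			       100.0 * daemon / total);
		return 0;
	}
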
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1612071749390.69852@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/compaction.c | 22 +++++++++++++++++++--- mm/internal.h | 2 ++ mm/vmstat.c | 2 ++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 4d6ec58a8d45..6aa1b6cb5828 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -56,6 +56,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, KCOMPACTD_WAKE, + KCOMPACTD_MIGRATE_SCANNED, KCOMPACTD_FREE_SCANNED, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/mm/compaction.c b/mm/compaction.c index 949198d01260..c6178bbd3e04 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -548,7 +548,7 @@ isolate_fail: if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); return total_isolated; @@ -931,7 +931,7 @@ isolate_fail: trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); - count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); + cc->total_migrate_scanned += nr_scanned; if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1631,6 +1631,9 @@ out: zone->compact_cached_free_pfn = free_pfn; } + count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); + count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); @@ -1645,6 +1648,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .order = order, .gfp_mask = gfp_mask, .zone = zone, @@ -1757,6 +1762,8 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -1883,6 +1890,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, @@ -1891,7 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); - count_vm_event(KCOMPACTD_WAKE); + count_compact_event(KCOMPACTD_WAKE); for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { int status; @@ -1909,6 +1918,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.nr_freepages = 0; cc.nr_migratepages = 0; + cc.total_migrate_scanned = 0; + cc.total_free_scanned = 0; cc.zone = zone; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1927,6 +1938,11 @@ static void kcompactd_do_work(pg_data_t *pgdat) defer_compaction(zone, cc.order); } + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + 
cc.total_free_scanned); + VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); } diff --git a/mm/internal.h b/mm/internal.h index bfad3b5d2665..9ad04fc6eefe 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,6 +175,8 @@ struct compact_control { struct list_head migratepages; /* List of pages being migrated */ unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long total_migrate_scanned; + unsigned long total_free_scanned; unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 7c28df36f50f..69f9aff39a2e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = { "compact_fail", "compact_success", "compact_daemon_wake", + "compact_daemon_migrate_scanned", + "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE -- cgit From b92df1de5d289c0b5d653e72414bf0850b8511e0 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 22 Feb 2017 15:44:53 -0800 Subject: mm: page_alloc: skip over regions of invalid pfns where possible When using a sparse memory model memmap_init_zone() when invoked with the MEMMAP_EARLY context will skip over pages which aren't valid - ie. which aren't in a populated region of the sparse memory map. However if the memory map is extremely sparse then it can spend a long time linearly checking each PFN in a large non-populated region of the memory map & skipping it in turn. When CONFIG_HAVE_MEMBLOCK_NODE_MAP is enabled, we have sufficient information to quickly discover the next valid PFN given an invalid one by searching through the list of memory regions & skipping forwards to the first PFN covered by the memory region to the right of the non-populated region. Implement this in order to speed up memmap_init_zone() for systems with extremely sparse memory maps. James said "I have tested this patch on a virtual model of a Samurai CPU with a sparse memory map. The kernel boot time drops from 109 to 62 seconds. 
" Link: http://lkml.kernel.org/r/20161125185518.29885-1-paul.burton@imgtec.com Signed-off-by: Paul Burton Tested-by: James Hartley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 + mm/memblock.c | 25 +++++++++++++++++++++++++ mm/page_alloc.c | 11 ++++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b759c9acf97..38bcf00cbed3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); +unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index 7608bc305936..a476d28e0733 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } +unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, + unsigned long max_pfn) +{ + struct memblock_type *type = &memblock.memory; + unsigned int right = type->cnt; + unsigned int mid, left = 0; + phys_addr_t addr = PFN_PHYS(pfn + 1); + + do { + mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else { + /* addr is within the region, so pfn + 1 is valid */ + return min(pfn + 1, max_pfn); + } + } while (left < right); + + return min(PHYS_PFN(type->regions[right].base), max_pfn); +} + /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05c0a59323bd..6da3169d3750 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5103,8 +5103,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. + */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit From 46acef048a6568ba490f636db8682a1461ed223c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 22 Feb 2017 15:44:55 -0800 Subject: mm,compaction: serialize waitqueue_active() checks Without a memory barrier, the following race can occur with a high-order allocation: wakeup_kcompactd(order == 1) kcompactd() [L] waitqueue_active(kcompactd_wait) [S] prepare_to_wait_event(kcompactd_wait) [L] (kcompactd_max_order == 0) [S] kcompactd_max_order = order; schedule() Where the waitqueue_active() check is speculatively re-ordered to before setting the actual condition (max_order), not seeing the threads that's going to block; making us miss a wakeup. There are a couple of options to fix this, including calling wq_has_sleepers() which adds a full barrier, or unconditionally doing the wake_up_interruptible() and serialize on the q->lock. However, to make use of the control dependency, we just need to add L->L guarantees. 
While this bug is theoretical, there have been other offenders of the lockless waitqueue_active() in the past -- this is also documented in the call itself. Link: http://lkml.kernel.org/r/1483975528-24342-1-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index c6178bbd3e04..0aa2757399ee 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1966,6 +1966,13 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; + /* + * Pairs with implicit barrier in wait_event_freezable() + * such that wakeups are not missed in the lockless + * waitqueue_active() call. + */ + smp_acquire__after_ctrl_dep(); + if (pgdat->kcompactd_classzone_idx > classzone_idx) pgdat->kcompactd_classzone_idx = classzone_idx; -- cgit From d3a9d7a3784e3a54c08dcd9eafaef7ccb383a894 Mon Sep 17 00:00:00 2001 From: Adygzhy Ondar Date: Wed, 22 Feb 2017 15:44:58 -0800 Subject: mm/bootmem.c: cosmetic improvement of code readability The obvious number of bits in a byte is replaced by BITS_PER_BYTE macro in bootmap_bytes() Link: http://lkml.kernel.org/r/1483781600-5136-1-git-send-email-ondar07@gmail.com Signed-off-by: Adygzhy Ondar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index e8a55a3c9feb..9fedb27c6451 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -53,7 +53,7 @@ early_param("bootmem_debug", bootmem_debug_setup); static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long bytes = DIV_ROUND_UP(pages, 8); + unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE); return ALIGN(bytes, sizeof(long)); } -- cgit From 399d8eebe768fe3ae648700547d04f2a8796a4da Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 22 Feb 2017 15:45:01 -0800 Subject: mm: fix some typos in mm/zsmalloc.c Delete extra semicolon, and fix some typos. Link: http://lkml.kernel.org/r/586F1823.4050107@huawei.com Signed-off-by: Xishi Qiu Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9cc3c0b2c2c1..a1f24989ac23 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -25,7 +25,7 @@ * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page - * PG_owner_priv_1: indentifies the huge component page + * PG_owner_priv_1: identifies the huge component page * */ @@ -364,7 +364,7 @@ static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { return kmem_cache_alloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -}; +} static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) { @@ -2383,7 +2383,7 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Iterate reversly, because, size of size_class that we want to use + * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. 
*/ for (i = zs_size_classes - 1; i >= 0; i--) { -- cgit From ef415ef41153b747031e8ad965cb3df4f22ff72e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 22 Feb 2017 15:45:04 -0800 Subject: mm/memblock.c: trivial code refine in memblock_is_region_memory() memblock_is_region_memory() invoke memblock_search() to see whether the base address is in the memory region. If it fails, idx would be -1. Then, it returns 0. If the memblock_search() returns a valid index, it means the base address is guaranteed to be in the range memblock.memory.regions[idx]. Because of this, it is not necessary to check the base again. This patch removes the check on "base". Link: http://lkml.kernel.org/r/1482363033-24754-2-git-send-email-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index a476d28e0733..90171d508b09 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1640,8 +1640,7 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size if (idx == -1) return 0; - return memblock.memory.regions[idx].base <= base && - (memblock.memory.regions[idx].base + + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size) >= end; } -- cgit From 7d41c03e2dc6a650f69655a42c0ddb72a7c121bf Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 22 Feb 2017 15:45:07 -0800 Subject: mm/memblock.c: check return value of memblock_reserve() in memblock_virt_alloc_internal() memblock_reserve() would add a new range to memblock.reserved in case the new range is not totally covered by any of the current memblock.reserved range. If the memblock.reserved is full and can't resize, memblock_reserve() would fail. This doesn't happen in real world now, I observed this during code review. While theoretically, it has the chance to happen. And if it happens, others would think this range of memory is still available and may corrupt the memory. This patch checks the return value and goto "done" after it succeeds. Link: http://lkml.kernel.org/r/1482363033-24754-3-git-send-email-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 90171d508b09..d105d6c81d53 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1299,18 +1299,17 @@ static void * __init memblock_virt_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, nid, flags); - if (alloc) + if (alloc && !memblock_reserve(alloc, size)) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, NUMA_NO_NODE, flags); - if (alloc) + if (alloc && !memblock_reserve(alloc, size)) goto done; } @@ -1328,7 +1327,6 @@ again: return NULL; done: - memblock_reserve(alloc, size); ptr = phys_to_virt(alloc); memset(ptr, 0, size); -- cgit From 857e522a007bfda34606ae252e2f61a6eff151ff Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 22 Feb 2017 15:45:10 -0800 Subject: mm/sparse: use page_private() to get page->private value free_map_bootmem() uses page->private directly to set removing_section_nr argument. But to get page->private value, page_private() has been prepared. 
So free_map_bootmem() should use page_private() instead of page->private. Link: http://lkml.kernel.org/r/1d34eaa5-a506-8b7a-6471-490c345deef8@gmail.com Signed-off-by: Yasuaki Ishimatsu Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 1e168bf2779a..dc30a70e1dce 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -667,7 +667,7 @@ static void free_map_bootmem(struct page *memmap) BUG_ON(magic == NODE_INFO); maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = page->private; + removing_section_nr = page_private(page); /* * When this function is called, the removing section is -- cgit From ddffe98d166f4a93d996d5aa628fd745311fc1e7 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 22 Feb 2017 15:45:13 -0800 Subject: mm/memory_hotplug: set magic number to page->freelist instead of page->lru.next To identify that pages of page table are allocated from bootmem allocator, magic number sets to page->lru.next. But page->lru list is initialized in reserve_bootmem_region(). So when calling free_pagetable(), the function cannot find the magic number of pages. And free_pagetable() frees the pages by free_reserved_page() not put_page_bootmem(). But if the pages are allocated from bootmem allocator and used as page table, the pages have private flag. So before freeing the pages, we should clear the private flag by put_page_bootmem(). Before applying the commit 7bfec6f47bb0 ("mm, page_alloc: check multiple page fields with a single branch"), we could find the following visible issue: BUG: Bad page state in process kworker/u1024:1 page:ffffea103cfd8040 count:0 mapcount:0 mappi flags: 0x6fffff80000800(private) page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x800(private) Call Trace: [...] dump_stack+0x63/0x87 [...] bad_page+0x114/0x130 [...] free_pages_prepare+0x299/0x2d0 [...] free_hot_cold_page+0x31/0x150 [...] __free_pages+0x25/0x30 [...] free_pagetable+0x6f/0xb4 [...] remove_pagetable+0x379/0x7ff [...] vmemmap_free+0x10/0x20 [...] sparse_remove_one_section+0x149/0x180 [...] __remove_pages+0x2e9/0x4f0 [...] arch_remove_memory+0x63/0xc0 [...] remove_memory+0x8c/0xc0 [...] acpi_memory_device_remove+0x79/0xa5 [...] acpi_bus_trim+0x5a/0x8d [...] acpi_bus_trim+0x38/0x8d [...] acpi_device_hotplug+0x1b7/0x418 [...] acpi_hotplug_work_fn+0x1e/0x29 [...] process_one_work+0x152/0x400 [...] worker_thread+0x125/0x4b0 [...] kthread+0xd8/0xf0 [...] ret_from_fork+0x22/0x40 And the issue still silently occurs. Until freeing the pages of page table allocated from bootmem allocator, the page->freelist is never used. So the patch sets magic number to page->freelist instead of page->lru.next. [isimatu.yasuaki@jp.fujitsu.com: fix merge issue] Link: http://lkml.kernel.org/r/722b1cc4-93ac-dd8b-2be2-7a7e313b3b0b@gmail.com Link: http://lkml.kernel.org/r/2c29bd9f-5b67-02d0-18a3-8828e78bbb6f@gmail.com Signed-off-by: Yasuaki Ishimatsu Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. 
Peter Anvin Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 2 +- mm/memory_hotplug.c | 5 +++-- mm/sparse.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index af85b686a7b0..97346f987ef2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -679,7 +679,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->lru.next; + magic = (unsigned long)page->freelist; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b8c11e063ff0..d67787d10ff0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -179,7 +179,7 @@ static void release_memory_resource(struct resource *res) void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { - page->lru.next = (struct list_head *) type; + page->freelist = (void *)type; SetPagePrivate(page); set_page_private(page, info); page_ref_inc(page); @@ -189,11 +189,12 @@ void put_page_bootmem(struct page *page) { unsigned long type; - type = (unsigned long) page->lru.next; + type = (unsigned long) page->freelist; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { + page->freelist = NULL; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/sparse.c b/mm/sparse.c index dc30a70e1dce..db6bf3c97ea2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -662,7 +662,7 @@ static void free_map_bootmem(struct page *memmap) >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { - magic = (unsigned long) page->lru.next; + magic = (unsigned long) page->freelist; BUG_ON(magic == NODE_INFO); -- cgit From 16e72e9b30986ee15f17fbb68189ca842c32af58 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 22 Feb 2017 15:45:16 -0800 Subject: powerpc: do not make the entire heap executable On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 000014 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 000084 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 0000a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x10000 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. 
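As a rough user-space sketch of that check (not the loader code; it assumes a 32-bit ELF in host byte order and uses only standard <elf.h> definitions), the interesting condition is a PT_LOAD segment with p_memsz > p_filesz whose p_flags carry PF_X:

        /*
         * Hedged sketch, not loader code: list PT_LOAD segments whose
         * in-memory size exceeds the file size (the zero-filled tail that
         * holds .plt/.bss) and report whether the header asks for that
         * tail to be executable (PF_X).
         */
        #include <elf.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/types.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                Elf32_Ehdr eh;
                Elf32_Phdr ph;
                int fd, i;

                if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                        return 1;
                if (read(fd, &eh, sizeof(eh)) != (ssize_t)sizeof(eh))
                        return 1;
                for (i = 0; i < eh.e_phnum; i++) {
                        off_t off = eh.e_phoff + (off_t)i * eh.e_phentsize;

                        if (pread(fd, &ph, sizeof(ph), off) != (ssize_t)sizeof(ph))
                                return 1;
                        if (ph.p_type != PT_LOAD || ph.p_memsz <= ph.p_filesz)
                                continue;
                        printf("LOAD at 0x%08lx: %lu byte zero-fill tail, %s\n",
                               (unsigned long)ph.p_vaddr,
                               (unsigned long)(ph.p_memsz - ph.p_filesz),
                               (ph.p_flags & PF_X) ? "PF_X set, needs VM_EXEC"
                                                   : "PF_X clear");
                }
                close(fd);
                return 0;
        }

Run against a --bss-plt ppc32 binary such as the one described above, the data segment should report PF_X set; a --secure-plt build should not.
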
Test program showing the difference in /proc/$PID/maps: int main() { char buf[16*1024]; char *p = malloc(123); /* make "[heap]" mapping appear */ int fd = open("/proc/self/maps", O_RDONLY); int len = read(fd, buf, sizeof(buf)); write(1, buf, len); printf("%p\n", p); return 0; } Compiled using: gcc -mbss-plt -m32 -Os test.c -otest Unpatched ppc64 kernel: 00100000-00120000 r-xp 00000000 00:00 0 [vdso] 0fe10000-0ffd0000 r-xp 00000000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffd0000-0ffe0000 r--p 001b0000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffe0000-0fff0000 rw-p 001c0000 fd:00 67898094 /usr/lib/libc-2.17.so 10000000-10010000 r-xp 00000000 fd:00 100674505 /home/user/test 10010000-10020000 r--p 00000000 fd:00 100674505 /home/user/test 10020000-10030000 rw-p 00010000 fd:00 100674505 /home/user/test 10690000-106c0000 rwxp 00000000 00:00 0 [heap] f7f70000-f7fa0000 r-xp 00000000 fd:00 67898089 /usr/lib/ld-2.17.so f7fa0000-f7fb0000 r--p 00020000 fd:00 67898089 /usr/lib/ld-2.17.so f7fb0000-f7fc0000 rw-p 00030000 fd:00 67898089 /usr/lib/ld-2.17.so ffa90000-ffac0000 rw-p 00000000 00:00 0 [stack] 0x10690008 Patched ppc64 kernel: 00100000-00120000 r-xp 00000000 00:00 0 [vdso] 0fe10000-0ffd0000 r-xp 00000000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffd0000-0ffe0000 r--p 001b0000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffe0000-0fff0000 rw-p 001c0000 fd:00 67898094 /usr/lib/libc-2.17.so 10000000-10010000 r-xp 00000000 fd:00 100674505 /home/user/test 10010000-10020000 r--p 00000000 fd:00 100674505 /home/user/test 10020000-10030000 rw-p 00010000 fd:00 100674505 /home/user/test 10180000-101b0000 rw-p 00000000 00:00 0 [heap] ^^^^ this has changed f7c60000-f7c90000 r-xp 00000000 fd:00 67898089 /usr/lib/ld-2.17.so f7c90000-f7ca0000 r--p 00020000 fd:00 67898089 /usr/lib/ld-2.17.so f7ca0000-f7cb0000 rw-p 00030000 fd:00 67898089 /usr/lib/ld-2.17.so ff860000-ff890000 rw-p 00000000 00:00 0 [stack] 0x10180008 The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. Link: http://lkml.kernel.org/r/20161215131950.23054-1-dvlasenk@redhat.com Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Acked-by: Kees Cook Acked-by: Michael Ellerman Tested-by: Jason Gunthorpe Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Aneesh Kumar K.V" Cc: Oleg Nesterov Cc: Florian Weimer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/page.h | 4 +++- fs/binfmt_elf.c | 30 ++++++++++++++++++++++-------- include/linux/mm.h | 1 + mm/mmap.c | 24 +++++++++++++++++++----- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 47120bf2670c..2a32483c7b6c 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -230,7 +230,9 @@ extern long long virt_phys_offset; * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ +#define VM_DATA_DEFAULT_FLAGS32 \ + (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) | \ + VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e7bf01373bc4..443a6f537d56 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* + * Map the last of the bss segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * elf_bss and last_bss is the bss section. */ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) + if (k > last_bss) { last_bss = k; + bss_prot = elf_prot; + } } } @@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* * Next, align both the file and mem bss up to the page size, * since this is where elf_bss was just zeroed up to, and where - * last_bss will end after the vm_brk() below. + * last_bss will end after the vm_brk_flags() below. */ elf_bss = ELF_PAGEALIGN(elf_bss); last_bss = ELF_PAGEALIGN(last_bss); /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { - error = vm_brk(elf_bss, last_bss - elf_bss); + error = vm_brk_flags(elf_bss, last_bss - elf_bss, + bss_prot & PROT_EXEC ? VM_EXEC : 0); if (error) goto out; } @@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; + int bss_prot = 0; int retval, i; unsigned long elf_entry; unsigned long interp_load_addr = 0; @@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm) before this one. Map anonymous pages, if needed, and clear the area. */ retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias); + elf_brk + load_bias, + bss_prot); if (retval) goto out_free_dentry; nbyte = ELF_PAGEOFFSET(elf_bss); @@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) + if (k > elf_brk) { + bss_prot = elf_prot; elf_brk = k; + } } loc->elf_ex.e_entry += load_bias; @@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * mapping in the interpreter, to make sure it doesn't wind * up getting placed where the bss needs to go. 
*/ - retval = set_brk(elf_bss, elf_brk); + retval = set_brk(elf_bss, elf_brk, bss_prot); if (retval) goto out_free_dentry; if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { diff --git a/include/linux/mm.h b/include/linux/mm.h index bb997493e15d..dae6f58d67c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2083,6 +2083,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} /* These take the mm semaphore themselves */ extern int __must_check vm_brk(unsigned long, unsigned long); +extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index dc4291dcc99b..b729084eea90 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2806,11 +2806,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk(unsigned long addr, unsigned long request) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long flags, len; + unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2821,7 +2821,10 @@ static int do_brk(unsigned long addr, unsigned long request) if (!len) return 0; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (offset_in_page(error)) @@ -2889,7 +2892,12 @@ out: return 0; } -int vm_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len) +{ + return do_brk_flags(addr, len, 0); +} + +int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; int ret; @@ -2898,13 +2906,19 @@ int vm_brk(unsigned long addr, unsigned long len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk(addr, len); + ret = do_brk_flags(addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); if (populate && !ret) mm_populate(addr, len); return ret; } +EXPORT_SYMBOL(vm_brk_flags); + +int vm_brk(unsigned long addr, unsigned long len) +{ + return vm_brk_flags(addr, len, 0); +} EXPORT_SYMBOL(vm_brk); /* Release all mmaps. */ -- cgit From 6a991fc72d1243b8da0c644d3147d3ec41a0b281 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Wed, 22 Feb 2017 15:45:19 -0800 Subject: mm/swap: fix kernel message in swap_info_get() Patch series "mm/swap: Regular page swap optimizations", v5. Times have changed. Coming generation of Solid state Block device latencies are getting down to sub 100 usec, which is within an order of magnitude of DRAM, and their performance is orders of magnitude higher than the single- spindle rotational media we've swapped to historically. This could benefit many usage scenearios. For example cloud providers who overcommit their memory (as VM don't use all the memory provisioned). Having a fast swap will allow them to be more aggressive in memory overcommit and fit more VMs to a platform. 
In our testing [see footnote], the median latency that the kernel adds to a page fault is 15 usec, which comes quite close to the amount that will be contributed by the underlying I/O devices. The software latency comes mostly from contentions on the locks protecting the radix tree of the swap cache and also the locks protecting the individual swap devices. The lock contentions already consumed 35% of cpu cycles in our test. In the very near future, software latency will become the bottleneck to swap performnace as block device I/O latency gets within the shouting distance of DRAM speed. This patch set, reduced the median page fault latency from 15 usec to 4 usec (375% reduction) for DRAM based pmem block device. This patch (of 9): swap_info_get() is used not only in swap free code path but also in page_swapcount(), etc. So the original kernel message in swap_info_get() is not correct now. Fix it via replacing "swap_free" to "swap_info_get" in the message. Link: http://lkml.kernel.org/r/9b5f8bd6266f9da978c373f2384c8044df5e262c.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Reviewed-by: Rik van Riel Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4761701d1721..2001ce427a1d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -753,16 +753,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) return p; bad_free: - pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); + pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); goto out; bad_offset: - pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); out: return NULL; } -- cgit From 235b62176712b970c815923e36b9a9cc05d4d901 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Wed, 22 Feb 2017 15:45:22 -0800 Subject: mm/swap: add cluster lock This patch is to reduce the lock contention of swap_info_struct->lock via using a more fine grained lock in swap_cluster_info for some swap operations. swap_info_struct->lock is heavily contended if multiple processes reclaim pages simultaneously. Because there is only one lock for each swap device. While in common configuration, there is only one or several swap devices in the system. The lock protects almost all swap related operations. In fact, many swap operations only access one element of swap_info_struct->swap_map array. And there is no dependency between different elements of swap_info_struct->swap_map. So a fine grained lock can be used to allow parallel access to the different elements of swap_info_struct->swap_map. In this patch, a spinlock is added to swap_cluster_info to protect the elements of swap_info_struct->swap_map in the swap cluster and the fields of swap_cluster_info. This reduced locking contention for swap_info_struct->swap_map access greatly. 
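Stripped of kernel details, the sharding idea looks roughly like the following user-space sketch, with pthread mutexes standing in for spinlocks (CLUSTER_SIZE and all names are illustrative, not the kernel's):

        /*
         * User-space sketch of the sharding: slots[i] is guarded only by
         * the lock of the cluster that contains i, so threads working in
         * different clusters do not serialize on one device-wide lock.
         */
        #include <pthread.h>

        #define CLUSTER_SIZE 256                /* slots per cluster, like SWAPFILE_CLUSTER */

        struct cluster {
                pthread_mutex_t lock;           /* protects this cluster's slots only */
                unsigned int free;              /* per-cluster bookkeeping */
        };

        struct swap_map {
                unsigned char *slots;           /* one usage byte per slot */
                struct cluster *clusters;       /* slots[i] guarded by clusters[i / CLUSTER_SIZE] */
        };

        static struct cluster *lock_cluster(struct swap_map *m, unsigned long off)
        {
                struct cluster *c = &m->clusters[off / CLUSTER_SIZE];

                pthread_mutex_lock(&c->lock);
                return c;
        }

        static void unlock_cluster(struct cluster *c)
        {
                pthread_mutex_unlock(&c->lock);
        }

        /* claim one slot while holding only its cluster lock */
        static int claim_slot(struct swap_map *m, unsigned long off)
        {
                struct cluster *c = lock_cluster(m, off);
                int ok = (m->slots[off] == 0);

                if (ok) {
                        m->slots[off] = 1;
                        c->free--;
                }
                unlock_cluster(c);
                return ok;
        }

        int main(void)
        {
                enum { NSLOTS = 4 * CLUSTER_SIZE };
                static unsigned char slots[NSLOTS];
                static struct cluster clusters[NSLOTS / CLUSTER_SIZE];
                struct swap_map m = { slots, clusters };
                unsigned long i;

                for (i = 0; i < NSLOTS / CLUSTER_SIZE; i++) {
                        pthread_mutex_init(&clusters[i].lock, NULL);
                        clusters[i].free = CLUSTER_SIZE;
                }
                return !claim_slot(&m, 300);    /* slot 300 is guarded by clusters[1] */
        }

Two threads operating on slots in different clusters no longer contend on a single device-wide lock, which is where the reduction in lock contention comes from.
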
Because of the added spinlock, the size of swap_cluster_info increases from 4 bytes to 8 bytes on the 64 bit and 32 bit system. This will use additional 4k RAM for every 1G swap space. Because the size of swap_cluster_info is much smaller than the size of the cache line (8 vs 64 on x86_64 architecture), there may be false cache line sharing between spinlocks in swap_cluster_info. To avoid the false sharing in the first round of the swap cluster allocation, the order of the swap clusters in the free clusters list is changed. So that, the swap_cluster_info sharing the same cache line will be placed as far as possible. After the first round of allocation, the order of the clusters in free clusters list is expected to be random. So the false sharing should be not serious. Compared with a previous implementation using bit_spin_lock, the sequential swap out throughput improved about 3.2%. Test was done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case created 32 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used. [ying.huang@intel.com: v5] Link: http://lkml.kernel.org/r/878tqeuuic.fsf_-_@yhuang-dev.intel.com [minchan@kernel.org: initialize spinlock for swap_cluster_info] Link: http://lkml.kernel.org/r/1486434945-29753-1-git-send-email-minchan@kernel.org [hughd@google.com: annotate nested locking for cluster lock] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1702161050540.21773@eggly.anvils Link: http://lkml.kernel.org/r/dbb860bbd825b1aaba18988015e8963f263c3f0d.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Signed-off-by: Minchan Kim Signed-off-by: Hugh Dickins Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++ mm/swapfile.c | 215 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 179 insertions(+), 42 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 7f47b7098b1b..279c7b84bd63 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -176,6 +176,12 @@ enum { * protected by swap_info_struct.lock. 
*/ struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields + * and swap_info_struct->swap_map + * elements correspond to the swap + * cluster + */ unsigned int data:24; unsigned int flags:8; }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2001ce427a1d..eb71b5d9430b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -257,6 +257,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = si->cluster_info; + if (ci) { + ci += offset / SWAPFILE_CLUSTER; + spin_lock(&ci->lock); + } + return ci; +} + +static inline void unlock_cluster(struct swap_cluster_info *ci) +{ + if (ci) + spin_unlock(&ci->lock); +} + +static inline struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (!ci) + spin_lock(&si->lock); + + return ci; +} + +static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + if (ci) + unlock_cluster(ci); + else + spin_unlock(&si->lock); +} + static inline bool cluster_list_empty(struct swap_cluster_list *list) { return cluster_is_null(&list->head); @@ -281,9 +322,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, cluster_set_next_flag(&list->head, idx, 0); cluster_set_next_flag(&list->tail, idx, 0); } else { + struct swap_cluster_info *ci_tail; unsigned int tail = cluster_next(&list->tail); - cluster_set_next(&ci[tail], idx); + /* + * Nested cluster lock, but both cluster locks are + * only acquired when we held swap_info_struct->lock + */ + ci_tail = ci + tail; + spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); + cluster_set_next(ci_tail, idx); + unlock_cluster(ci_tail); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -328,7 +377,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info; + struct swap_cluster_info *info, *ci; unsigned int idx; info = si->cluster_info; @@ -341,10 +390,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) SWAPFILE_CLUSTER); spin_lock(&si->lock); - cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + cluster_set_flag(ci, CLUSTER_FLAG_FREE); + unlock_cluster(ci); cluster_list_add_tail(&si->free_clusters, info, idx); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); } } @@ -447,8 +500,9 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; + struct swap_cluster_info *ci; bool found_free; - unsigned long tmp; + unsigned long tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); @@ -476,14 +530,21 @@ new_cluster: * check if there is still free entry in the cluster */ tmp = cluster->next; - while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * - SWAPFILE_CLUSTER) { + max = min_t(unsigned long, si->max, + (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + if (tmp >= max) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + ci = lock_cluster(si, tmp); + while (tmp < max) { if (!si->swap_map[tmp]) { found_free = true; 
break; } tmp++; } + unlock_cluster(ci); if (!found_free) { cluster_set_null(&cluster->index); goto new_cluster; @@ -496,6 +557,7 @@ new_cluster: static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { + struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; @@ -572,9 +634,11 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; + unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); spin_lock(&si->lock); @@ -584,8 +648,10 @@ checks: goto scan; /* check next one */ } - if (si->swap_map[offset]) + if (si->swap_map[offset]) { + unlock_cluster(ci); goto scan; + } if (offset == si->lowest_bit) si->lowest_bit++; @@ -601,6 +667,7 @@ checks: } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -731,7 +798,7 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset, type; @@ -749,7 +816,6 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - spin_lock(&p->lock); return p; bad_free: @@ -767,14 +833,45 @@ out: return NULL; } +static struct swap_info_struct *swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + if (p) + spin_lock(&p->lock); + return p; +} + static unsigned char swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry, unsigned char usage, + bool swap_info_locked) { + struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; + bool lock_swap_info = false; + + if (!swap_info_locked) { + count = p->swap_map[offset]; + if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) { +lock_swap_info: + swap_info_locked = true; + lock_swap_info = true; + spin_lock(&p->lock); + } + } + + ci = lock_cluster(p, offset); count = p->swap_map[offset]; + + if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) { + unlock_cluster(ci); + goto lock_swap_info; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -800,10 +897,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, usage = count | has_cache; p->swap_map[offset] = usage; + unlock_cluster(ci); + /* free if no reference */ if (!usage) { + VM_BUG_ON(!swap_info_locked); mem_cgroup_uncharge_swap(entry); + ci = lock_cluster(p, offset); dec_cluster_info_page(p, p->cluster_info, offset); + unlock_cluster(ci); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) { @@ -829,6 +931,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } } + if (lock_swap_info) + spin_unlock(&p->lock); + return usage; } @@ -840,11 +945,9 @@ void swap_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); - if (p) { - swap_entry_free(p, entry, 1); - spin_unlock(&p->lock); - } + p = _swap_info_get(entry); + if (p) + swap_entry_free(p, entry, 1, false); } /* @@ -854,11 +957,9 @@ void 
swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); - if (p) { - swap_entry_free(p, entry, SWAP_HAS_CACHE); - spin_unlock(&p->lock); - } + p = _swap_info_get(entry); + if (p) + swap_entry_free(p, entry, SWAP_HAS_CACHE, false); } /* @@ -870,13 +971,17 @@ int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; + struct swap_cluster_info *ci; swp_entry_t entry; + unsigned long offset; entry.val = page_private(page); - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&p->lock); + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(p, offset); + count = swap_count(p->swap_map[offset]); + unlock_cluster_or_swap_info(p, ci); } return count; } @@ -889,22 +994,26 @@ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *p; + struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (!p) return 0; - count = swap_count(p->swap_map[swp_offset(entry)]); + offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + + count = swap_count(p->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; - offset = swp_offset(entry); page = vmalloc_to_page(p->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); @@ -919,7 +1028,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); return count; } @@ -1017,7 +1126,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { + if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -2298,6 +2407,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } +#define SWAP_CLUSTER_COLS \ + DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) + static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, @@ -2305,11 +2417,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, unsigned long maxpages, sector_t *span) { - int i; + unsigned int j, k; unsigned int nr_good_pages; int nr_extents; unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; + unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; + unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ @@ -2357,15 +2470,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (!cluster_info) return nr_extents; - for (i = 0; i < nr_clusters; i++) { - if (!cluster_count(&cluster_info[idx])) { + + /* Reduce false cache line sharing between cluster_info */ + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + idx = i * SWAP_CLUSTER_COLS + j; + if (idx >= nr_clusters) + continue; + if (cluster_count(&cluster_info[idx])) + continue; cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } - idx++; - if (idx == nr_clusters) - idx = 0; } 
return nr_extents; } @@ -2468,6 +2586,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; + unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; /* @@ -2475,13 +2594,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * SSD */ p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - cluster_info = vzalloc(DIV_ROUND_UP(maxpages, - SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); if (!cluster_info) { error = -ENOMEM; goto bad_swap; } + + for (ci = 0; ci < nr_cluster; ci++) + spin_lock_init(&((cluster_info + ci)->lock)); + p->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!p->percpu_cluster) { error = -ENOMEM; @@ -2627,6 +2750,7 @@ void si_swapinfo(struct sysinfo *val) static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; + struct swap_cluster_info *ci; unsigned long offset, type; unsigned char count; unsigned char has_cache; @@ -2640,10 +2764,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto bad_file; p = swap_info[type]; offset = swp_offset(entry); - - spin_lock(&p->lock); if (unlikely(offset >= p->max)) - goto unlock_out; + goto out; + + ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -2686,7 +2810,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); out: return err; @@ -2775,6 +2899,7 @@ EXPORT_SYMBOL_GPL(__page_file_index); int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; + struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; @@ -2798,6 +2923,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } offset = swp_offset(entry); + + ci = lock_cluster(si, offset); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { @@ -2810,6 +2938,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { + unlock_cluster(ci); spin_unlock(&si->lock); return -ENOMEM; } @@ -2858,6 +2987,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: + unlock_cluster(ci); spin_unlock(&si->lock); outer: if (page) @@ -2871,7 +3001,8 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster + * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) -- cgit From 4b3ef9daa4fc0bba742a79faecb17fdaaead083b Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Wed, 22 Feb 2017 15:45:26 -0800 Subject: mm/swap: split swap cache into 64MB trunks The patch is to improve the scalability of the swap out/in via using fine grained locks for the swap cache. In current kernel, one address space will be used for each swap device. 
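The split is purely an indexing change: the swap offset selects one of several address spaces per device. A free-standing sketch of that arithmetic follows (the shift of 14 is the SWAP_ADDRESS_SPACE_SHIFT value added in the diff below, 2^14 pages of 4KB being 64MB; the struct is a stand-in, not the kernel's address_space):

        /*
         * Sketch of the trunk lookup: spaces[] plays the role of
         * swapper_spaces[type] after the split, one element per 64MB of
         * swap, selected by shifting the swap offset.
         */
        #include <stdio.h>

        #define SWAP_ADDRESS_SPACE_SHIFT 14

        struct address_space {
                unsigned long nrpages;          /* stand-in field */
        };

        static struct address_space *
        swap_space_for(struct address_space *spaces, unsigned long offset)
        {
                return &spaces[offset >> SWAP_ADDRESS_SPACE_SHIFT];
        }

        int main(void)
        {
                struct address_space spaces[4] = {{ 0 }};       /* covers 4 * 64MB of swap */

                /* offsets 0x3fff and 0x4000 land in different trunks, so
                 * their radix tree updates take different tree locks */
                printf("trunk of 0x3fff: %ld\n",
                       (long)(swap_space_for(spaces, 0x3fff) - spaces));
                printf("trunk of 0x4000: %ld\n",
                       (long)(swap_space_for(spaces, 0x4000) - spaces));
                return 0;
        }
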
And in the common configuration, the number of the swap device is very small (one is typical). This causes the heavy lock contention on the radix tree of the address space if multiple tasks swap out/in concurrently. But in fact, there is no dependency between pages in the swap cache. So that, we can split the one shared address space for each swap device into several address spaces to reduce the lock contention. In the patch, the shared address space is split into 64MB trunks. 64MB is chosen to balance the memory space usage and effect of lock contention reduction. The size of struct address_space on x86_64 architecture is 408B, so with the patch, 6528B more memory will be used for every 1GB swap space on x86_64 architecture. One address space is still shared for the swap entries in the same 64M trunks. To avoid lock contention for the first round of swap space allocation, the order of the swap clusters in the initial free clusters list is changed. The swap space distance between the consecutive swap clusters in the free cluster list is at least 64M. After the first round of allocation, the swap clusters are expected to be freed randomly, so the lock contention should be reduced effectively. Link: http://lkml.kernel.org/r/735bab895e64c930581ffb0a05b661e01da82bc5.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 +++++++-- mm/swap.c | 6 ----- mm/swap_state.c | 68 ++++++++++++++++++++++++++++++++++++++++++---------- mm/swapfile.c | 16 +++++++++++-- 4 files changed, 79 insertions(+), 22 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 279c7b84bd63..648a32b58e3e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -343,8 +343,13 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); /* linux/mm/swap_state.c */ -extern struct address_space swapper_spaces[]; -#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) +/* One swap address space for each 64M swap space */ +#define SWAP_ADDRESS_SPACE_SHIFT 14 +#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +extern struct address_space *swapper_spaces[]; +#define swap_address_space(entry) \ + (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ + >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, struct list_head *list); @@ -398,6 +403,8 @@ extern struct swap_info_struct *page_swap_info(struct page *); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; +extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); +extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ diff --git a/mm/swap.c b/mm/swap.c index 844baedd2429..aabf2e90fe32 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -971,12 +971,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); -#ifdef CONFIG_SWAP - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) - 
spin_lock_init(&swapper_spaces[i].tree_lock); -#endif /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap_state.c b/mm/swap_state.c index 35d7e0ee1c77..3863acd6189c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -32,15 +33,8 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space swapper_spaces[MAX_SWAPFILES] = { - [0 ... MAX_SWAPFILES - 1] = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .i_mmap_writable = ATOMIC_INIT(0), - .a_ops = &swap_aops, - /* swap cache doesn't use writeback related tags */ - .flags = 1 << AS_NO_WRITEBACK_TAGS, - } -}; +struct address_space *swapper_spaces[MAX_SWAPFILES]; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,11 +47,26 @@ static struct { unsigned long total_swapcache_pages(void) { - int i; + unsigned int i, j, nr; unsigned long ret = 0; + struct address_space *spaces; - for (i = 0; i < MAX_SWAPFILES; i++) - ret += swapper_spaces[i].nrpages; + rcu_read_lock(); + for (i = 0; i < MAX_SWAPFILES; i++) { + /* + * The corresponding entries in nr_swapper_spaces and + * swapper_spaces will be reused only after at least + * one grace period. So it is impossible for them + * belongs to different usage. + */ + nr = nr_swapper_spaces[i]; + spaces = rcu_dereference(swapper_spaces[i]); + if (!nr || !spaces) + continue; + for (j = 0; j < nr; j++) + ret += spaces[j].nrpages; + } + rcu_read_unlock(); return ret; } @@ -505,3 +514,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } + +int init_swap_address_space(unsigned int type, unsigned long nr_pages) +{ + struct address_space *spaces, *space; + unsigned int i, nr; + + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + spaces = vzalloc(sizeof(struct address_space) * nr); + if (!spaces) + return -ENOMEM; + for (i = 0; i < nr; i++) { + space = spaces + i; + INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + atomic_set(&space->i_mmap_writable, 0); + space->a_ops = &swap_aops; + /* swap cache doesn't use writeback related tags */ + mapping_set_no_writeback_tags(space); + spin_lock_init(&space->tree_lock); + } + nr_swapper_spaces[type] = nr; + rcu_assign_pointer(swapper_spaces[type], spaces); + + return 0; +} + +void exit_swap_address_space(unsigned int type) +{ + struct address_space *spaces; + + spaces = swapper_spaces[type]; + nr_swapper_spaces[type] = 0; + rcu_assign_pointer(swapper_spaces[type], NULL); + synchronize_rcu(); + kvfree(spaces); +} diff --git a/mm/swapfile.c b/mm/swapfile.c index eb71b5d9430b..66e95eb73040 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2084,6 +2084,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); + exit_swap_address_space(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2407,8 +2408,12 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } -#define SWAP_CLUSTER_COLS \ +#define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) +#define SWAP_CLUSTER_SPACE_COLS \ + DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) +#define SWAP_CLUSTER_COLS \ + max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct 
swap_info_struct *p, union swap_header *swap_header, @@ -2471,7 +2476,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; - /* Reduce false cache line sharing between cluster_info */ + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. + */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { @@ -2661,6 +2669,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + error = init_swap_address_space(p->type, maxpages); + if (error) + goto bad_swap; + mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) -- cgit From e8c26ab60598558ec3a626e7925b06e7417d7710 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 22 Feb 2017 15:45:29 -0800 Subject: mm/swap: skip readahead for unreferenced swap slots We can avoid needlessly allocating page for swap slots that are not used by anyone. No pages have to be read in for these slots. Link: http://lkml.kernel.org/r/0784b3f20b9bd3aa5552219624cb78dc4ae710c9.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ mm/swap_state.c | 4 ++++ mm/swapfile.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 648a32b58e3e..3116382067cd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -398,6 +398,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern bool reuse_swap_page(struct page *, int *); @@ -492,6 +493,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swp_swapcount(swp_entry_t entry) +{ + return 0; +} + static inline int swp_swapcount(swp_entry_t entry) { return 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index 3863acd6189c..3d76d80c07d6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -323,6 +323,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (found_page) break; + /* Just skip read ahead for unused swap slot */ + if (!__swp_swapcount(entry)) + return NULL; + /* * Get a new page to read into from swap. 
*/ diff --git a/mm/swapfile.c b/mm/swapfile.c index 66e95eb73040..7e888de35c41 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -798,7 +798,7 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset, type; @@ -814,13 +814,8 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; - if (!p->swap_map[offset]) - goto bad_free; return p; -bad_free: - pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); - goto out; bad_offset: pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); goto out; @@ -833,6 +828,24 @@ out: return NULL; } +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = __swap_info_get(entry); + if (!p) + goto out; + if (!p->swap_map[swp_offset(entry)]) + goto bad_free; + return p; + +bad_free: + pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); + goto out; +out: + return NULL; +} + static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -986,6 +999,28 @@ int page_swapcount(struct page *page) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ +int __swp_swapcount(swp_entry_t entry) +{ + int count = 0; + pgoff_t offset; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + + si = __swap_info_get(entry); + if (si) { + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + } + return count; +} + /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. -- cgit From 36005bae205da3eef0016a5c96a34f10a68afa1e Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 22 Feb 2017 15:45:33 -0800 Subject: mm/swap: allocate swap slots in batches Currently, the swap slots are allocated one page at a time, causing contention to the swap_info lock protecting the swap partition on every page being swapped. This patch adds new functions get_swap_pages and scan_swap_map_slots to request multiple swap slots at once. This will reduces the lock contention on the swap_info lock. Also scan_swap_map_slots can operate more efficiently as swap slots often occurs in clusters close to each other on a swap device and it is quicker to allocate them together. Link: http://lkml.kernel.org/r/9fec2845544371f62c3763d43510045e33d286a6.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 + mm/swapfile.c | 136 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 113 insertions(+), 25 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 3116382067cd..956eae8a8edf 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -27,6 +27,7 @@ struct bio; #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) +#define SWAP_BATCH 64 static inline int current_is_kswapd(void) { @@ -386,6 +387,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); +extern int get_swap_pages(int n, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); diff --git a/mm/swapfile.c b/mm/swapfile.c index 7e888de35c41..e73b5441055b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -496,7 +496,7 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, * Try to get a swap entry from current cpu's swap entry pool (a cluster). This * might involve allocating a new cluster for current CPU too. */ -static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; @@ -520,7 +520,7 @@ new_cluster: *scan_base = *offset = si->cluster_next; goto new_cluster; } else - return; + return false; } found_free = false; @@ -552,16 +552,22 @@ new_cluster: cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + return found_free; } -static unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[]) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int n_ret = 0; + + if (nr > SWAP_BATCH) + nr = SWAP_BATCH; /* * We try to cluster swap pages by allocating them sequentially @@ -579,8 +585,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); - goto checks; + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto scan; } if (unlikely(!si->cluster_nr--)) { @@ -624,8 +632,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + /* take a break if we already got some slots */ + if (n_ret) + goto done; + if (!scan_swap_map_try_ssd_cluster(si, &offset, + &scan_base)) + goto scan; + } } if (!(si->flags & SWP_WRITEOK)) goto no_page; @@ -650,7 +664,10 @@ checks: if (si->swap_map[offset]) { unlock_cluster(ci); - goto scan; + if (!n_ret) + goto scan; + else + goto done; } if (offset == si->lowest_bit) @@ -669,9 +686,43 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); si->cluster_next = offset + 1; - 
si->flags -= SWP_SCANNING; + slots[n_ret++] = swp_entry(si->type, offset); + + /* got enough slots or reach max slots? */ + if ((n_ret == nr) || (offset >= si->highest_bit)) + goto done; + + /* search for next available slot */ + + /* time to take a break? */ + if (unlikely(--latency_ration < 0)) { + if (n_ret) + goto done; + spin_unlock(&si->lock); + cond_resched(); + spin_lock(&si->lock); + latency_ration = LATENCY_LIMIT; + } - return offset; + /* try to get more slots in cluster */ + if (si->cluster_info) { + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto done; + } + /* non-ssd case */ + ++offset; + + /* non-ssd case, still more slots in cluster? */ + if (si->cluster_nr && !si->swap_map[offset]) { + --si->cluster_nr; + goto checks; + } + +done: + si->flags -= SWP_SCANNING; + return n_ret; scan: spin_unlock(&si->lock); @@ -709,17 +760,41 @@ scan: no_page: si->flags -= SWP_SCANNING; - return 0; + return n_ret; } -swp_entry_t get_swap_page(void) +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) +{ + swp_entry_t entry; + int n_ret; + + n_ret = scan_swap_map_slots(si, usage, 1, &entry); + + if (n_ret) + return swp_offset(entry); + else + return 0; + +} + +int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) { struct swap_info_struct *si, *next; - pgoff_t offset; + long avail_pgs; + int n_ret = 0; - if (atomic_long_read(&nr_swap_pages) <= 0) + avail_pgs = atomic_long_read(&nr_swap_pages); + if (avail_pgs <= 0) goto noswap; - atomic_long_dec(&nr_swap_pages); + + if (n_goal > SWAP_BATCH) + n_goal = SWAP_BATCH; + + if (n_goal > avail_pgs) + n_goal = avail_pgs; + + atomic_long_sub(n_goal, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -745,14 +820,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - - /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_HAS_CACHE); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (offset) - return swp_entry(si->type, offset); + if (n_ret) + goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); + si->type); + spin_lock(&swap_avail_lock); nextsi: /* @@ -763,7 +838,8 @@ nextsi: * up between us dropping swap_avail_lock and taking si->lock. * Since we dropped the swap_avail_lock, the swap_avail_head * list may have been modified; so if next is still in the - * swap_avail_head list then try it, otherwise start over. + * swap_avail_head list then try it, otherwise start over + * if we have not gotten any slots. */ if (plist_node_empty(&next->avail_list)) goto start_over; @@ -771,9 +847,19 @@ nextsi: spin_unlock(&swap_avail_lock); - atomic_long_inc(&nr_swap_pages); +check_out: + if (n_ret < n_goal) + atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); noswap: - return (swp_entry_t) {0}; + return n_ret; +} + +swp_entry_t get_swap_page(void) +{ + swp_entry_t entry; + + get_swap_pages(1, &entry); + return entry; } /* The only caller of this function is now suspend routine */ -- cgit From 7c00bafee87c7bac7ed9eced7c161f8e5332cb4e Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 22 Feb 2017 15:45:36 -0800 Subject: mm/swap: free swap slots in batch Add new functions that free unused swap slots in batches without the need to reacquire swap info lock. This improves scalability and reduce lock contention. 
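The underlying pattern, only retaking a device lock when consecutive entries in the batch belong to a different device (what swap_info_get_cont() does below), can be sketched in user space as follows; all types and names here are stand-ins:

        /*
         * User-space sketch of the batched free: walk a batch of entries
         * and only drop/retake a device lock when the device changes
         * between consecutive entries.
         */
        #include <pthread.h>
        #include <stddef.h>

        struct swap_dev {
                pthread_mutex_t lock;
                unsigned long freed;
        };

        struct entry {
                struct swap_dev *dev;
                unsigned long offset;
        };

        static void free_batch(struct entry *entries, int n)
        {
                struct swap_dev *prev = NULL;
                int i;

                for (i = 0; i < n; i++) {
                        struct swap_dev *dev = entries[i].dev;

                        if (dev != prev) {              /* lock boundary: device changed */
                                if (prev)
                                        pthread_mutex_unlock(&prev->lock);
                                pthread_mutex_lock(&dev->lock);
                                prev = dev;
                        }
                        dev->freed++;                   /* stand-in for swap_entry_free() */
                }
                if (prev)
                        pthread_mutex_unlock(&prev->lock);
        }

        int main(void)
        {
                struct swap_dev a = { PTHREAD_MUTEX_INITIALIZER, 0 };
                struct swap_dev b = { PTHREAD_MUTEX_INITIALIZER, 0 };
                struct entry batch[] = { { &a, 1 }, { &a, 9 }, { &b, 3 } };

                free_batch(batch, 3);   /* device a is locked once for two frees */
                return !(a.freed == 2 && b.freed == 1);
        }

When most entries in a batch come from the same device, the device lock is taken once per batch instead of once per entry.
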
Link: http://lkml.kernel.org/r/c25e0fcdfd237ec4ca7db91631d3b9f6ed23824e.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 155 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 95 insertions(+), 61 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 956eae8a8edf..bcc0b18f96d2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -394,6 +394,7 @@ extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free(swp_entry_t); +extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); diff --git a/mm/swapfile.c b/mm/swapfile.c index e73b5441055b..8b5bd34b1a00 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -942,35 +942,34 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) return p; } -static unsigned char swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage, - bool swap_info_locked) +static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, + struct swap_info_struct *q) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + + if (p != q) { + if (q != NULL) + spin_unlock(&q->lock); + if (p != NULL) + spin_lock(&p->lock); + } + return p; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; - bool lock_swap_info = false; - - if (!swap_info_locked) { - count = p->swap_map[offset]; - if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) { -lock_swap_info: - swap_info_locked = true; - lock_swap_info = true; - spin_lock(&p->lock); - } - } - ci = lock_cluster(p, offset); + ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; - if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) { - unlock_cluster(ci); - goto lock_swap_info; - } - has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -994,46 +993,52 @@ lock_swap_info: } usage = count | has_cache; - p->swap_map[offset] = usage; + p->swap_map[offset] = usage ? 
: SWAP_HAS_CACHE; + + unlock_cluster_or_swap_info(p, ci); + + return usage; +} +static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char count; + + ci = lock_cluster(p, offset); + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; + dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - /* free if no reference */ - if (!usage) { - VM_BUG_ON(!swap_info_locked); - mem_cgroup_uncharge_swap(entry); - ci = lock_cluster(p, offset); - dec_cluster_info_page(p, p->cluster_info, offset); - unlock_cluster(ci); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); + mem_cgroup_uncharge_swap(entry); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; + + p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); } } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; - if (lock_swap_info) - spin_unlock(&p->lock); - - return usage; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } /* @@ -1045,8 +1050,10 @@ void swap_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); - if (p) - swap_entry_free(p, entry, 1, false); + if (p) { + if (!__swap_entry_free(p, entry, 1)) + swapcache_free_entries(&entry, 1); + } } /* @@ -1057,8 +1064,32 @@ void swapcache_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); + if (p) { + if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) + swapcache_free_entries(&entry, 1); + } +} + +void swapcache_free_entries(swp_entry_t *entries, int n) +{ + struct swap_info_struct *p, *prev; + int i; + + if (n <= 0) + return; + + prev = NULL; + p = NULL; + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) + swap_entry_free(p, entries[i]); + else + break; + prev = p; + } if (p) - swap_entry_free(p, entry, SWAP_HAS_CACHE, false); + spin_unlock(&p->lock); } /* @@ -1241,21 +1272,23 @@ int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; struct page *page = NULL; + unsigned char count; if (non_swap_entry(entry)) return 1; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) { + count = __swap_entry_free(p, entry, 1); + if (count == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; } - } - 
spin_unlock(&p->lock); + } else if (!count) + swapcache_free_entries(&entry, 1); } if (page) { /* -- cgit From 67afa38e012e9581b9b42f2a41dfc56b1280794d Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 22 Feb 2017 15:45:39 -0800 Subject: mm/swap: add cache for swap slots allocation We add per cpu caches for swap slots that can be allocated and freed quickly without the need to touch the swap info lock. Two separate caches are maintained for swap slots allocated and swap slots returned. This is to allow the swap slots to be returned to the global pool in a batch so they will have a chance to be coaelesced with other slots in a cluster. We do not reuse the slots that are returned right away, as it may increase fragmentation of the slots. The swap allocation cache is protected by a mutex as we may sleep when searching for empty slots in cache. The swap free cache is protected by a spin lock as we cannot sleep in the free path. We refill the swap slots cache when we run out of slots, and we disable the swap slots cache and drain the slots if the global number of slots fall below a low watermark threshold. We re-enable the cache agian when the slots available are above a high watermark. [ying.huang@intel.com: use raw_cpu_ptr over this_cpu_ptr for swap slots access] [tim.c.chen@linux.intel.com: add comments on locks in swap_slots.h] Link: http://lkml.kernel.org/r/20170118180327.GA24225@linux.intel.com Link: http://lkml.kernel.org/r/35de301a4eaa8daa2977de6e987f2c154385eb66.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Reviewed-by: Michal Hocko Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 + include/linux/swap_slots.h | 28 ++++ mm/Makefile | 2 +- mm/swap_slots.c | 342 +++++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 + mm/swapfile.c | 26 ++-- 6 files changed, 391 insertions(+), 12 deletions(-) create mode 100644 include/linux/swap_slots.h create mode 100644 mm/swap_slots.c diff --git a/include/linux/swap.h b/include/linux/swap.h index bcc0b18f96d2..45e91dd6716d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -372,6 +372,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. 
*/ static inline bool vm_swap_full(void) @@ -410,6 +411,9 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern int get_swap_slots(int n, swp_entry_t *slots); +extern void swapcache_free_batch(swp_entry_t *entries, int n); + #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h new file mode 100644 index 000000000000..ba5623b27c60 --- /dev/null +++ b/include/linux/swap_slots.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_SWAP_SLOTS_H +#define _LINUX_SWAP_SLOTS_H + +#include +#include +#include + +#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH +#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) +#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) + +struct swap_slots_cache { + bool lock_initialized; + struct mutex alloc_lock; /* protects slots, nr, cur */ + swp_entry_t *slots; + int nr; + int cur; + spinlock_t free_lock; /* protects slots_ret, n_ret */ + swp_entry_t *slots_ret; + int n_ret; +}; + +void disable_swap_slots_cache_lock(void); +void reenable_swap_slots_cache_unlock(void); +int enable_swap_slots_cache(void); +int free_swap_slot(swp_entry_t entry); + +#endif /* _LINUX_SWAP_SLOTS_H */ diff --git a/mm/Makefile b/mm/Makefile index 295bd7a9f76b..433eaf9a876e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -35,7 +35,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) diff --git a/mm/swap_slots.c b/mm/swap_slots.c new file mode 100644 index 000000000000..ebf4f1cbac04 --- /dev/null +++ b/mm/swap_slots.c @@ -0,0 +1,342 @@ +/* + * Manage cache of swap slots to be used for and returned from + * swap. + * + * Copyright(c) 2016 Intel Corporation. + * + * Author: Tim Chen + * + * We allocate the swap slots from the global pool and put + * it into local per cpu caches. This has the advantage + * of no needing to acquire the swap_info lock every time + * we need a new slot. + * + * There is also opportunity to simply return the slot + * to local caches without needing to acquire swap_info + * lock. We do not reuse the returned slots directly but + * move them back to the global pool in a batch. This + * allows the slots to coaellesce and reduce fragmentation. + * + * The swap entry allocated is marked with SWAP_HAS_CACHE + * flag in map_count that prevents it from being allocated + * again from the global pool. + * + * The swap slots cache is protected by a mutex instead of + * a spin lock as when we search for slots with scan_swap_map, + * we can possibly sleep. 
+ */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SWAP + +static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +static bool swap_slot_cache_active; +static bool swap_slot_cache_enabled; +static bool swap_slot_cache_initialized; +DEFINE_MUTEX(swap_slots_cache_mutex); +/* Serialize swap slots cache enable/disable operations */ +DEFINE_MUTEX(swap_slots_cache_enable_mutex); + +static void __drain_swap_slots_cache(unsigned int type); +static void deactivate_swap_slots_cache(void); +static void reactivate_swap_slots_cache(void); + +#define use_swap_slot_cache (swap_slot_cache_active && \ + swap_slot_cache_enabled && swap_slot_cache_initialized) +#define SLOTS_CACHE 0x1 +#define SLOTS_CACHE_RET 0x2 + +static void deactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = false; + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + mutex_unlock(&swap_slots_cache_mutex); +} + +static void reactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = true; + mutex_unlock(&swap_slots_cache_mutex); +} + +/* Must not be called with cpu hot plug lock */ +void disable_swap_slots_cache_lock(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + swap_slot_cache_enabled = false; + if (swap_slot_cache_initialized) { + /* serialize with cpu hotplug operations */ + get_online_cpus(); + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + put_online_cpus(); + } +} + +static void __reenable_swap_slots_cache(void) +{ + swap_slot_cache_enabled = has_usable_swap(); +} + +void reenable_swap_slots_cache_unlock(void) +{ + __reenable_swap_slots_cache(); + mutex_unlock(&swap_slots_cache_enable_mutex); +} + +static bool check_cache_active(void) +{ + long pages; + + if (!swap_slot_cache_enabled || !swap_slot_cache_initialized) + return false; + + pages = get_nr_swap_pages(); + if (!swap_slot_cache_active) { + if (pages > num_online_cpus() * + THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) + reactivate_swap_slots_cache(); + goto out; + } + + /* if global pool of slot caches too low, deactivate cache */ + if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) + deactivate_swap_slots_cache(); +out: + return swap_slot_cache_active; +} + +static int alloc_swap_slot_cache(unsigned int cpu) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots, *slots_ret; + + /* + * Do allocation outside swap_slots_cache_mutex + * as vzalloc could trigger reclaim and get_swap_page, + * which can lock swap_slots_cache_mutex. 
+ */ + slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots) + return -ENOMEM; + + slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots_ret) { + vfree(slots); + return -ENOMEM; + } + + mutex_lock(&swap_slots_cache_mutex); + cache = &per_cpu(swp_slots, cpu); + if (cache->slots || cache->slots_ret) + /* cache already allocated */ + goto out; + if (!cache->lock_initialized) { + mutex_init(&cache->alloc_lock); + spin_lock_init(&cache->free_lock); + cache->lock_initialized = true; + } + cache->nr = 0; + cache->cur = 0; + cache->n_ret = 0; + cache->slots = slots; + slots = NULL; + cache->slots_ret = slots_ret; + slots_ret = NULL; +out: + mutex_unlock(&swap_slots_cache_mutex); + if (slots) + vfree(slots); + if (slots_ret) + vfree(slots_ret); + return 0; +} + +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots = NULL; + + cache = &per_cpu(swp_slots, cpu); + if ((type & SLOTS_CACHE) && cache->slots) { + mutex_lock(&cache->alloc_lock); + swapcache_free_entries(cache->slots + cache->cur, cache->nr); + cache->cur = 0; + cache->nr = 0; + if (free_slots && cache->slots) { + vfree(cache->slots); + cache->slots = NULL; + } + mutex_unlock(&cache->alloc_lock); + } + if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + if (free_slots && cache->slots_ret) { + slots = cache->slots_ret; + cache->slots_ret = NULL; + } + spin_unlock_irq(&cache->free_lock); + if (slots) + vfree(slots); + } +} + +static void __drain_swap_slots_cache(unsigned int type) +{ + unsigned int cpu; + + /* + * This function is called during + * 1) swapoff, when we have to make sure no + * left over slots are in cache when we remove + * a swap device; + * 2) disabling of swap slot cache, when we run low + * on swap slots when allocating memory and need + * to return swap slots to global pool. + * + * We cannot acquire cpu hot plug lock here as + * this function can be invoked in the cpu + * hot plug path: + * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback + * -> memory allocation -> direct reclaim -> get_swap_page + * -> drain_swap_slots_cache + * + * Hence the loop over current online cpu below could miss cpu that + * is being brought online but not yet marked as online. + * That is okay as we do not schedule and run anything on a + * cpu before it has been marked online. Hence, we will not + * fill any swap slots in slots cache of such cpu. + * There are no slots on such cpu that need to be drained. 
+ */ + for_each_online_cpu(cpu) + drain_slots_cache_cpu(cpu, type, false); +} + +static int free_slot_cache(unsigned int cpu) +{ + mutex_lock(&swap_slots_cache_mutex); + drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + mutex_unlock(&swap_slots_cache_mutex); + return 0; +} + +int enable_swap_slots_cache(void) +{ + int ret = 0; + + mutex_lock(&swap_slots_cache_enable_mutex); + if (swap_slot_cache_initialized) { + __reenable_swap_slots_cache(); + goto out_unlock; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", + alloc_swap_slot_cache, free_slot_cache); + if (ret < 0) + goto out_unlock; + swap_slot_cache_initialized = true; + __reenable_swap_slots_cache(); +out_unlock: + mutex_unlock(&swap_slots_cache_enable_mutex); + return 0; +} + +/* called with swap slot cache's alloc lock held */ +static int refill_swap_slots_cache(struct swap_slots_cache *cache) +{ + if (!use_swap_slot_cache || cache->nr) + return 0; + + cache->cur = 0; + if (swap_slot_cache_active) + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + + return cache->nr; +} + +int free_swap_slot(swp_entry_t entry) +{ + struct swap_slots_cache *cache; + + BUG_ON(!swap_slot_cache_initialized); + + cache = &get_cpu_var(swp_slots); + if (use_swap_slot_cache && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + /* Swap slots cache may be deactivated before acquiring lock */ + if (!use_swap_slot_cache) { + spin_unlock_irq(&cache->free_lock); + goto direct_free; + } + if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { + /* + * Return slots to global pool. + * The current swap_map value is SWAP_HAS_CACHE. + * Set it to 0 to indicate it is available for + * allocation in global pool + */ + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + } + cache->slots_ret[cache->n_ret++] = entry; + spin_unlock_irq(&cache->free_lock); + } else { +direct_free: + swapcache_free_entries(&entry, 1); + } + put_cpu_var(swp_slots); + + return 0; +} + +swp_entry_t get_swap_page(void) +{ + swp_entry_t entry, *pentry; + struct swap_slots_cache *cache; + + /* + * Preemption is allowed here, because we may sleep + * in refill_swap_slots_cache(). But it is safe, because + * accesses to the per-CPU data structure are protected by the + * mutex cache->alloc_lock. + * + * The alloc path here does not touch cache->slots_ret + * so cache->free_lock is not taken. 
+ */ + cache = raw_cpu_ptr(&swp_slots); + + entry.val = 0; + if (check_cache_active()) { + mutex_lock(&cache->alloc_lock); + if (cache->slots) { +repeat: + if (cache->nr) { + pentry = &cache->slots[cache->cur++]; + entry = *pentry; + pentry->val = 0; + cache->nr--; + } else { + if (refill_swap_slots_cache(cache)) + goto repeat; + } + } + mutex_unlock(&cache->alloc_lock); + if (entry.val) + return entry; + } + + get_swap_pages(1, &entry); + + return entry; +} + +#endif /* CONFIG_SWAP */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 3d76d80c07d6..e1f07cafecaa 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index 8b5bd34b1a00..30a90fd140b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -854,14 +855,6 @@ noswap: return n_ret; } -swp_entry_t get_swap_page(void) -{ - swp_entry_t entry; - - get_swap_pages(1, &entry); - return entry; -} - /* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { @@ -1052,7 +1045,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) { if (!__swap_entry_free(p, entry, 1)) - swapcache_free_entries(&entry, 1); + free_swap_slot(entry); } } @@ -1066,7 +1059,7 @@ void swapcache_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) { if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) - swapcache_free_entries(&entry, 1); + free_swap_slot(entry); } } @@ -1288,7 +1281,7 @@ int free_swap_and_cache(swp_entry_t entry) page = NULL; } } else if (!count) - swapcache_free_entries(&entry, 1); + free_swap_slot(entry); } if (page) { /* @@ -2116,6 +2109,17 @@ static void reinsert_swap_info(struct swap_info_struct *p) spin_unlock(&swap_lock); } +bool has_usable_swap(void) +{ + bool ret = true; + + spin_lock(&swap_lock); + if (plist_head_empty(&swap_active_head)) + ret = false; + spin_unlock(&swap_lock); + return ret; +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; -- cgit From 039939a65059852242c823ece685579370bc574f Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 22 Feb 2017 15:45:43 -0800 Subject: mm/swap: enable swap slots cache usage Initialize swap slots cache and enable it on swap on. Drain all swap slots on swap off. Link: http://lkml.kernel.org/r/07cbc94882fa95d4ac3cfc50b8dce0b1ec231b93.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Cc: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 30a90fd140b7..2cac12cc9abe 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2190,6 +2190,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&p->lock); spin_unlock(&swap_lock); + disable_swap_slots_cache_lock(); + set_current_oom_origin(); err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); @@ -2197,9 +2199,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); + reenable_swap_slots_cache_unlock(); goto out_dput; } + reenable_swap_slots_cache_unlock(); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -2886,6 +2891,8 @@ out: putname(name); if (inode && S_ISREG(inode->i_mode)) inode_unlock(inode); + if (!error) + enable_swap_slots_cache(); return error; } -- cgit From ba81f83842549871cbd7226fc11530dc464500bb Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 22 Feb 2017 15:45:46 -0800 Subject: mm/swap: skip readahead only when swap slot cache is enabled Because during swap off, a swap entry may have swap_map[] == SWAP_HAS_CACHE (for example, just allocated). If we return NULL in __read_swap_cache_async(), the swap off will abort. So when swap slot cache is disabled, (for swap off), we will wait for page to be put into swap cache in such race condition. This should not be a problem for swap slot cache, because swap slot cache should be drained after clearing swap_slot_cache_enabled. [ying.huang@intel.com: fix memory leak in __read_swap_cache_async()] Link: http://lkml.kernel.org/r/874lzt6znd.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/5e2c5f6abe8e6eb0797408897b1bba80938e9b9d.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap_slots.h | 2 ++ mm/swap_slots.c | 2 +- mm/swap_state.c | 13 ++++++++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index ba5623b27c60..6ef92d17633d 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -25,4 +25,6 @@ void reenable_swap_slots_cache_unlock(void); int enable_swap_slots_cache(void); int free_swap_slot(swp_entry_t entry); +extern bool swap_slot_cache_enabled; + #endif /* _LINUX_SWAP_SLOTS_H */ diff --git a/mm/swap_slots.c b/mm/swap_slots.c index ebf4f1cbac04..9b5bc86f96ad 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -36,7 +36,7 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; -static bool swap_slot_cache_enabled; +bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ diff --git a/mm/swap_state.c b/mm/swap_state.c index e1f07cafecaa..473b71e052a8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -324,9 +324,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (found_page) break; - /* Just skip read ahead for unused swap slot */ - if (!__swp_swapcount(entry)) - return NULL; + /* + * Just skip read ahead for unused swap slot. + * During swap_off when swap_slot_cache is disabled, + * we have to handle the race between putting + * swap entry in swap cache and marking swap slot + * as SWAP_HAS_CACHE. That's done in later part of code or + * else swap_off will be aborted if we return NULL. + */ + if (!__swp_swapcount(entry) && swap_slot_cache_enabled) + break; /* * Get a new page to read into from swap. -- cgit From 21440d7eb9044001b7fdb71d0163689f60a0f2a1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Feb 2017 15:45:49 -0800 Subject: mm, thp: add new defer+madvise defrag option There is no thp defrag option that currently allows MADV_HUGEPAGE regions to do direct compaction and reclaim while all other thp allocations simply trigger kswapd and kcompactd in the background and fail immediately. The "defer" setting simply triggers background reclaim and compaction for all regions, regardless of MADV_HUGEPAGE, which makes it unusable for our userspace where MADV_HUGEPAGE is being used to indicate the application is willing to wait for work for thp memory to be available. The "madvise" setting will do direct compaction and reclaim for these MADV_HUGEPAGE regions, but does not trigger kswapd and kcompactd in the background for anybody else. For reasonable usage, there needs to be a mesh between the two options. This patch introduces a fifth mode, "defer+madvise", that will do direct reclaim and compaction for MADV_HUGEPAGE regions and trigger background reclaim and compaction for everybody else so that hugepages may be available in the near future. A proposal to allow direct reclaim and compaction for MADV_HUGEPAGE regions as part of the "defer" mode, making it a very powerful setting and avoids breaking userspace, was offered: http://marc.info/?t=148236612700003 This additional mode is a compromise. A second proposal to allow both "defer" and "madvise" to be selected at the same time was also offered: http://marc.info/?t=148357345300001. 
This is possible, but there was a concern that it might break existing userspaces the parse the output of the defrag mode, so the fifth option was introduced instead. This patch also cleans up the helper function for storing to "enabled" and "defrag" since the former supports three modes while the latter supports five and triple_flag_store() was getting unnecessarily messy. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701101614330.41805@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 8 ++- include/linux/huge_mm.h | 1 + mm/huge_memory.c | 146 +++++++++++++++++++++-------------------- 3 files changed, 82 insertions(+), 73 deletions(-) diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index c4171e4519c2..8fda5b7b24e9 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -110,6 +110,7 @@ MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag echo defer >/sys/kernel/mm/transparent_hugepage/defrag +echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag @@ -120,10 +121,15 @@ that benefit heavily from THP use and are willing to delay the VM start to utilise them. "defer" means that an application will wake kswapd in the background -to reclaim pages and wake kcompact to compact memory so that THP is +to reclaim pages and wake kcompactd to compact memory so that THP is available in the near future. It's the responsibility of khugepaged to then install the THP pages later. +"defer+madvise" will enter direct reclaim and compaction like "always", but +only for regions that have used madvise(MADV_HUGEPAGE); all other regions +will wake kswapd in the background to reclaim pages and wake kcompactd to +compact memory so that THP is available in the near future. + "madvise" will enter direct reclaim like "always" but only for regions that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. 
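As a usage illustration only (a minimal sketch, not part of this patch set): the new mode can be selected from a C program instead of the shell. The only things assumed are the sysfs path and the "defer+madvise" keyword documented above, and root privileges are needed for the write to succeed.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path and keyword are taken from the transhuge.txt hunk above. */
	const char *path = "/sys/kernel/mm/transparent_hugepage/defrag";
	const char *mode = "defer+madvise";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mode, strlen(mode)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Writing any of the other documented keywords ("always", "defer", "madvise", "never") to the same file works the same way.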
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 97e478d6b690..f0029e786205 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -33,6 +33,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f3ad65c85de..f9ecc2aeadfc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = { }; #ifdef CONFIG_SYSFS - -static ssize_t triple_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag deferred, - enum transparent_hugepage_flag req_madv) -{ - if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { - if (enabled == deferred) - return -EINVAL; - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(deferred, &transparent_hugepage_flags); - } else if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { - clear_bit(deferred, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(enabled, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - set_bit(req_madv, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - } else - return -EINVAL; - - return count; -} - static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret; + ssize_t ret = count; - ret = triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } - return ret; } static struct kobj_attribute enabled_attr = @@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } -/* - * Currently defrag only disables __GFP_NOWAIT for allocation. A blind - * __GFP_REPEAT is too aggressive, it's never worth swapping tons of - * memory just to allocate one more hugepage. 
- */ static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer madvise never\n"); + return sprintf(buf, "[always] defer defer+madvise madvise never\n"); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [madvise] never\n"); - else - return sprintf(buf, "always defer madvise [never]\n"); - + return sprintf(buf, "always [defer] defer+madvise madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer [defer+madvise] madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer defer+madvise [madvise] never\n"); + return sprintf(buf, "always defer defer+madvise madvise [never]\n"); } + static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer+madvise", buf, + min(sizeof("defer+madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; } 
static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); @@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } /* - * If THP defrag is set to always then directly reclaim/compact as necessary - * If set to defer then do only background reclaim/compact and defer to khugepaged - * If set to madvise and the VMA is flagged then directly reclaim/compact - * When direct reclaim/compact is allowed, don't retry except for flagged VMA's + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags) && vma_madvised) - return GFP_TRANSHUGE; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + 0); return GFP_TRANSHUGE_LIGHT; } -- cgit From bc71226b0690c21bdf50c9c9d08c5dc9ef98764e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 22 Feb 2017 15:45:52 -0800 Subject: mm/backing-dev.c: use rb_entry() To make the code clearer, use rb_entry() instead of container_of() to deal with rbtree. Link: http://lkml.kernel.org/r/671275de093d93ddc7c6f77ddc0d357149691a39.1484306840.git.geliangtang@gmail.com Signed-off-by: Geliang Tang Cc: Jens Axboe Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 39ce616a9d71..6d861d090e9f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -411,8 +411,8 @@ retry: while (*node != NULL) { parent = *node; - congested = container_of(parent, struct bdi_writeback_congested, - rb_node); + congested = rb_entry(parent, struct bdi_writeback_congested, + rb_node); if (congested->blkcg_id < blkcg_id) node = &parent->rb_left; else if (congested->blkcg_id > blkcg_id) -- cgit From f0958906cd2bf3730cd7938b8af80a1c23e8ac06 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:45:55 -0800 Subject: mm, vmscan: do not count freed pages as PGDEACTIVATE PGDEACTIVATE represents the number of pages moved from the active list to the inactive list. At least this sounds like the original motivation of the counter. 
move_active_pages_to_lru, however, counts pages which got freed in the mean time as deactivated as well. This is a very rare event and counting them as deactivation in itself is not harmful but it makes the code more convoluted than necessary - we have to count both all pages and those which are freed which is a bit confusing. After this patch the PGDEACTIVATE should have a slightly more clear semantic and only count those pages which are moved from the active to the inactive list which is a plus. Link: http://lkml.kernel.org/r/20170112211221.17636-1-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index de400d1eac0e..277e105646a5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1878,7 +1878,6 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, enum lru_list lru) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long pgmoved = 0; struct page *page; int nr_pages; int nr_moved = 0; @@ -1893,7 +1892,6 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, nr_pages = hpage_nr_pages(page); update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); - pgmoved += nr_pages; if (put_page_testzero(page)) { __ClearPageLRU(page); @@ -1913,7 +1911,7 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } if (!is_active_lru(lru)) - __count_vm_events(PGDEACTIVATE, pgmoved); + __count_vm_events(PGDEACTIVATE, nr_moved); return nr_moved; } -- cgit From fd538803731e50367b7c59ce4ad3454426a3d671 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:45:58 -0800 Subject: mm, vmscan: cleanup lru size claculations lruvec_lru_size returns the full size of the LRU list while we sometimes need a value reduced only to eligible zones (e.g. for lowmem requests). inactive_list_is_low is one such user. Later patches will add more of them. Add a new parameter to lruvec_lru_size and allow it filter out zones which are not eligible for the given context. 
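The arithmetic behind the new zone_idx parameter can be shown outside the kernel structures: start from the whole-node LRU count and subtract the contribution of every zone above the allowed index. The sketch below is only an illustration under assumed names (per_zone, node_total and eligible_lru_size are hypothetical, not kernel symbols):

#include <stdio.h>

#define MAX_NR_ZONES 4

/* Subtract the LRU pages of zones that are not eligible (zid > zone_idx). */
static unsigned long eligible_lru_size(const unsigned long per_zone[MAX_NR_ZONES],
				       unsigned long node_total, int zone_idx)
{
	int zid;

	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		unsigned long size = per_zone[zid];

		/* clamp like min(size, lru_size) so the count never underflows */
		node_total -= size < node_total ? size : node_total;
	}
	return node_total;
}

int main(void)
{
	/* e.g. DMA, DMA32, NORMAL, HIGHMEM page counts for one LRU list */
	unsigned long per_zone[MAX_NR_ZONES] = { 10, 200, 5000, 80000 };

	/* zone_idx == 2: the HIGHMEM contribution (80000) is filtered out */
	printf("%lu\n", eligible_lru_size(per_zone, 85210, 2));
	return 0;
}

In the patch itself this subtraction happens inside lruvec_lru_size() under the new zone_idx argument, and callers that want the whole list pass MAX_NR_ZONES so that nothing is filtered.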
Link: http://lkml.kernel.org/r/20170117103702.28542-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Minchan Kim Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/vmscan.c | 89 +++++++++++++++++++++++++------------------------- mm/workingset.c | 2 +- 3 files changed, 46 insertions(+), 47 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f4aac87adcc3..82fc632fd11d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -779,7 +779,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru); +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); diff --git a/mm/vmscan.c b/mm/vmscan.c index 277e105646a5..e7c75a3c6b53 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -234,22 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat) pgdat_reclaimable_pages(pgdat) * 6; } -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) +/** + * lruvec_lru_size - Returns the number of pages on the given LRU list. + * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + */ +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { + unsigned long lru_size; + int zid; + if (!mem_cgroup_disabled()) - return mem_cgroup_get_lru_size(lruvec, lru); + lru_size = mem_cgroup_get_lru_size(lruvec, lru); + else + lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); - return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); -} + for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + unsigned long size; -unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) -{ - if (!mem_cgroup_disabled()) - return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + if (!managed_zone(zone)) + continue; + + if (!mem_cgroup_disabled()) + size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + else + size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], + NR_ZONE_LRU_BASE + lru); + lru_size -= min(size, lru_size); + } + + return lru_size; - return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], - NR_ZONE_LRU_BASE + lru); } /* @@ -2049,11 +2066,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, struct scan_control *sc, bool trace) { unsigned long inactive_ratio; - unsigned long total_inactive, inactive; - unsigned long total_active, active; + unsigned long inactive, active; + enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; unsigned long gb; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - int zid; /* * If we don't have swap space, anonymous page deactivation @@ -2062,27 +2078,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); - - /* - * For zone-constrained allocations, it is necessary to check if - * deactivations are required for lowmem to be reclaimed. 
This - * calculates the inactive/active pages available in eligible zones. - */ - for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { - struct zone *zone = &pgdat->node_zones[zid]; - unsigned long inactive_zone, active_zone; - - if (!managed_zone(zone)) - continue; - - inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); - active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); - - inactive -= min(inactive, inactive_zone); - active -= min(active, active_zone); - } + inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); + active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -2091,10 +2088,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive_ratio = 1; if (trace) - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, + trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, sc->reclaim_idx, - total_inactive, inactive, - total_active, active, inactive_ratio, file); + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); + return inactive * inactive_ratio < active; } @@ -2234,7 +2233,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. */ if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2260,10 +2259,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2301,7 +2300,7 @@ out: unsigned long size; unsigned long scan; - size = lruvec_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES); scan = size >> sc->priority; if (!scan && pass && force_scan) diff --git a/mm/workingset.c b/mm/workingset.c index abb58ffa3c64..a67f5796b995 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow) } lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); rcu_read_unlock(); /* -- cgit From 71ab6cfe88dcf9f6e6a65eb85cf2bda20a257682 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:01 -0800 Subject: mm, vmscan: consider eligible zones in get_scan_count get_scan_count() considers the whole node LRU size when - doing SCAN_FILE due to many page cache inactive pages - calculating the number of pages to scan In both cases this might lead to unexpected behavior especially on 32b systems where we can expect lowmem memory pressure very often. 
A large highmem zone can easily distort SCAN_FILE heuristic because there might be only few file pages from the eligible zones on the node lru and we would still enforce file lru scanning which can lead to trashing while we could still scan anonymous pages. The later use of lruvec_lru_size can be problematic as well. Especially when there are not many pages from the eligible zones. We would have to skip over many pages to find anything to reclaim but shrink_node_memcg would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large LRU many times without actually having chance to reclaim much if anything at all. The closer we are out of memory on lowmem zone the worse the problem will be. Fix this by filtering out all the ineligible zones when calculating the lru size for both paths and consider only sc->reclaim_idx zones. The patch would need to be tweaked a bit to apply to 4.10 and older but I will do that as soon as it hits the Linus tree in the next merge window. Link: http://lkml.kernel.org/r/20170117103702.28542-3-mhocko@kernel.org Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis") Signed-off-by: Michal Hocko Tested-by: Trevor Cordes Acked-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e7c75a3c6b53..861ec1431f25 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2233,7 +2233,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. */ if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2300,7 +2300,7 @@ out: unsigned long size; unsigned long scan; - size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES); + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); scan = size >> sc->priority; if (!scan && pass && force_scan) -- cgit From abd6e8a7ac49807102652861f69583944752e297 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:04 -0800 Subject: Revert "mm: bail out in shrink_inactive_list()" This reverts commit 91dcade47a3d0e7. inactive_reclaimable_pages shouldn't be needed anymore since that get_scan_count is aware of the eligble zones ("mm, vmscan: consider eligible zones in get_scan_count"). 
Link: http://lkml.kernel.org/r/20170117103702.28542-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 861ec1431f25..7bb23ff229b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1701,30 +1701,6 @@ static int current_may_throttle(void) bdi_write_congested(current->backing_dev_info); } -static bool inactive_reclaimable_pages(struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) -{ - int zid; - struct zone *zone; - int file = is_file_lru(lru); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - if (!global_reclaim(sc)) - return true; - - for (zid = sc->reclaim_idx; zid >= 0; zid--) { - zone = &pgdat->node_zones[zid]; - if (!managed_zone(zone)) - continue; - - if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + - LRU_FILE * file) >= SWAP_CLUSTER_MAX) - return true; - } - - return false; -} - /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages @@ -1743,9 +1719,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - if (!inactive_reclaimable_pages(lruvec, sc, lru)) - return 0; - while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); -- cgit From c02e50bb8a55a7adeeca5e411479ed70c6a2dfa1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:07 -0800 Subject: mm, page_alloc: do not report all nodes in show_mem Patch series "show_mem updates", v2. This is a mixture of one bug fix (patch 1), an enhancement (patch 2) and cleanups (the rest of the series). First two patches should be really straightforward. Patch 3 removes some arch specific show_mem implementations because I think they are quite outdated and do not really serve any useful purpose anymore. I think we should really strive to have a consistent show_mem output regardless of the architecture. If some architecture is really special and wants to dump something additional we should do that via an arch specific hook. The last patch adds nodemask parameter so that we do not rely on the hardcoded mems_allowed of the current task when doing the node filtering. I consider this more a cleanup than a fix because basically all users use a nodemask which is a subset of mems_allowed. There is only one call path in the memory hotplug which doesn't comply with this but that is hardly something to worry about. This patch (of 4): Commit 599d0c954f91 ("mm, vmscan: move LRU lists to node") has added per numa node statistics to show_mem but it forgot to add skip_free_areas_node to filter out nodes which are outside of the allocating task numa policy. Add this check to not pollute the output with the pointless information. 
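The shape of the fix is a single skip test at the top of the per-node reporting loop. Below is a toy user-space version of that pattern; skip_node() and the bitmask policy are hypothetical stand-ins for skip_free_areas_node() and the task's mems_allowed, used only to illustrate the control flow:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical stand-in for skip_free_areas_node(): filter by a node mask. */
static bool skip_node(unsigned long allowed_mask, int nid)
{
	return !(allowed_mask & (1UL << nid));
}

static void show_per_node_stats(unsigned long allowed_mask)
{
	int nid;

	for (nid = 0; nid < NR_NODES; nid++) {
		if (skip_node(allowed_mask, nid))
			continue;	/* node is outside the allocating task's policy */
		printf("Node %d: ...\n", nid);
	}
}

int main(void)
{
	show_per_node_stats(0x5);	/* only nodes 0 and 2 are reported */
	return 0;
}

The kernel change amounts to the same two lines at the top of the for_each_online_pgdat() loop in show_free_areas(): the skip_free_areas_node() test and the continue.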
Link: http://lkml.kernel.org/r/20170117091543.25850-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: David Rientjes Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6da3169d3750..cf6b53dc08f9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4368,6 +4368,9 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { + if (skip_free_areas_node(filter, pgdat->node_id)) + continue; + printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" -- cgit From a8e99259e7e32b67af2b447f0a570813c0c283ec Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:10 -0800 Subject: mm, page_alloc: warn_alloc print nodemask warn_alloc is currently used for to report an allocation failure or an allocation stall. We print some details of the allocation request like the gfp mask and the request order. We do not print the allocation nodemask which is important when debugging the reason for the allocation failure as well. We alreaddy print the nodemask in the OOM report. Add nodemask to warn_alloc and print it in warn_alloc as well. Link: http://lkml.kernel.org/r/20170117091543.25850-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Johannes Weiner Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/page_alloc.c | 10 ++++++---- mm/vmalloc.c | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dae6f58d67c8..28b6c3f8a7f3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1942,8 +1942,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid); extern unsigned long arch_reserved_kernel_pages(void); #endif -extern __printf(2, 3) -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...); +extern __printf(3, 4) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf6b53dc08f9..96c8fe602dfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3028,12 +3028,13 @@ static void warn_alloc_show_mem(gfp_t gfp_mask) show_mem(filter); } -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) { struct va_format vaf; va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -3047,7 +3048,8 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm)); + cpuset_print_current_mems_allowed(); dump_stack(); warn_alloc_show_mem(gfp_mask); @@ -3724,7 +3726,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3775,7 +3777,7 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5f5b09e9dccd..d89034a393f2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); @@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } -- cgit From 6d23f8a5d432337aa2590ea8fd5eee8b0bc28eee Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:13 -0800 Subject: arch, mm: remove arch specific show_mem We have a generic implementation for quite some time already. If there is any arch specific information to be printed then we should add a callback called from the generic code rather than duplicate the whole show_mem. The current code has resulted in the code duplication and the output divergence which is both confusing and adds maintainance costs. Let's just get rid of this mess. Link: http://lkml.kernel.org/r/20170117091543.25850-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Guan Xuetao [UniCore32] Acked-by: Helge Deller [for parisc] Acked-by: Chris Metcalf [for tile] Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Tony Luck Cc: Fenghua Yu Cc: "James E.J. Bottomley" Cc: "David S. Miller" Cc: Hillf Danton Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 48 ----------------------------------------------- arch/parisc/mm/init.c | 49 ------------------------------------------------ arch/sparc/mm/init_32.c | 11 ----------- arch/tile/mm/pgtable.c | 45 -------------------------------------------- arch/unicore32/mm/init.c | 44 ------------------------------------------- 5 files changed, 197 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index bb4610faca84..06cdaef54b2e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -684,51 +684,3 @@ int arch_remove_memory(u64 start, u64 size) } #endif #endif - -/** - * show_mem - give short summary of memory stats - * - * Shows a simple page count of reserved and used pages in the system. - * For discontig machines, it does this on a per-pgdat basis. 
- */ -void show_mem(unsigned int filter) -{ - int total_reserved = 0; - unsigned long total_present = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - printk(KERN_INFO "Node memory in pages:\n"); - for_each_online_pgdat(pgdat) { - unsigned long present; - unsigned long flags; - int reserved = 0; - int nid = pgdat->node_id; - int zoneid; - - if (skip_free_areas_node(filter, nid)) - continue; - pgdat_resize_lock(pgdat, &flags); - - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - struct zone *zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - reserved += zone->present_pages - zone->managed_pages; - } - present = pgdat->node_present_pages; - - pgdat_resize_unlock(pgdat, &flags); - total_present += present; - total_reserved += reserved; - printk(KERN_INFO "Node %4d: RAM: %11ld, rsvd: %8d, ", - nid, present, reserved); - } - printk(KERN_INFO "%ld pages of RAM\n", total_present); - printk(KERN_INFO "%d reserved pages\n", total_reserved); - printk(KERN_INFO "Total of %ld pages in page table cache\n", - quicklist_total_size()); - printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); -} diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index a055e5b6b380..66f3a6345105 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -653,55 +653,6 @@ void __init mem_init(void) unsigned long *empty_zero_page __read_mostly; EXPORT_SYMBOL(empty_zero_page); -void show_mem(unsigned int filter) -{ - int total = 0,reserved = 0; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(filter); - - for_each_online_pgdat(pgdat) { - unsigned long flags; - int zoneid; - - pgdat_resize_lock(pgdat, &flags); - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - struct zone *zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - total += zone->present_pages; - reserved = zone->present_pages - zone->managed_pages; - } - pgdat_resize_unlock(pgdat, &flags); - } - - printk(KERN_INFO "%d pages of RAM\n", total); - printk(KERN_INFO "%d reserved pages\n", reserved); - -#ifdef CONFIG_DISCONTIGMEM - { - struct zonelist *zl; - int i, j; - - for (i = 0; i < npmem_ranges; i++) { - zl = node_zonelist(i, 0); - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zoneref *z; - struct zone *zone; - - printk("Zone list for zone %d on node %d: ", j, i); - for_each_zone_zonelist(zone, z, zl, j) - printk("[%d/%s] ", zone_to_nid(zone), - zone->name); - printk("\n"); - } - } - } -#endif -} - /* * pagetable_init() sets up the page tables * diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index eb8287155279..c6afe98de4d9 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -55,17 +55,6 @@ extern unsigned int sparc_ramdisk_size; unsigned long highstart_pfn, highend_pfn; -void show_mem(unsigned int filter) -{ - printk("Mem-info:\n"); - show_free_areas(filter); - printk("Free swap: %6ldkB\n", - get_nr_swap_pages() << (PAGE_SHIFT-10)); - printk("%ld pages of RAM\n", totalram_pages); - printk("%ld free pages\n", nr_free_pages()); -} - - unsigned long last_valid_pfn; unsigned long calc_highpages(void) diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 7cc6ee7f1a58..492a7361e58e 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -36,51 +36,6 @@ #define K(x) ((x) << (PAGE_SHIFT-10)) -/* - * The normal show_free_areas() is too verbose on Tile, with dozens - * of processors and often four NUMA zones each with high and lowmem. 
- */ -void show_mem(unsigned int filter) -{ - struct zone *zone; - - pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n", - (global_node_page_state(NR_ACTIVE_ANON) + - global_node_page_state(NR_ACTIVE_FILE)), - (global_node_page_state(NR_INACTIVE_ANON) + - global_node_page_state(NR_INACTIVE_FILE)), - global_node_page_state(NR_FILE_DIRTY), - global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_FREE_PAGES), - (global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_node_page_state(NR_FILE_MAPPED), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_node_page_state(NR_FILE_PAGES), - get_nr_swap_pages()); - - for_each_zone(zone) { - unsigned long flags, order, total = 0, largest_order = -1; - - if (!populated_zone(zone)) - continue; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - int nr = zone->free_area[order].nr_free; - total += nr << order; - if (nr) - largest_order = order; - } - spin_unlock_irqrestore(&zone->lock, flags); - pr_err("Node %d %7s: %lukB (largest %luKb)\n", - zone_to_nid(zone), zone->name, - K(total), largest_order ? K(1UL) << largest_order : 0); - } -} - /** * shatter_huge_page() - ensure a given address is mapped by a small page. * diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index be2bde9b07cf..f4950fbfe574 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -57,50 +57,6 @@ early_param("initrd", early_initrd); */ struct meminfo meminfo; -void show_mem(unsigned int filter) -{ - int free = 0, total = 0, reserved = 0; - int shared = 0, cached = 0, slab = 0, i; - struct meminfo *mi = &meminfo; - - printk(KERN_DEFAULT "Mem-info:\n"); - show_free_areas(filter); - - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - unsigned int pfn1, pfn2; - struct page *page, *end; - - pfn1 = bank_pfn_start(bank); - pfn2 = bank_pfn_end(bank); - - page = pfn_to_page(pfn1); - end = pfn_to_page(pfn2 - 1) + 1; - - do { - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (PageSlab(page)) - slab++; - else if (!page_count(page)) - free++; - else - shared += page_count(page) - 1; - page++; - } while (page < end); - } - - printk(KERN_DEFAULT "%d pages of RAM\n", total); - printk(KERN_DEFAULT "%d free pages\n", free); - printk(KERN_DEFAULT "%d reserved pages\n", reserved); - printk(KERN_DEFAULT "%d slab pages\n", slab); - printk(KERN_DEFAULT "%d pages shared\n", shared); - printk(KERN_DEFAULT "%d pages swap cached\n", cached); -} - static void __init find_limits(unsigned long *min, unsigned long *max_low, unsigned long *max_high) { -- cgit From 9af744d743170b5f5ef70031dea8d772d166ab28 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:16 -0800 Subject: lib/show_mem.c: teach show_mem to work with the given nodemask show_mem() allows to filter out node specific data which is irrelevant to the allocation request via SHOW_MEM_FILTER_NODES. The filtering is done in skip_free_areas_node which skips all nodes which are not in the mems_allowed of the current process. This works most of the time as expected because the nodemask shouldn't be outside of the allocating task but there are some exceptions. E.g. memory hotplug might want to request allocations from outside of the allowed nodes (see new_node_page). 
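To make the intended rule concrete before the fix itself is described, the skip decision this series moves to can be modeled in a few lines of plain C. This is only an illustrative userspace sketch under simplified assumptions: the *_model names and the plain-bitmask nodemask are stand-ins invented here, not kernel APIs; the real helper added by this patch is show_mem_node_skip() in mm/page_alloc.c, visible in the diff below.

        /*
         * Userspace model of the node filtering rule: a NULL nodemask falls
         * back to the current task's cpuset, an explicit nodemask wins.
         * All *_model identifiers are hypothetical stand-ins for kernel types.
         */
        #include <stdbool.h>
        #include <stdio.h>

        #define SHOW_MEM_FILTER_NODES 0x0001u

        typedef unsigned long nodemask_model_t;        /* stand-in for nodemask_t */

        /* pretend the current cpuset allows nodes 0 and 1 only */
        static nodemask_model_t current_mems_allowed_model = 0x3;

        static bool node_allowed(nodemask_model_t mask, int nid)
        {
                return mask & (1UL << nid);
        }

        static bool show_mem_node_skip_model(unsigned int flags, int nid,
                                             const nodemask_model_t *nodemask)
        {
                if (!(flags & SHOW_MEM_FILTER_NODES))
                        return false;
                /* NULL means "use the current task's cpuset", as before */
                if (!nodemask)
                        nodemask = &current_mems_allowed_model;
                return !node_allowed(*nodemask, nid);
        }

        int main(void)
        {
                /* hotplug-style allocation targeted at node 2 only */
                nodemask_model_t hotplug_mask = 0x4;

                /* old behavior: node 2 is skipped because the cpuset is nodes 0-1 */
                printf("skip node 2, NULL mask:   %d\n",
                       show_mem_node_skip_model(SHOW_MEM_FILTER_NODES, 2, NULL));
                /* new behavior: the caller's nodemask decides, node 2 is reported */
                printf("skip node 2, caller mask: %d\n",
                       show_mem_node_skip_model(SHOW_MEM_FILTER_NODES, 2, &hotplug_mask));
                return 0;
        }

With a NULL nodemask the model reproduces the previous hardcoded behavior; with an explicit nodemask (the memory hotplug case mentioned above) the caller's mask decides which nodes show_mem() reports.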
Get rid of this hardcoded behavior and push the allocation mask down the show_mem path and use it instead of cpuset_current_mems_allowed. NULL nodemask is interpreted as cpuset_current_mems_allowed. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170117091543.25850-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Cc: Hillf Danton Cc: Johannes Weiner Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/xmon/xmon.c | 2 +- arch/sparc/kernel/setup_32.c | 2 +- drivers/net/ethernet/sgi/ioc3-eth.c | 2 +- drivers/tty/sysrq.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- include/linux/mm.h | 5 ++--- lib/show_mem.c | 4 ++-- mm/nommu.c | 6 +++--- mm/oom_kill.c | 2 +- mm/page_alloc.c | 38 ++++++++++++++++++------------------- 10 files changed, 32 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 1be0499f5397..5720236d0266 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -916,7 +916,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0); + show_mem(0, NULL); break; default: termch = cmd; diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index c4e65cb3280f..6f06058c5ae7 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -82,7 +82,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0); + show_free_areas(0, NULL); if (!is_idle_task(current)) { local_irq_enable(); sys_sync(); diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index d390b9663dc3..57e6cef81ebe 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev) skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC); if (!skb) { - show_free_areas(0); + show_free_areas(0, NULL); continue; } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 701c085bb19b..71136742e606 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key) { - show_mem(0); + show_mem(0, NULL); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 3dd6a491cdba..397e1509fe51 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc) static void fn_show_mem(struct vc_data *vc) { - show_mem(0); + show_mem(0, NULL); } static void fn_show_state(struct vc_data *vc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 28b6c3f8a7f3..8a67cae5a07c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1152,8 +1152,7 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags); -extern bool skip_free_areas_node(unsigned int flags, int nid); +extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); int shmem_zero_setup(struct vm_area_struct *); #ifdef CONFIG_SHMEM @@ -1934,7 +1933,7 @@ extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags); +extern void show_mem(unsigned int 
flags, nodemask_t *nodemask); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 1feed6a2b12a..0beaa1d899aa 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,13 +9,13 @@ #include #include -void show_mem(unsigned int filter) +void show_mem(unsigned int filter, nodemask_t *nodemask) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter); + show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { unsigned long flags; diff --git a/mm/nommu.c b/mm/nommu.c index 24f9f5f39145..bc964c26be8c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1191,7 +1191,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } @@ -1412,13 +1412,13 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..7176b6a754cf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (oc->memcg) mem_cgroup_print_oom_info(oc->memcg, p); else - show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES, nm); if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96c8fe602dfb..644fb75f6f24 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3005,7 +3005,7 @@ static inline bool should_suppress_show_mem(void) return ret; } -static void warn_alloc_show_mem(gfp_t gfp_mask) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); @@ -3025,7 +3025,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter); + show_mem(filter, nodemask); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -3052,7 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) cpuset_print_current_mems_allowed(); dump_stack(); - warn_alloc_show_mem(gfp_mask); + warn_alloc_show_mem(gfp_mask, nm); } static inline struct page * @@ -4274,20 +4274,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. 
Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4328,7 +4328,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4336,7 +4336,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4370,7 +4370,7 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { - if (skip_free_areas_node(filter, pgdat->node_id)) + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; printk("Node %d" @@ -4422,7 +4422,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4487,7 +4487,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); -- cgit From 9a67f6488eca926f8356b2737fc9f8f6c0cbed85 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:19 -0800 Subject: mm: consolidate GFP_NOFAIL checks in the allocator slowpath Tetsuo Handa has pointed out that commit 0a0337e0d1d1 ("mm, oom: rework oom detection") has subtly changed semantic for costly high order requests with __GFP_NOFAIL and withtout __GFP_REPEAT and those can fail right now. My code inspection didn't reveal any such users in the tree but it is true that this might lead to unexpected allocation failures and subsequent OOPs. __alloc_pages_slowpath wrt. GFP_NOFAIL is hard to follow currently. There are few special cases but we are lacking a catch all place to be sure we will not miss any case where the non failing allocation might fail. This patch reorganizes the code a bit and puts all those special cases under nopage label which is the generic go-to-fail path. Non failing allocations are retried or those that cannot retry like non-sleeping allocation go to the failure point directly. This should make the code flow much easier to follow and make it less error prone for future changes. While we are there we have to move the stall check up to catch potentially looping non-failing allocations. 
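The resulting shape of the slowpath is easier to see in a compressed sketch. The following is an illustrative userspace model, not the kernel code: the *_MODEL flags, try_alloc_model() and slowpath_model() are invented stand-ins, and the reclaim, compaction and stall heuristics are reduced to a simple attempt counter. What it preserves is the structure described above: a single nopage label is the catch-all failure path, non-sleeping requests go there directly, and __GFP_NOFAIL requests loop from there instead of ever returning NULL.

        /* Control-flow model of the reorganized allocator slowpath (sketch only). */
        #include <stdbool.h>
        #include <stdio.h>

        #define GFP_NOFAIL_MODEL          0x1u
        #define GFP_DIRECT_RECLAIM_MODEL  0x2u

        struct page_model { int order; };

        /* pretend the first few attempts fail, then one succeeds */
        static struct page_model *try_alloc_model(int order, int attempt)
        {
                static struct page_model page;

                page.order = order;
                return attempt >= 3 ? &page : NULL;
        }

        static struct page_model *slowpath_model(unsigned int gfp, int order)
        {
                struct page_model *page;
                int attempt = 0;

        retry:
                page = try_alloc_model(order, attempt++);
                if (page)
                        return page;
                /* caller is not willing to reclaim: nothing to balance, bail out */
                if (!(gfp & GFP_DIRECT_RECLAIM_MODEL))
                        goto nopage;
                if (attempt < 3)
                        goto retry;

        nopage:
                /* non-failing requests never leak out of the allocator with NULL */
                if (gfp & GFP_NOFAIL_MODEL) {
                        fprintf(stderr, "nofail request: retrying (attempt %d)\n", attempt);
                        goto retry;
                }
                return NULL;    /* the failure point; warn_alloc() lives here in the kernel */
        }

        int main(void)
        {
                printf("plain request:  %s\n",
                       slowpath_model(GFP_DIRECT_RECLAIM_MODEL, 0) ? "page" : "NULL");
                printf("nofail request: %s\n",
                       slowpath_model(GFP_NOFAIL_MODEL | GFP_DIRECT_RECLAIM_MODEL, 0) ? "page" : "NULL");
                return 0;
        }

Running the model, the plain request eventually gives up and returns NULL, while the nofail request keeps retrying from the nopage path until the allocation succeeds, which is the invariant the reorganization is meant to make easy to verify by inspection.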
[akpm@linux-foundation.org: fix alloc_flags may-be-used-uninitalized] Link: http://lkml.kernel.org/r/20161220134904.21023-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 91 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 644fb75f6f24..dd36da6ffef5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3577,6 +3577,14 @@ retry_cpuset: no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or @@ -3588,14 +3596,6 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - - /* - * The fast path uses conservative alloc_flags to succeed only until - * kswapd needs to be woken up, and to avoid the cost of setting up - * alloc_flags precisely. So we do that now. - */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); - if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3672,35 +3672,21 @@ retry: goto got_pg; /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) { - /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually allow this type of allocation - * to fail. - */ - WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + if (!can_direct_reclaim) goto nopage; - } - /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) { - /* - * __GFP_NOFAIL request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us. 
- */ - if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { - cond_resched(); - goto retry; - } - goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, ac->nodemask, + "page allocation stalls for %ums, order:%u", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; } - /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); @@ -3724,14 +3710,6 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -3760,6 +3738,10 @@ retry: if (page) goto got_pg; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE)) + goto nopage; + /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { no_progress_loops = 0; @@ -3777,6 +3759,37 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE(!can_direct_reclaim)) + goto fail; + + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else. + */ + WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + + cond_resched(); + goto retry; + } +fail: warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: -- cgit From 06ad276ac18742c6b281698d41b27a290cd42407 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:22 -0800 Subject: mm, oom: do not enforce OOM killer for __GFP_NOFAIL automatically __alloc_pages_may_oom makes sure to skip the OOM killer depending on the allocation request. This includes lowmem requests, costly high order requests and others. For a long time __GFP_NOFAIL acted as an override for all those rules. This is not documented and it can be quite surprising as well. E.g. GFP_NOFS requests are not invoking the OOM killer but GFP_NOFS|__GFP_NOFAIL does so if we try to convert some of the existing open coded loops around allocator to nofail request (and we have done that in the past) then such a change would have a non trivial side effect which is far from obvious. Note that the primary motivation for skipping the OOM killer is to prevent from pre-mature invocation. 
The exception has been added by commit 82553a937f12 ("oom: invoke oom killer for __GFP_NOFAIL"). The changelog points out that the oom killer has to be invoked otherwise the request would be looping for ever. But this argument is rather weak because the OOM killer doesn't really guarantee a forward progress for those exceptional cases: - it will hardly help to form costly order which in turn can result in the system panic because of no oom killable task in the end - I believe we certainly do not want to put the system down just because there is a nasty driver asking for order-9 page with GFP_NOFAIL not realizing all the consequences. It is much better this request would loop for ever than the massive system disruption - lowmem is also highly unlikely to be freed during OOM killer - GFP_NOFS request could trigger while there is still a lot of memory pinned by filesystems. This patch simply removes the __GFP_NOFAIL special case in order to have a more clear semantic without surprising side effects. Signed-off-by: Michal Hocko Reported-by: Nils Holland Acked-by: Johannes Weiner Cc: Vlastimil Babka Cc: Hillf Danton Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- mm/page_alloc.c | 49 ++++++++++++++++++++++++------------------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7176b6a754cf..c7b48b4282d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1013,7 +1013,7 @@ bool out_of_memory(struct oom_control *oc) * make sure exclude 0 mask - all other users should have at least * ___GFP_DIRECT_RECLAIM to get here. */ - if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) return true; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd36da6ffef5..1e37740837ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3090,32 +3090,31 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - if (!(gfp_mask & __GFP_NOFAIL)) { - /* Coredumps can quickly deplete all memory reserves */ - if (current->flags & PF_DUMPCORE) - goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto out; - /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) - goto out; - if (pm_suspended_storage()) - goto out; - /* - * XXX: GFP_NOFS allocations should rather fail than rely on - * other request to make a forward progress. - * We are in an unfortunate situation where out_of_memory cannot - * do much for this context but let's try it to at least get - * access to memory reserved if the current task is killed (see - * out_of_memory). Once filesystems are ready to handle allocation - * failures more gracefully we should just bail out here. - */ + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). 
Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; -- cgit From 6c18ba7a18997dadbf7ee912e15677ad2c9993e5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 15:46:25 -0800 Subject: mm: help __GFP_NOFAIL allocations which do not trigger OOM killer Now that __GFP_NOFAIL doesn't override decisions to skip the oom killer we are left with requests which require to loop inside the allocator without invoking the oom killer (e.g. GFP_NOFS|__GFP_NOFAIL used by fs code) and so they might, in very unlikely situations, loop for ever - e.g. other parallel request could starve them. This patch tries to limit the likelihood of such a lockup by giving these __GFP_NOFAIL requests a chance to move on by consuming a small part of memory reserves. We are using ALLOC_HARDER which should be enough to prevent from the starvation by regular allocation requests, yet it shouldn't consume enough from the reserves to disrupt high priority requests (ALLOC_HIGH). While we are at it, let's introduce a helper __alloc_pages_cpuset_fallback which enforces the cpusets but allows to fallback to ignore them if the first attempt fails. __GFP_NOFAIL requests can be considered important enough to allow cpuset runaway in order for the system to move on. It is highly unlikely that any of these will be GFP_USER anyway. Link: http://lkml.kernel.org/r/20161220134904.21023-4-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Hillf Danton Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1e37740837ac..a179607de26f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3055,6 +3055,26 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
warn_alloc_show_mem(gfp_mask, nm); } +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; +} + static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -3119,17 +3139,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; - if (gfp_mask & __GFP_NOFAIL) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); - /* - * fallback to ignore cpuset restriction if our nodes - * are depleted - */ - if (!page) - page = get_page_from_freelist(gfp_mask, order, + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - } } out: mutex_unlock(&oom_lock); @@ -3785,6 +3801,16 @@ nopage: */ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + cond_resched(); goto retry; } -- cgit From 685dbf6f5a643c4bdb9323ee3544ec652505d2ea Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Feb 2017 15:46:28 -0800 Subject: mm, page_alloc: warn_alloc nodemask is NULL when cpusets are disabled The patch "mm, page_alloc: warn_alloc print nodemask" implicitly sets the allocation nodemask to cpuset_current_mems_allowed when there is no effective mempolicy. cpuset_current_mems_allowed is only effective when cpusets are enabled, which is also printed by warn_alloc(), so setting the nodemask to cpuset_current_mems_allowed is redundant and prevents debugging issues where ac->nodemask is not set properly in the page allocator. This provides better debugging output since cpuset_print_current_mems_allowed() is already provided. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701181347320.142399@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a179607de26f..c21b33668133 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3034,7 +3034,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -3048,11 +3047,16 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm)); + pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); + if (nodemask) + pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); + else + pr_cont("(null)\n"); + cpuset_print_current_mems_allowed(); dump_stack(); - warn_alloc_show_mem(gfp_mask, nm); + warn_alloc_show_mem(gfp_mask, nodemask); } static inline struct page * -- cgit From da162e9368990ed747075e2ab427da0759fc4a59 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 22 Feb 2017 15:46:31 -0800 Subject: mm: drop zap_details::ignore_dirty The only user of ignore_dirty is oom-reaper. But it doesn't really use it. ignore_dirty only has effect on file pages mapped with dirty pte. But oom-repear skips shared VMAs, so there's no way we can dirty file pte in them. Link: http://lkml.kernel.org/r/20170118122429.43661-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/memory.c | 6 ------ mm/oom_kill.c | 3 +-- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a67cae5a07c..eae478a42f26 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1175,7 +1175,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - bool ignore_dirty; /* Ignore dirty pages */ bool check_swap_entries; /* Check also swap entries */ }; diff --git a/mm/memory.c b/mm/memory.c index d7676a68c80a..88872a93c3ca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1155,12 +1155,6 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { - /* - * oom_reaper cannot tear down dirty - * pages - */ - if (unlikely(details && details->ignore_dirty)) - continue; force_flush = 1; set_page_dirty(page); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c7b48b4282d9..33fcc8a40aeb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -465,8 +465,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true, - .ignore_dirty = true}; + struct zap_details details = {.check_swap_entries = true}; bool ret = true; /* -- cgit From 3e8715fdc03e8df4d26d8e436166e44e3e416d3b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 22 Feb 2017 15:46:34 -0800 Subject: mm: drop zap_details::check_swap_entries detail == NULL would give the same functionality as .check_swap_entries==true. Link: http://lkml.kernel.org/r/20170118122429.43661-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/memory.c | 4 ++-- mm/oom_kill.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index eae478a42f26..062936e8b832 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1175,7 +1175,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - bool check_swap_entries; /* Check also swap entries */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 88872a93c3ca..e9035a0afee2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1173,8 +1173,8 @@ again: } continue; } - /* only check swap_entries if explicitly asked for in details */ - if (unlikely(details && !details->check_swap_entries)) + /* If details->check_mapping, we leave swap entries. */ + if (unlikely(details)) continue; entry = pte_to_swp_entry(ptent); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 33fcc8a40aeb..a1977247c7ea 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -465,7 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true}; bool ret = true; /* @@ -531,7 +530,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - &details); + NULL); } tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", -- cgit From ecf1385d72f0491400a8ceca7001196ca369aa8c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 22 Feb 2017 15:46:37 -0800 Subject: mm: drop unused argument of zap_page_range() There's no users of zap_page_range() who wants non-NULL 'details'. Let's drop it. Link: http://lkml.kernel.org/r/20170118122429.43661-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gmap.c | 2 +- arch/x86/mm/mpx.c | 2 +- drivers/android/binder.c | 2 +- drivers/staging/android/ion/ion.c | 3 +-- include/linux/mm.h | 2 +- mm/madvise.c | 2 +- mm/memory.c | 5 ++--- 7 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ec1f0dedb948..59ac93714fa4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -687,7 +687,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size, NULL); + zap_page_range(vma, vmaddr, size); } up_read(&gmap->mm->mmap_sem); } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index af59f808742f..aad4ac386f98 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, return -EINVAL; len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); + zap_page_range(vma, addr, len); trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9451b762fa1c..15b263a420e8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -657,7 +657,7 @@ free_range: page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; if (vma) zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); + proc->user_buffer_offset, PAGE_SIZE); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 937c2d5d7ec3..969600779e44 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -865,8 +865,7 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, list_for_each_entry(vma_list, &buffer->vmas, list) { struct vm_area_struct *vma = vma_list->vma; - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, - NULL); + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } mutex_unlock(&buffer->lock); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 062936e8b832..574bc157a27c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1185,7 +1185,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); + unsigned long size); void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); diff --git a/mm/madvise.c b/mm/madvise.c index ca75b8a01ba0..7f1490f0d3a6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -478,7 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, return -EINVAL; madvise_userfault_dontneed(vma, prev, start, end); - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start); return 0; } diff --git a/mm/memory.c b/mm/memory.c index e9035a0afee2..7663068a33c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1370,12 +1370,11 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting 
address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; @@ -1386,7 +1385,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, details); + unmap_single_vma(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } -- cgit From 235190738aba7c5c94300c8d882842a535280e5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 22 Feb 2017 15:46:39 -0800 Subject: oom-reaper: use madvise_dontneed() logic to decide if unmap the VMA Logic on whether we can reap pages from the VMA should match what we have in madvise_dontneed(). In particular, we should skip, VM_PFNMAP VMAs, but we don't now. Let's just extract condition on which we can shoot down pagesi from a VMA with MADV_DONTNEED into separate function and use it in both places. Link: http://lkml.kernel.org/r/20170118122429.43661-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +++++ mm/madvise.c | 4 +++- mm/oom_kill.c | 9 +-------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 9ad04fc6eefe..8ab72f4374e0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -43,6 +43,11 @@ int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, diff --git a/mm/madvise.c b/mm/madvise.c index 7f1490f0d3a6..b530a4986035 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -25,6 +25,8 @@ #include +#include "internal.h" + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_sem for writing. Others, which simply traverse vmas, need @@ -474,7 +476,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + if (!can_madv_dontneed_vma(vma)) return -EINVAL; madvise_userfault_dontneed(vma, prev, start, end); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a1977247c7ea..8256788ac119 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -508,14 +508,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (is_vm_hugetlb_page(vma)) - continue; - - /* - * mlocked VMAs require explicit munlocking before unmap. - * Let's keep it simple here and skip such VMAs. 
- */ - if (vma->vm_flags & VM_LOCKED) + if (!can_madv_dontneed_vma(vma)) continue; /* -- cgit From 5d63f81c9e495e1f38fa36208bdcbbe2d2e72960 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 22 Feb 2017 15:46:42 -0800 Subject: mm/memblock.c: remove unnecessary log and clean up There is no variable named flags in memblock_add() and memblock_reserve() so remove it from the log messages. This patch also cleans up the type casting for phys_addr_t by using %pa to print them. Link: http://lkml.kernel.org/r/1484720165-25403-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 54 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index d105d6c81d53..c004f52be419 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -611,10 +611,10 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - 0UL, (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_add: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } @@ -718,10 +718,10 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { - memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg(" memblock_free: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); return memblock_remove_range(&memblock.reserved, base, size); @@ -729,10 +729,10 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - 0UL, (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } @@ -1227,8 +1227,8 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys alloc = __memblock_alloc_base(size, align, max_addr); if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); + panic("ERROR: Failed to allocate %pa bytes below %pa.\n", + &size, &max_addr); return alloc; } @@ -1695,7 +1695,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { - unsigned long long base, size; + phys_addr_t base, end, size; unsigned long flags; int idx; struct memblock_region *rgn; @@ -1707,23 +1707,24 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif 
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", - name, idx, base, base + size - 1, size, nid_buf, flags); + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", + name, idx, &base, &end, &size, nid_buf, flags); } } void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); - pr_info(" memory size = %#llx reserved size = %#llx\n", - (unsigned long long)memblock.memory.total_size, - (unsigned long long)memblock.reserved.total_size); + pr_info(" memory size = %pa reserved size = %pa\n", + &memblock.memory.total_size, + &memblock.reserved.total_size); memblock_dump(&memblock.memory, "memory"); memblock_dump(&memblock.reserved, "reserved"); @@ -1749,19 +1750,14 @@ static int memblock_debug_show(struct seq_file *m, void *private) struct memblock_type *type = m->private; struct memblock_region *reg; int i; + phys_addr_t end; for (i = 0; i < type->cnt; i++) { reg = &type->regions[i]; - seq_printf(m, "%4d: ", i); - if (sizeof(phys_addr_t) == 4) - seq_printf(m, "0x%08lx..0x%08lx\n", - (unsigned long)reg->base, - (unsigned long)(reg->base + reg->size - 1)); - else - seq_printf(m, "0x%016llx..0x%016llx\n", - (unsigned long long)reg->base, - (unsigned long long)(reg->base + reg->size - 1)); + end = reg->base + reg->size - 1; + seq_printf(m, "%4d: ", i); + seq_printf(m, "%pa..%pa\n", ®->base, &end); } return 0; } -- cgit From c87d1655c29500b459fb135258a93f8309ada9c7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 22 Feb 2017 15:46:45 -0800 Subject: zram: remove obsolete sysfs attrs We had a deprecated_attr_warn() warning for 2 years and now the time has come and we finally can do the cleanup. The plan was as follows: : per-stat sysfs attributes are considered to be deprecated. : The basic strategy is: : -- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) : -- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) : : The list of deprecated attributes can be found here: : Documentation/ABI/obsolete/sysfs-block-zram : : Basically, every attribute that has its own read accessible sysfs : node (e.g. num_reads) *AND* is accessible via one of the stat files : (zram/stat or zram/io_stat or zram/mm_stat) is considered : to be deprecated. The patch also removes `obsolete/sysfs-block-zram', clean ups `testing/sysfs-block-zram' and tweaks zram.txt files. Link: http://lkml.kernel.org/r/20170118035838.11090-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/obsolete/sysfs-block-zram | 119 ---------------------------- Documentation/ABI/testing/sysfs-block-zram | 101 ++--------------------- Documentation/blockdev/zram.txt | 74 ++++++++--------- drivers/block/zram/zram_drv.c | 101 +---------------------- 4 files changed, 42 insertions(+), 353 deletions(-) delete mode 100644 Documentation/ABI/obsolete/sysfs-block-zram diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram deleted file mode 100644 index 720ea92cfb2e..000000000000 --- a/Documentation/ABI/obsolete/sysfs-block-zram +++ /dev/null @@ -1,119 +0,0 @@ -What: /sys/block/zram/num_reads -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - Now accessible via zram/stat node. 
- -What: /sys/block/zram/num_writes -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - Now accessible via zram/stat node. - -What: /sys/block/zram/invalid_io -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/failed_reads -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/failed_writes -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/notify_free -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/zero_pages -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/orig_data_size -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/compr_data_size -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/mem_used_total -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/mem_used_max -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. - Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram/mm_stat - node. 
- -What: /sys/block/zram/mem_limit -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. - The limit could be changed in run time and "0" means disable - the limit. No limit is the initial state. Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram/mm_stat - node. diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 4518d30b8c2e..451b6d882b2c 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -22,41 +22,6 @@ Description: device. The reset operation frees all the memory associated with this device. -What: /sys/block/zram/num_reads -Date: August 2010 -Contact: Nitin Gupta -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - -What: /sys/block/zram/num_writes -Date: August 2010 -Contact: Nitin Gupta -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - -What: /sys/block/zram/invalid_io -Date: August 2010 -Contact: Nitin Gupta -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - -What: /sys/block/zram/failed_reads -Date: February 2014 -Contact: Sergey Senozhatsky -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - -What: /sys/block/zram/failed_writes -Date: February 2014 -Contact: Sergey Senozhatsky -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - What: /sys/block/zram/max_comp_streams Date: February 2014 Contact: Sergey Senozhatsky @@ -73,74 +38,24 @@ Description: available and selected compression algorithms, change compression algorithm selection. -What: /sys/block/zram/notify_free -Date: August 2010 -Contact: Nitin Gupta -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - -What: /sys/block/zram/zero_pages -Date: August 2010 -Contact: Nitin Gupta -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - -What: /sys/block/zram/orig_data_size -Date: August 2010 -Contact: Nitin Gupta -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - -What: /sys/block/zram/compr_data_size -Date: August 2010 -Contact: Nitin Gupta -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. 
- Unit: bytes - -What: /sys/block/zram/mem_used_total -Date: August 2010 -Contact: Nitin Gupta -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - What: /sys/block/zram/mem_used_max Date: August 2014 Contact: Minchan Kim Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. + The mem_used_max file is write-only and is used to reset + the counter of maximum memory zram have consumed to store + compressed data. For resetting the value, you should write + "0". Otherwise, you could see -EINVAL. Unit: bytes What: /sys/block/zram/mem_limit Date: August 2014 Contact: Minchan Kim Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. The - limit could be changed in run time and "0" means disable the - limit. No limit is the initial state. Unit: bytes + The mem_limit file is write-only and specifies the maximum + amount of memory ZRAM can use to store the compressed data. + The limit could be changed in run time and "0" means disable + the limit. No limit is the initial state. Unit: bytes What: /sys/block/zram/compact Date: August 2015 diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 0535ae1f73e5..1c0c08d9206b 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -161,42 +161,14 @@ Name access description disksize RW show and set the device's disk size initstate RO shows the initialization state of the device reset WO trigger device reset -num_reads RO the number of reads -failed_reads RO the number of failed reads -num_write RO the number of writes -failed_writes RO the number of failed writes -invalid_io RO the number of non-page-size-aligned I/O requests +mem_used_max WO reset the `mem_used_max' counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can use + to store the compressed data max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm -notify_free RO the number of notifications to free pages (either - slot free notifications or REQ_DISCARD requests) -zero_pages RO the number of zero filled pages written to this disk -orig_data_size RO uncompressed size of data stored in this disk -compr_data_size RO compressed size of data stored in this disk -mem_used_total RO the amount of memory allocated for this disk -mem_used_max RW the maximum amount of memory zram have consumed to - store the data (to reset this counter to the actual - current value, write 1 to this attribute) -mem_limit RW the maximum amount of memory ZRAM can use to store - the compressed data -pages_compacted RO the number of pages freed during compaction - (available only via zram/mm_stat node) compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes -WARNING -======= -per-stat sysfs attributes are considered to be deprecated. 
-The basic strategy is: --- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) --- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) - -The list of deprecated attributes can be found here: -Documentation/ABI/obsolete/sysfs-block-zram - -Basically, every attribute that has its own read accessible sysfs node -(e.g. num_reads) *AND* is accessible via one of the stat files (zram/stat -or zram/io_stat or zram/mm_stat) is considered to be deprecated. User space is advised to use the following files to read the device statistics. @@ -211,22 +183,40 @@ The stat file represents device's I/O statistics not accounted by block layer and, thus, not available in zram/stat file. It consists of a single line of text and contains the following stats separated by whitespace: - failed_reads - failed_writes - invalid_io - notify_free + failed_reads the number of failed reads + failed_writes the number of failed writes + invalid_io the number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + a) the number of pages freed because of swap slot free + notifications or b) the number of pages freed because of + REQ_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. File /sys/block/zram/mm_stat The stat file represents device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: - orig_data_size - compr_data_size - mem_used_total - mem_limit - mem_used_max - zero_pages - num_migrated + orig_data_size uncompressed size of data stored in this disk. + This excludes zero-filled pages (zero_pages) since no + memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + zero_pages the number of zero filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3cd7856156b4..c73fede582f7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,27 +45,6 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; -static inline void deprecated_attr_warn(const char *name) -{ - pr_warn_once("%d (%s) Attribute %s (and others) will be removed. 
%s\n", - task_pid_nr(current), - current->comm, - name, - "See zram documentation."); -} - -#define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ - struct device_attribute *attr, char *b) \ -{ \ - struct zram *zram = dev_to_zram(d); \ - \ - deprecated_attr_warn(__stringify(name)); \ - return scnprintf(b, PAGE_SIZE, "%llu\n", \ - (u64)atomic64_read(&zram->stats.name)); \ -} \ -static DEVICE_ATTR_RO(name); - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -218,47 +197,6 @@ static ssize_t disksize_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("orig_data_size"); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_total"); - down_read(&zram->init_lock); - if (init_done(zram)) { - struct zram_meta *meta = zram->meta; - val = zs_get_total_pages(meta->mem_pool); - } - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - -static ssize_t mem_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_limit"); - down_read(&zram->init_lock); - val = zram->limit_pages; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -277,21 +215,6 @@ static ssize_t mem_limit_store(struct device *dev, return len; } -static ssize_t mem_used_max_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_max"); - down_read(&zram->init_lock); - if (init_done(zram)) - val = atomic_long_read(&zram->stats.max_used_pages); - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_used_max_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -467,14 +390,6 @@ static ssize_t debug_stat_show(struct device *dev, static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); static inline bool zram_meta_get(struct zram *zram) { @@ -1188,10 +1103,8 @@ static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_WO(mem_limit); +static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); @@ -1199,17 +1112,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - 
&dev_attr_num_writes.attr, - &dev_attr_failed_reads.attr, - &dev_attr_failed_writes.attr, &dev_attr_compact.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, &dev_attr_mem_limit.attr, &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, -- cgit From 083fb8edda0487d192e8c117f625563b920cf7a4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Feb 2017 15:46:48 -0800 Subject: mm: fix stray kernel-doc notation Delete stray (second) function description in find_lock_page() kernel-doc notation. Note: scripts/kernel-doc just ignores the second function description. Fixes: 2457aec63745e ("mm: non-atomically mark page accessed during page cache allocation where possible") Link: http://lkml.kernel.org/r/b037e9a3-516c-ec02-6c8e-fa5479747ba6@infradead.org Signed-off-by: Randy Dunlap Reported-by: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b572f5530392..84943e8057ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -266,7 +266,6 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page - * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * -- cgit From f201ebd87652cf1519792f8662bb3f862c76aa33 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 22 Feb 2017 15:46:51 -0800 Subject: mm/z3fold.c: limit first_num to the actual range of possible buddy indexes At present, tying the first_num size to NCHUNKS_ORDER is confusing: the number of chunks is completely unrelated to the number of buddies. This patch limits first_num to the actual range of possible buddy indexes, which is more reasonable and obvious, with no functional change. Link: http://lkml.kernel.org/r/1476776569-29504-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Suggested-by: Dan Streetman Acked-by: Dan Streetman Acked-by: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 8f9e89ca1d31..207e5ddc87a2 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -50,7 +50,7 @@ #define ZHDR_SIZE_ALIGNED CHUNK_SIZE #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) -#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1) +#define BUDDY_MASK (0x3) struct z3fold_pool; struct z3fold_ops { @@ -109,7 +109,7 @@ struct z3fold_header { unsigned short middle_chunks; unsigned short last_chunks; unsigned short start_middle; - unsigned short first_num:NCHUNKS_ORDER; + unsigned short first_num:2; }; /* @@ -179,7 +179,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) return (struct z3fold_header *)(handle & PAGE_MASK); } -/* Returns buddy number */ +/* + * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle + * but that doesn't matter. because the masking will result in the + * correct buddy number. + */ static enum buddy handle_to_buddy(unsigned long handle) { struct z3fold_header *zhdr = handle_to_z3fold_header(handle); -- cgit
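
The wrap-around argument in the new handle_to_buddy() comment is easy to check outside the kernel. The following is a minimal userspace sketch, not the kernel implementation: it assumes, based only on that comment, that encode_handle() folds the buddy index into the low handle bits as (first_num + bud) & BUDDY_MASK, and it verifies that the masked subtraction used by handle_to_buddy() recovers the original buddy for every first_num/buddy combination, including the cases where the encoded low bits wrap below first_num.

/*
 * Minimal userspace model of the z3fold buddy round trip described in the
 * comment added above.  This is NOT the kernel code; the encode step is an
 * assumption inferred from that comment.
 */
#include <assert.h>
#include <stdio.h>

#define BUDDY_MASK 0x3	/* two bits, matching "unsigned short first_num:2" */

/* Assumed encoding: the low handle bits hold (first_num + bud) & BUDDY_MASK. */
static unsigned long encode_low_bits(unsigned long first_num, unsigned long bud)
{
	return (first_num + bud) & BUDDY_MASK;
}

/* Decoding, as in handle_to_buddy(): subtract first_num, then mask. */
static unsigned long decode_buddy(unsigned long low_bits, unsigned long first_num)
{
	return (low_bits - first_num) & BUDDY_MASK;
}

int main(void)
{
	unsigned long first_num, bud;

	for (first_num = 0; first_num <= BUDDY_MASK; first_num++) {
		for (bud = 0; bud <= BUDDY_MASK; bud++) {
			unsigned long low = encode_low_bits(first_num, bud);

			/*
			 * low < first_num is the wrap the comment mentions;
			 * the masked subtraction still recovers bud.
			 */
			assert(decode_buddy(low, first_num) == bud);
		}
	}
	printf("all 16 first_num/buddy combinations round-trip\n");
	return 0;
}

The point of the sketch is only the modular arithmetic: with a 2-bit mask, subtracting first_num undoes the earlier addition regardless of wrap, which is why limiting first_num to two bits loses nothing.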