From 87c1b497c299e48e681f041de4dd6ce4831aaf75 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:36:53 -0700 Subject: ntfs: logging clean-up - Convert spinlock/static array to va_format (inspired by Joe Perches help on previous logging patches). - Convert printk(KERN_ERR to pr_warn in __ntfs_warning. - Convert printk(KERN_ERR to pr_err in __ntfs_error. - Convert printk(KERN_DEBUG to pr_debug in __ntfs_debug. (Note that __ntfs_debug is still guarded by #if DEBUG) - Improve !DEBUG to parse all arguments (Joe Perches). - Sparse pr_foo() conversions in super.c NTFS, NTFS-fs prefixes as well as 'warning' and 'error' were removed : pr_foo() automatically adds module name and error level is already specified. Signed-off-by: Fabian Frederick Cc: Anton Altaparmakov Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/debug.c | 58 ++++++++++++++++++++++++--------------------------------- fs/ntfs/debug.h | 7 ++++++- fs/ntfs/super.c | 28 ++++++++++++---------------- 3 files changed, 42 insertions(+), 51 deletions(-) diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index 807150e2c2b9..dd6103cc93c1 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -18,16 +18,9 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "debug.h" -/* - * A static buffer to hold the error string being displayed and a spinlock - * to protect concurrent accesses to it. - */ -static char err_buf[1024]; -static DEFINE_SPINLOCK(err_buf_lock); - /** * __ntfs_warning - output a warning to the syslog * @function: name of function outputting the warning @@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock); void __ntfs_warning(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_warn("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_warn("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } /** @@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb, void __ntfs_error(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_err("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs error: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_err("%s(): %pV\n", flen ? 
function : "", &vaf); + va_end(args); } #ifdef DEBUG @@ -124,6 +115,7 @@ int debug_msgs = 0; void __ntfs_debug (const char *file, int line, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function, return; if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); va_end(args); - printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line, - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); } /* Dump a runlist. Caller has to provide synchronisation for @rl. */ @@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (!debug_msgs) return; - printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); + pr_debug("Dumping runlist (values in hex):\n"); if (!rl) { - printk(KERN_DEBUG "Run list not present.\n"); + pr_debug("Run list not present.\n"); return; } - printk(KERN_DEBUG "VCN LCN Run length\n"); + pr_debug("VCN LCN Run length\n"); for (i = 0; ; i++) { LCN lcn = (rl + i)->lcn; @@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (index > -LCN_ENOENT - 1) index = 3; - printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", + pr_debug("%-16Lx %s %-16Lx%s\n", (long long)(rl + i)->vcn, lcn_str[index], (long long)(rl + i)->length, (rl + i)->length ? "" : " (runlist end)"); } else - printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", + pr_debug("%-16Lx %-16Lx %-16Lx%s\n", (long long)(rl + i)->vcn, (long long)(rl + i)->lcn, (long long)(rl + i)->length, diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 53c27eaf2307..61bf091e32a8 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #else /* !DEBUG */ -#define ntfs_debug(f, a...) do {} while (0) +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + #define ntfs_debug_dump_runlist(rl) do {} while (0) #endif /* !DEBUG */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index bd5610d48242..9de2491f2926 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -19,6 +19,7 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -1896,7 +1897,7 @@ get_ctx_vol_failed: vol->minor_ver = vi->minor_ver; ntfs_attr_put_search_ctx(ctx); unmap_mft_record(NTFS_I(vol->vol_ino)); - printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, + pr_info("volume version %i.%i.\n", vol->major_ver, vol->minor_ver); if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " @@ -3095,7 +3096,7 @@ static int __init init_ntfs_fs(void) int err = 0; /* This may be ugly but it results in pretty output so who cares. 
(-8 */ - printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" + pr_info("driver " NTFS_VERSION " [Flags: R/" #ifdef NTFS_RW "W" #else @@ -3115,16 +3116,15 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_index_context), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_index_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_index_ctx_cache_name); + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); goto ictx_err_out; } ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, sizeof(ntfs_attr_search_ctx), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_attr_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); goto actx_err_out; } @@ -3132,8 +3132,7 @@ static int __init init_ntfs_fs(void) (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ntfs_name_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_name_cache_name); + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); goto name_err_out; } @@ -3141,8 +3140,7 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_inode), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!ntfs_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); goto inode_err_out; } @@ -3151,15 +3149,14 @@ static int __init init_ntfs_fs(void) SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_big_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; } /* Register the ntfs sysctls. */ err = ntfs_sysctl(1); if (err) { - printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); + pr_crit("Failed to register NTFS sysctls!\n"); goto sysctl_err_out; } @@ -3168,7 +3165,7 @@ static int __init init_ntfs_fs(void) ntfs_debug("NTFS driver registered successfully."); return 0; /* Success! */ } - printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); + pr_crit("Failed to register NTFS filesystem driver!\n"); /* Unregister the ntfs sysctls. */ ntfs_sysctl(0); @@ -3184,8 +3181,7 @@ actx_err_out: kmem_cache_destroy(ntfs_index_ctx_cache); ictx_err_out: if (!err) { - printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " - "registration...\n"); + pr_crit("Aborting NTFS filesystem driver registration...\n"); err = -ENOMEM; } return err; -- cgit From a9af0c5dfdaf0b2e1a8bab7fbf6f29138947d534 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 7 Apr 2014 15:36:54 -0700 Subject: mm/hugetlb.c: add NULL check of return value of huge_pte_offset huge_pte_offset() could return NULL, so we need NULL check to avoid potential NULL pointer dereferences. Signed-off-by: Naoya Horiguchi Cc: Mel Gorman Cc: Sasha Levin Cc: Kirill A. 
Shutemov Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c02b9dadfb0..6cddfadaba03 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2690,7 +2690,8 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -2734,7 +2735,7 @@ retry_avoidcopy: */ spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) { + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ -- cgit From 88a9ab6e3dfb5b10168130c255c6102c925343ab Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Apr 2014 15:36:55 -0700 Subject: mm,numa: reorganize change_pmd_range() Reorganize the order of ifs in change_pmd_range a little, in preparation for the next patch. [akpm@linux-foundation.org: fix indenting, per David] Signed-off-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Reported-by: Xing Gang Tested-by: Chegu Vinod Acked-by: David Rientjes Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 769a67a15803..79cb51866e02 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -118,6 +118,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); + if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + continue; if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); @@ -133,10 +135,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, continue; } } - /* fall through */ + /* fall through, the trans huge pmd just split */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + VM_BUG_ON(pmd_trans_huge(*pmd)); this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; -- cgit From 1ad9f620c3a22fa800489455ce517c29e576934e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Apr 2014 15:36:56 -0700 Subject: mm: numa: recheck for transhuge pages under lock during protection changes Sasha reported the following bug using trinity kernel BUG at mm/mprotect.c:149! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 20 PID: 26219 Comm: trinity-c216 Tainted: G W 3.14.0-rc5-next-20140305-sasha-00011-ge06f5f3-dirty #105 task: ffff8800b6c80000 ti: ffff880228436000 task.ti: ffff880228436000 RIP: change_protection_range+0x3b3/0x500 Call Trace: change_protection+0x25/0x30 change_prot_numa+0x1b/0x30 task_numa_work+0x279/0x360 task_work_run+0xae/0xf0 do_notify_resume+0x8e/0xe0 retint_signal+0x4d/0x92 The VM_BUG_ON was added in -mm by the patch "mm,numa: reorganize change_pmd_range". The race existed without the patch but was just harder to hit. The problem is that a transhuge check is made without holding the PTL. It's possible at the time of the check that a parallel fault clears the pmd and inserts a new one which then triggers the VM_BUG_ON check. 
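Schematically, the interleaving looks like this (a simplified illustration of the window described above, not the exact call chains):

	/*
	 *  CPU A: change_pmd_range()           CPU B: parallel huge page fault
	 *
	 *  pmd_trans_huge(*pmd) -> false
	 *  (check made without holding the PTL)
	 *                                      clears the pmd, then installs a
	 *                                      new transhuge pmd under the PTL
	 *  VM_BUG_ON(pmd_trans_huge(*pmd))
	 *  -> fires, although the pmd itself is perfectly valid
	 */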
This patch removes the VM_BUG_ON but fixes the race by rechecking transhuge under the PTL when marking page tables for NUMA hinting and bailing if a race occurred. It is not a problem for calls to mprotect() as they hold mmap_sem for write. Signed-off-by: Mel Gorman Reported-by: Sasha Levin Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 79cb51866e02..2c51c79c8a69 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; unsigned long pages = 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) + return 0; + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -132,12 +163,13 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += HPAGE_PMD_NR; nr_huge_updates++; } + + /* huge pmd was handled */ continue; } } /* fall through, the trans huge pmd just split */ } - VM_BUG_ON(pmd_trans_huge(*pmd)); this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; -- cgit From a5338093bfb462256f70f3450c08f73e59543e26 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Apr 2014 15:36:57 -0700 Subject: mm: move mmu notifier call from change_protection to change_pmd_range The NUMA scanning code can end up iterating over many gigabytes of unpopulated memory, especially in the case of a freshly started KVM guest with lots of memory. This results in the mmu notifier code being called even when there are no mapped pages in a virtual address range. The amount of time wasted can be enough to trigger soft lockup warnings with very large KVM guests. This patch moves the mmu notifier call to the pmd level, which represents 1GB areas of memory on x86-64. Furthermore, the mmu notifier code is only called from the address in the PMD where present mappings are first encountered. The hugetlbfs code is left alone for now; hugetlb mappings are not relocatable, and as such are left alone by the NUMA code, and should never trigger this problem to begin with. 
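Reduced to its essentials, the new change_pmd_range() flow looks roughly like this (a condensed sketch of the hunks below; pmd setup and the huge-pmd handling are trimmed):

	struct mm_struct *mm = vma->vm_mm;
	unsigned long mni_start = 0;	/* 0: notifier not started yet */
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
			continue;	/* unpopulated: no notifier call, no work */

		/* start the notifier lazily, at the first present mapping */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		/* ... huge pmd split/update and change_pte_range() ... */
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);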
Signed-off-by: Rik van Riel Acked-by: David Rientjes Cc: Peter Zijlstra Cc: Andrea Arcangeli Reported-by: Xing Gang Tested-by: Chegu Vinod Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ mm/mprotect.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6cddfadaba03..ed5072c64daa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3186,6 +3186,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); + mmu_notifier_invalidate_range_start(mm, start, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -3215,6 +3216,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + mmu_notifier_invalidate_range_end(mm, start, end); return pages << h->order; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 2c51c79c8a69..c43d557941f8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -140,9 +140,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; + struct mm_struct *mm = vma->vm_mm; unsigned long next; unsigned long pages = 0; unsigned long nr_huge_updates = 0; + unsigned long mni_start = 0; pmd = pmd_offset(pud, addr); do { @@ -151,6 +153,13 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, next = pmd_addr_end(addr, end); if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) continue; + + /* invoke the mmu notifier if the pmd is populated */ + if (!mni_start) { + mni_start = addr; + mmu_notifier_invalidate_range_start(mm, mni_start, end); + } + if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); @@ -175,6 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += this_pages; } while (pmd++, addr = next, addr != end); + if (mni_start) + mmu_notifier_invalidate_range_end(mm, mni_start, end); + if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -234,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { - struct mm_struct *mm = vma->vm_mm; unsigned long pages; - mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); - mmu_notifier_invalidate_range_end(mm, start, end); return pages; } -- cgit From 619d0d76c1ee943f171c7d4fc021ec7602388579 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Mon, 7 Apr 2014 15:36:59 -0700 Subject: mm/vmscan: restore sc->gfp_mask after promoting it to __GFP_HIGHMEM We promote sc->gfp_mask to __GFP_HIGHMEM to forcibly scan highmem if there are too many buffer_heads pinning highmem. See cc715d99e5 ("mm: vmscan: forcibly scan highmem if there are too many buffer_heads pinning highmem"). This patch restores sc->gfp_mask to its caller original value after finishing the scan job, to avoid the impact on other invocations from its upper caller, such as vmpressure_prio(), shrink_slab(). 
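In outline, the fix is a plain save/restore around the temporary promotion (a condensed view of the change below):

	static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
	{
		gfp_t orig_mask = sc->gfp_mask;	/* remember the caller's mask */
		bool aborted_reclaim = false;

		if (buffer_heads_over_limit)
			sc->gfp_mask |= __GFP_HIGHMEM;	/* forced highmem scan */

		/* ... walk the zonelist and reclaim ... */

		sc->gfp_mask = orig_mask;	/* undo the promotion for later users of sc */
		return aborted_reclaim;
	}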
Signed-off-by: Weijie Yang Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f56a80a7c41..1c51e4f52fd9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2314,6 +2314,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long lru_pages = 0; bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; + gfp_t orig_mask; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; @@ -2323,6 +2324,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ + orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; @@ -2393,6 +2395,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) } } + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + return aborted_reclaim; } -- cgit From 9bbc04eeb01fcb5c20bb10f34989665df7200163 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Mon, 7 Apr 2014 15:37:00 -0700 Subject: mm/vmscan: do not check compaction_ready on promoted zones We abort direct reclaim if we find the zone is ready for compaction. Sometimes the zone is just a promoted highmem zone to force a scan of highmem, which is not the intended zone the caller want to allocate a page from. In this situation, setting aborted_reclaim to indicate the caller turned back to retry the allocation is waste of time and could cause a loop in __alloc_pages_slowpath(). This patch does not check compaction_ready() on promoted zones to avoid the above situation. Only set aborted_reclaim if the caller intended zone is ready for compaction. Signed-off-by: Weijie Yang Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c51e4f52fd9..06879ead7380 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2318,6 +2318,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2358,7 +2359,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * noticeable problem, like transparent huge * page allocations. */ - if (compaction_ready(zone, sc)) { + if ((zonelist_zone_idx(z) <= requested_highidx) + && compaction_ready(zone, sc)) { aborted_reclaim = true; continue; } -- cgit From 7aa6b4ad5a81d7761b044d38ac0120850a6396ca Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:37:01 -0700 Subject: mm/memory.c: update comment in unmap_single_vma() The described issue now occurs inside mmap_region(). And unfortunately is still valid. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 82c1e4cf00d1..c6ee34d10fcc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1320,9 +1320,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When + * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file + * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. -- cgit From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:01 -0700 Subject: mm: exclude memoryless nodes from zone_reclaim We had a report about strange OOM killer strikes on a PPC machine although there was a lot of swap free and a tons of anonymous memory which could be swapped out. In the end it turned out that the OOM was a side effect of zone reclaim which wasn't unmapping and swapping out and so the system was pushed to the OOM. Although this sounds like a bug somewhere in the kswapd vs. zone reclaim vs. direct reclaim interaction numactl on the said hardware suggests that the zone reclaim should not have been set in the first place: node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 0 size: 0 MB node 0 free: 0 MB node 2 cpus: node 2 size: 7168 MB node 2 free: 6019 MB node distances: node 0 2 0: 10 40 2: 40 10 So all the CPUs are associated with Node0 which doesn't have any memory while Node2 contains all the available memory. Node distances cause an automatic zone_reclaim_mode enabling. Zone reclaim is intended to keep the allocations local but this doesn't make any sense on the memoryless nodes. So let's exclude such nodes for init_zone_allows_reclaim which evaluates zone reclaim behavior and suitable reclaim_nodes. Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: Nishanth Aravamudan Tested-by: Nishanth Aravamudan Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 979378deccbf..336ee925f756 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1870,7 +1870,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) { int i; - for_each_online_node(i) + for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); else @@ -4919,7 +4919,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - init_zone_allows_reclaim(nid); + if (node_state(nid, N_MEMORY)) + init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif -- cgit From 7d348b9ea64db0a315d777ce7d4b06697f946503 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:03 -0700 Subject: mm/compaction: disallow high-order page for migration target Purpose of compaction is to get a high order page. Currently, if we find high-order page while searching migration target page, we break it to order-0 pages and use them as migration target. 
It is contrary to purpose of compaction, so disallow high-order page to be used for migration target. Additionally, clean-up logic in suitable_migration_target() to simplify the code. There is no functional changes from this clean-up. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b6ab77160068..9a03fdb1fd84 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_RESERVE) - return false; - - if (is_migrate_isolate(migratetype)) - return false; - - /* If the page is a large free page, then allow migration */ + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + return false; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) + if (migrate_async_suitable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ -- cgit From 01ead5340bcf5f3a1cd2452c75516d0ef4d908d7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:04 -0700 Subject: mm/compaction: do not call suitable_migration_target() on every page suitable_migration_target() checks that pageblock is suitable for migration target. In isolate_freepages_block(), it is called on every page and this is inefficient. So make it called once per pageblock. suitable_migration_target() also checks if page is highorder or not, but it's criteria for highorder is pageblock order. So calling it once within pageblock range has no problem. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 9a03fdb1fd84..3a1828541dc0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -244,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, struct page *cursor, *valid_page = NULL; unsigned long flags; bool locked = false; + bool checked_pageblock = false; cursor = pfn_to_page(blockpfn); @@ -275,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; /* Recheck this is a suitable migration target under lock */ - if (!strict && !suitable_migration_target(page)) - break; + if (!strict && !checked_pageblock) { + /* + * We need to check suitability of pageblock only once + * and this isolate_freepages_block() is called with + * pageblock range, so just check once is sufficient. 
+ */ + checked_pageblock = true; + if (!suitable_migration_target(page)) + break; + } /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) -- cgit From be1aa03b973c7dcdc576f3503f7a60429825c35d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:05 -0700 Subject: mm/compaction: change the timing to check to drop the spinlock It is odd to drop the spinlock when we scan (SWAP_CLUSTER_MAX - 1) th pfn page. This may results in below situation while isolating migratepage. 1. try isolate 0x0 ~ 0x200 pfn pages. 2. When low_pfn is 0x1ff, ((low_pfn+1) % SWAP_CLUSTER_MAX) == 0, so drop the spinlock. 3. Then, to complete isolating, retry to aquire the lock. I think that it is better to use SWAP_CLUSTER_MAX th pfn for checking the criteria about dropping the lock. This has no harm 0x0 pfn, because, at this time, locked variable would be false. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3a1828541dc0..0eb9f99196ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -487,7 +487,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; -- cgit From c122b2087ab94192f2b937e47b563a9c4e688ece Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:06 -0700 Subject: mm/compaction: check pageblock suitability once per pageblock isolation_suitable() and migrate_async_suitable() is used to be sure that this pageblock range is fine to be migragted. It isn't needed to call it on every page. Current code do well if not suitable, but, don't do well when suitable. 1) It re-checks isolation_suitable() on each page of a pageblock that was already estabilished as suitable. 2) It re-checks migrate_async_suitable() on each page of a pageblock that was not entered through the next_pageblock: label, because last_pageblock_nr is not otherwise updated. This patch fixes situation by 1) calling isolation_suitable() only once per pageblock and 2) always updating last_pageblock_nr to the pageblock that was just checked. Additionally, move PageBuddy() check after pageblock unit check, since pageblock check is the first thing we should do and makes things more simple. 
[vbabka@suse.cz: rephrase commit description] Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 0eb9f99196ce..6878c005bc8e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -526,8 +526,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* If isolation recently failed, do not retry */ pageblock_nr = low_pfn >> pageblock_order; - if (!isolation_suitable(cc, page)) - goto next_pageblock; + if (last_pageblock_nr != pageblock_nr) { + int mt; + + last_pageblock_nr = pageblock_nr; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + + /* + * For async migration, also only scan in MOVABLE + * blocks. Async migration is optimistic to see if + * the minimum amount of work satisfies the allocation + */ + mt = get_pageblock_migratetype(page); + if (!cc->sync && !migrate_async_suitable(mt)) { + cc->finished_update_migrate = true; + skipped_async_unsuitable = true; + goto next_pageblock; + } + } /* * Skip if free. page_order cannot be used without zone->lock @@ -536,18 +553,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (PageBuddy(page)) continue; - /* - * For async migration, also only scan in MOVABLE blocks. Async - * migration is optimistic to see if the minimum amount of work - * satisfies the allocation - */ - if (!cc->sync && last_pageblock_nr != pageblock_nr && - !migrate_async_suitable(get_pageblock_migratetype(page))) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; - goto next_pageblock; - } - /* * Check may be lockless but that's ok as we recheck later. * It's possible to migrate LRU pages and balloon pages @@ -639,7 +644,6 @@ check_compact_cluster: next_pageblock: low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); -- cgit From b6c750163c0d138f5041d95fcdbd1094b6928057 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:07 -0700 Subject: mm/compaction: clean-up code on success of ballon isolation It is just for clean-up to reduce code size and improve readability. There is no functional change. 
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6878c005bc8e..054c28b51c75 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -562,11 +562,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (unlikely(balloon_page_movable(page))) { if (locked && balloon_page_isolate(page)) { /* Successfully isolated */ - cc->finished_update_migrate = true; - list_add(&page->lru, migratelist); - cc->nr_migratepages++; - nr_isolated++; - goto check_compact_cluster; + goto isolate_success; } } continue; @@ -627,13 +623,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON_PAGE(PageTransCompound(page), page); /* Successfully isolated */ - cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; -check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; -- cgit From 1e1836e84f87d12feac6dd225fcef5eba1ca724b Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 7 Apr 2014 15:37:09 -0700 Subject: mm: revert "thp: make MADV_HUGEPAGE check for mm->def_flags" The main motivation behind this patch is to provide a way to disable THP for jobs where the code cannot be modified, and using a malloc hook with madvise is not an option (i.e. statically allocated data). This patch allows us to do just that, without affecting other jobs running on the system. We need to do this sort of thing for jobs where THP hurts performance, due to the possibility of increased remote memory accesses that can be created by situations such as the following: When you touch 1 byte of an untouched, contiguous 2MB chunk, a THP will be handed out, and the THP will be stuck on whatever node the chunk was originally referenced from. If many remote nodes need to do work on that same chunk, they'll be making remote accesses. With THP disabled, 4K pages can be handed out to separate nodes as they're needed, greatly reducing the amount of remote accesses to memory. This patch is based on some of my work combined with some suggestions/patches given by Oleg Nesterov. The main goal here is to add a prctl switch to allow us to disable to THP on a per mm_struct basis. Here's a bit of test data with the new patch in place... First with the flag unset: # perf stat -a ./prctl_wrapper_mmv3 0 ./thp_pthread -C 0 -m 0 -c 512 -b 256g Setting thp_disabled for this task... 
thp_disable: 0 Set thp_disabled state to 0 Process pid = 18027 PF/ MAX MIN TOTCPU/ TOT_PF/ TOT_PF/ WSEC/ TYPE: CPUS WALL WALL SYS USER TOTCPU CPU WALL_SEC SYS_SEC CPU NODES 512 1.120 0.060 0.000 0.110 0.110 0.000 28571428864 -9223372036854775808 55803572 23 Performance counter stats for './prctl_wrapper_mmv3_hack 0 ./thp_pthread -C 0 -m 0 -c 512 -b 256g': 273719072.841402 task-clock # 641.026 CPUs utilized [100.00%] 1,008,986 context-switches # 0.000 M/sec [100.00%] 7,717 CPU-migrations # 0.000 M/sec [100.00%] 1,698,932 page-faults # 0.000 M/sec 355,222,544,890,379 cycles # 1.298 GHz [100.00%] 536,445,412,234,588 stalled-cycles-frontend # 151.02% frontend cycles idle [100.00%] 409,110,531,310,223 stalled-cycles-backend # 115.17% backend cycles idle [100.00%] 148,286,797,266,411 instructions # 0.42 insns per cycle # 3.62 stalled cycles per insn [100.00%] 27,061,793,159,503 branches # 98.867 M/sec [100.00%] 1,188,655,196 branch-misses # 0.00% of all branches 427.001706337 seconds time elapsed Now with the flag set: # perf stat -a ./prctl_wrapper_mmv3 1 ./thp_pthread -C 0 -m 0 -c 512 -b 256g Setting thp_disabled for this task... thp_disable: 1 Set thp_disabled state to 1 Process pid = 144957 PF/ MAX MIN TOTCPU/ TOT_PF/ TOT_PF/ WSEC/ TYPE: CPUS WALL WALL SYS USER TOTCPU CPU WALL_SEC SYS_SEC CPU NODES 512 0.620 0.260 0.250 0.320 0.570 0.001 51612901376 128000000000 100806448 23 Performance counter stats for './prctl_wrapper_mmv3_hack 1 ./thp_pthread -C 0 -m 0 -c 512 -b 256g': 138789390.540183 task-clock # 641.959 CPUs utilized [100.00%] 534,205 context-switches # 0.000 M/sec [100.00%] 4,595 CPU-migrations # 0.000 M/sec [100.00%] 63,133,119 page-faults # 0.000 M/sec 147,977,747,269,768 cycles # 1.066 GHz [100.00%] 200,524,196,493,108 stalled-cycles-frontend # 135.51% frontend cycles idle [100.00%] 105,175,163,716,388 stalled-cycles-backend # 71.07% backend cycles idle [100.00%] 180,916,213,503,160 instructions # 1.22 insns per cycle # 1.11 stalled cycles per insn [100.00%] 26,999,511,005,868 branches # 194.536 M/sec [100.00%] 714,066,351 branch-misses # 0.00% of all branches 216.196778807 seconds time elapsed As with previous versions of the patch, We're getting about a 2x performance increase here. Here's a link to the test case I used, along with the little wrapper to activate the flag: http://oss.sgi.com/projects/memtests/thp_pthread_mmprctlv3.tar.gz This patch (of 3): Revert commit 8e72033f2a48 and add in code to fix up any issues caused by the revert. The revert is necessary because hugepage_madvise would return -EINVAL when VM_NOHUGEPAGE is set, which will break subsequent chunks of this patch set. Here's a snip of an e-mail from Gerald detailing the original purpose of this code, and providing justification for the revert: "The intent of commit 8e72033f2a48 was to guard against any future programming errors that may result in an madvice(MADV_HUGEPAGE) on guest mappings, which would crash the kernel. Martin suggested adding the bit to arch/s390/mm/pgtable.c, if 8e72033f2a48 was to be reverted, because that check will also prevent a kernel crash in the case described above, it will now send a SIGSEGV instead. This would now also allow to do the madvise on other parts, if needed, so it is a more flexible approach. One could also say that it would have been better to do it this way right from the beginning..." Signed-off-by: Alex Thorlton Suggested-by: Oleg Nesterov Tested-by: Christian Borntraeger Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paolo Bonzini Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 3 +++ mm/huge_memory.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 796c9320c709..5d8324cd866b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -505,6 +505,9 @@ static int gmap_connect_pgtable(unsigned long address, unsigned long segment, if (!pmd_present(*pmd) && __pte_alloc(mm, vma, pmd, vmaddr)) return -ENOMEM; + /* large pmds cannot yet be handled */ + if (pmd_large(*pmd)) + return -EFAULT; /* pmd now points to a valid segment table entry. */ rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); if (!rmap) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ac89e9f82ef..a2f4981418fc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1891,17 +1891,22 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - struct mm_struct *mm = vma->vm_mm; - switch (advice) { case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif /* * Be somewhat over-protective like KSM for now! */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; - if (mm->def_flags & VM_NOHUGEPAGE) - return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* -- cgit From a0715cc22601e8830ace98366c0c2bd8da52af52 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 7 Apr 2014 15:37:10 -0700 Subject: mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE Add VM_INIT_DEF_MASK, to allow us to set the default flags for VMs. It also adds a prctl control which allows us to set the THP disable bit in mm->def_flags so that VMs will pick up the setting as they are created. Signed-off-by: Alex Thorlton Suggested-by: Oleg Nesterov Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Paolo Bonzini Cc: "Kirill A. Shutemov" Cc: Mel Gorman Acked-by: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: Johannes Weiner Cc: David Rientjes Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ include/uapi/linux/prctl.h | 3 +++ kernel/fork.c | 11 ++++++++--- kernel/sys.c | 15 +++++++++++++++ 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 35300f390eb6..c270fa68a32b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -177,6 +177,9 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* This mask defines which mm->def_flags a process can inherit its parent */ +#define VM_INIT_DEF_MASK VM_NOHUGEPAGE + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 289760f424aa..58afc04c107e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -149,4 +149,7 @@ #define PR_GET_TID_ADDRESS 40 +#define PR_SET_THP_DISABLE 41 +#define PR_GET_THP_DISABLE 42 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index abc45890f0a5..e40c0a01d5a6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -530,8 +530,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? - (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); @@ -540,8 +538,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm_init_owner(mm, p); clear_tlb_flush_pending(mm); - if (likely(!mm_alloc_pgd(mm))) { + if (current->mm) { + mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; + } else { + mm->flags = default_dump_filter; mm->def_flags = 0; + } + + if (likely(!mm_alloc_pgd(mm))) { mmu_notifier_mm_init(mm); return mm; } diff --git a/kernel/sys.c b/kernel/sys.c index adaeab6f7a87..fba0f29401ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; default: error = -EINVAL; break; -- cgit From ab0e113f6bee71a3933755d2c9ae41fcee631800 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 7 Apr 2014 15:37:12 -0700 Subject: exec: kill the unnecessary mm->def_flags setting in load_elf_binary() load_elf_binary() sets current->mm->def_flags = def_flags and def_flags is always zero. Not only this looks strange, this is unnecessary because mm_init() has already set ->def_flags = 0. Signed-off-by: Alex Thorlton Suggested-by: Oleg Nesterov Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Paolo Bonzini Cc: "Kirill A. Shutemov" Cc: Mel Gorman Acked-by: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Alexander Viro Cc: Johannes Weiner Cc: David Rientjes Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0f59799fa105..aa3cb626671e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -584,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; - unsigned long def_flags = 0; struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; @@ -724,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* OK, This is the point of no return */ - current->mm->def_flags = def_flags; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY(loc->elf_ex); -- cgit From 9164550ecd15253d72b5fe3b4baa9505c4b6fa1f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:14 -0700 Subject: mm: disable split page table lock for !MMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no reason to enable split page table lock if don't have page tables. It also triggers build error at least on ARM since we don't define pmd_page() for !MMU. In file included from arch/arm/kernel/asm-offsets.c:14:0: include/linux/mm.h: In function 'pte_lockptr': include/linux/mm.h:1392:2: error: implicit declaration of function 'pmd_page' [-Werror=implicit-function-declaration] include/linux/mm.h:1392:2: warning: passing argument 1 of 'ptlock_ptr' makes pointer from integer without a cast [enabled by default] include/linux/mm.h:1384:27: note: expected 'struct page *' but argument is of type 'int' Signed-off-by: Kirill A. Shutemov Reported-by: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 2888024e0b0a..37fbe1ef5239 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED # config SPLIT_PTLOCK_CPUS int + default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 default "4" -- cgit From 65a6a4105f84f961fb219f5acaf05203f7114cf9 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 7 Apr 2014 15:37:15 -0700 Subject: tools/vm/page-types.c: page-cache sniffing feature After this patch 'page-types' can walk over a file's mappings and analyze populated page cache pages mostly without disturbing its state. It maps chunk of file, marks VMA as MADV_RANDOM to turn off readahead, pokes VMA via mincore() to determine cached pages, triggers page-fault only for them, and finally gathers information via pagemap/kpageflags. Before unmap it marks VMA as MADV_SEQUENTIAL for ignoring reference bits. usage: page-types -f If is directory it will analyse all files in all subdirectories. Symlinks are not followed as well as mount points. Hardlinks aren't handled, they'll be dumped as many times as they are found. Recursive walk brings all dentries into dcache and populates page cache of block-devices aka 'Buffers'. Probably it's worth to add ioctl for dumping file page cache as array of PFNs as a replacement for this hackish juggling with mmap/madvise/mincore/pagemap. 
Also recursive walk could be replaced with dumping cached inodes via some ioctl or debugfs interface followed by openning them via open_by_handle_at, this would fix hardlinks handling and unneeded population of dcache and buffers. This interface might be used as data source for constructing readahead plans and for background optimizations of actively used files. collateral changes: + fix 64-bit LFS: define _FILE_OFFSET_BITS instead of _LARGEFILE64_SOURCE + replace lseek + read with single pread + make show_page_range() reusable after flush usage example: ~/src/linux/tools/vm$ sudo ./page-types -L -f page-types foffset offset flags page-types Inode: 2229277 Size: 89065 (22 pages) Modify: Tue Feb 25 12:00:59 2014 (162 seconds ago) Access: Tue Feb 25 12:01:00 2014 (161 seconds ago) 0 3cbf3b __RU_lA____M________________________ 1 38946a __RU_lA____M________________________ 2 1a3cec __RU_lA____M________________________ 3 1a8321 __RU_lA____M________________________ 4 3af7cc __RU_lA____M________________________ 5 1ed532 __RU_lA_____________________________ 6 2e436a __RU_lA_____________________________ 7 29a35e ___U_lA_____________________________ 8 2de86e ___U_lA_____________________________ 9 3bdfb4 ___U_lA_____________________________ 10 3cd8a3 ___U_lA_____________________________ 11 2afa50 ___U_lA_____________________________ 12 2534c2 ___U_lA_____________________________ 13 1b7a40 ___U_lA_____________________________ 14 17b0be ___U_lA_____________________________ 15 392b0c ___U_lA_____________________________ 16 3ba46a __RU_lA_____________________________ 17 397dc8 ___U_lA_____________________________ 18 1f2a36 ___U_lA_____________________________ 19 21fd30 __RU_lA_____________________________ 20 2c35ba __RU_l______________________________ 21 20f181 __RU_l______________________________ flags page-count MB symbolic-flags long-symbolic-flags 0x000000000000002c 2 0 __RU_l______________________________ referenced,uptodate,lru 0x0000000000000068 11 0 ___U_lA_____________________________ uptodate,lru,active 0x000000000000006c 4 0 __RU_lA_____________________________ referenced,uptodate,lru,active 0x000000000000086c 5 0 __RU_lA____M________________________ referenced,uptodate,lru,active,mmap total 22 0 ~/src/linux/tools/vm$ sudo ./page-types -f / flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000028 21761 85 ___U_l______________________________ uptodate,lru 0x000000000000002c 127279 497 __RU_l______________________________ referenced,uptodate,lru 0x0000000000000068 74160 289 ___U_lA_____________________________ uptodate,lru,active 0x000000000000006c 84469 329 __RU_lA_____________________________ referenced,uptodate,lru,active 0x000000000000007c 1 0 __RUDlA_____________________________ referenced,uptodate,dirty,lru,active 0x0000000000000228 370 1 ___U_l___I__________________________ uptodate,lru,reclaim 0x0000000000000828 49 0 ___U_l_____M________________________ uptodate,lru,mmap 0x000000000000082c 126 0 __RU_l_____M________________________ referenced,uptodate,lru,mmap 0x0000000000000868 137 0 ___U_lA____M________________________ uptodate,lru,active,mmap 0x000000000000086c 12890 50 __RU_lA____M________________________ referenced,uptodate,lru,active,mmap total 321242 1254 Signed-off-by: Konstantin Khlebnikov Cc: Arnaldo Carvalho de Melo Cc: Fengguang Wu Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 170 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 152 insertions(+), 18 deletions(-) diff --git 
a/tools/vm/page-types.c b/tools/vm/page-types.c index f9be24d9efac..05654f5e48d5 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -19,7 +19,8 @@ * Authors: Wu Fengguang */ -#define _LARGEFILE64_SOURCE +#define _FILE_OFFSET_BITS 64 +#define _GNU_SOURCE #include #include #include @@ -29,11 +30,14 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include "../../include/uapi/linux/magic.h" #include "../../include/uapi/linux/kernel-page-flags.h" #include @@ -158,6 +162,7 @@ static int opt_raw; /* for kernel developers */ static int opt_list; /* list pages (in ranges) */ static int opt_no_summary; /* don't show summary */ static pid_t opt_pid; /* process to walk */ +const char * opt_file; #define MAX_ADDR_RANGES 1024 static int nr_addr_ranges; @@ -253,12 +258,7 @@ static unsigned long do_u64_read(int fd, char *name, if (index > ULONG_MAX / 8) fatal("index overflow: %lu\n", index); - if (lseek(fd, index * 8, SEEK_SET) < 0) { - perror(name); - exit(EXIT_FAILURE); - } - - bytes = read(fd, buf, count * 8); + bytes = pread(fd, buf, count * 8, (off_t)index * 8); if (bytes < 0) { perror(name); exit(EXIT_FAILURE); @@ -343,8 +343,8 @@ static char *page_flag_longname(uint64_t flags) * page list and summary */ -static void show_page_range(unsigned long voffset, - unsigned long offset, uint64_t flags) +static void show_page_range(unsigned long voffset, unsigned long offset, + unsigned long size, uint64_t flags) { static uint64_t flags0; static unsigned long voff; @@ -352,14 +352,16 @@ static void show_page_range(unsigned long voffset, static unsigned long count; if (flags == flags0 && offset == index + count && - (!opt_pid || voffset == voff + count)) { - count++; + size && voffset == voff + count) { + count += size; return; } if (count) { if (opt_pid) printf("%lx\t", voff); + if (opt_file) + printf("%lu\t", voff); printf("%lx\t%lx\t%s\n", index, count, page_flag_name(flags0)); } @@ -367,7 +369,12 @@ static void show_page_range(unsigned long voffset, flags0 = flags; index = offset; voff = voffset; - count = 1; + count = size; +} + +static void flush_page_range(void) +{ + show_page_range(0, 0, 0, 0); } static void show_page(unsigned long voffset, @@ -375,6 +382,8 @@ static void show_page(unsigned long voffset, { if (opt_pid) printf("%lx\t", voffset); + if (opt_file) + printf("%lu\t", voffset); printf("%lx\t%s\n", offset, page_flag_name(flags)); } @@ -565,7 +574,7 @@ static void add_page(unsigned long voffset, unpoison_page(offset); if (opt_list == 1) - show_page_range(voffset, offset, flags); + show_page_range(voffset, offset, 1, flags); else if (opt_list == 2) show_page(voffset, offset, flags); @@ -667,7 +676,7 @@ static void walk_addr_ranges(void) for (i = 0; i < nr_addr_ranges; i++) if (!opt_pid) - walk_pfn(0, opt_offset[i], opt_size[i], 0); + walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0); else walk_task(opt_offset[i], opt_size[i]); @@ -699,9 +708,7 @@ static void usage(void) " -a|--addr addr-spec Walk a range of pages\n" " -b|--bits bits-spec Walk pages with specified bits\n" " -p|--pid pid Walk process address space\n" -#if 0 /* planned features */ " -f|--file filename Walk file address space\n" -#endif " -l|--list Show page details in ranges\n" " -L|--list-each Show page details one by one\n" " -N|--no-summary Don't show summary info\n" @@ -799,8 +806,130 @@ static void parse_pid(const char *str) fclose(file); } +static void show_file(const char *name, const struct stat *st) +{ + unsigned long long size = st->st_size; + 
char atime[64], mtime[64]; + long now = time(NULL); + + printf("%s\tInode: %u\tSize: %llu (%llu pages)\n", + name, (unsigned)st->st_ino, + size, (size + page_size - 1) / page_size); + + strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime)); + strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime)); + + printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n", + mtime, now - st->st_mtime, + atime, now - st->st_atime); +} + +static void walk_file(const char *name, const struct stat *st) +{ + uint8_t vec[PAGEMAP_BATCH]; + uint64_t buf[PAGEMAP_BATCH], flags; + unsigned long nr_pages, pfn, i; + int fd; + off_t off; + ssize_t len; + void *ptr; + int first = 1; + + fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); + + for (off = 0; off < st->st_size; off += len) { + nr_pages = (st->st_size - off + page_size - 1) / page_size; + if (nr_pages > PAGEMAP_BATCH) + nr_pages = PAGEMAP_BATCH; + len = nr_pages * page_size; + + ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off); + if (ptr == MAP_FAILED) + fatal("mmap failed: %s", name); + + /* determine cached pages */ + if (mincore(ptr, len, vec)) + fatal("mincore failed: %s", name); + + /* turn off readahead */ + if (madvise(ptr, len, MADV_RANDOM)) + fatal("madvice failed: %s", name); + + /* populate ptes */ + for (i = 0; i < nr_pages ; i++) { + if (vec[i] & 1) + (void)*(volatile int *)(ptr + i * page_size); + } + + /* turn off harvesting reference bits */ + if (madvise(ptr, len, MADV_SEQUENTIAL)) + fatal("madvice failed: %s", name); + + if (pagemap_read(buf, (unsigned long)ptr / page_size, + nr_pages) != nr_pages) + fatal("cannot read pagemap"); + + munmap(ptr, len); + + for (i = 0; i < nr_pages; i++) { + pfn = pagemap_pfn(buf[i]); + if (!pfn) + continue; + if (!kpageflags_read(&flags, pfn, 1)) + continue; + if (first && opt_list) { + first = 0; + flush_page_range(); + show_file(name, st); + } + add_page(off / page_size + i, pfn, flags, buf[i]); + } + } + + close(fd); +} + +int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f) +{ + (void)f; + switch (type) { + case FTW_F: + if (S_ISREG(st->st_mode)) + walk_file(name, st); + break; + case FTW_DNR: + fprintf(stderr, "cannot read dir: %s\n", name); + break; + } + return 0; +} + +static void walk_page_cache(void) +{ + struct stat st; + + kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); + pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); + + if (stat(opt_file, &st)) + fatal("stat failed: %s\n", opt_file); + + if (S_ISREG(st.st_mode)) { + walk_file(opt_file, &st); + } else if (S_ISDIR(st.st_mode)) { + /* do not follow symlinks and mountpoints */ + if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0) + fatal("nftw failed: %s\n", opt_file); + } else + fatal("unhandled file type: %s\n", opt_file); + + close(kpageflags_fd); + close(pagemap_fd); +} + static void parse_file(const char *name) { + opt_file = name; } static void parse_addr_range(const char *optarg) @@ -991,15 +1120,20 @@ int main(int argc, char *argv[]) if (opt_list && opt_pid) printf("voffset\t"); + if (opt_list && opt_file) + printf("foffset\t"); if (opt_list == 1) printf("offset\tlen\tflags\n"); if (opt_list == 2) printf("offset\tflags\n"); - walk_addr_ranges(); + if (opt_file) + walk_page_cache(); + else + walk_addr_ranges(); if (opt_list == 1) - show_page_range(0, 0, 0); /* drain the buffer */ + flush_page_range(); if (opt_no_summary) return 0; -- cgit From 179e09637c1bb17c0b6fb347353c7c52e6931fa5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 7 Apr 
2014 15:37:16 -0700 Subject: drivers/lguest/page_tables.c: rename do_set_pte() "mm: introduce vm_ops->map_pages()" wants to export a do_set_pte() from core kernel. Rename lguest's do_set_pte() to something more lguest-specific. Cc: "Kirill A. Shutemov" Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/lguest/page_tables.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index bfb39bb56ef1..e8b55c3a6170 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -887,7 +887,7 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */ -static void do_set_pte(struct lg_cpu *cpu, int idx, +static void __guest_set_pte(struct lg_cpu *cpu, int idx, unsigned long vaddr, pte_t gpte) { /* Look up the matching shadow page directory entry. */ @@ -960,13 +960,13 @@ void guest_set_pte(struct lg_cpu *cpu, unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) if (cpu->lg->pgdirs[i].pgdir) - do_set_pte(cpu, i, vaddr, gpte); + __guest_set_pte(cpu, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ int pgdir = find_pgdir(cpu->lg, gpgdir); if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) /* If so, do the update. */ - do_set_pte(cpu, pgdir, vaddr, gpte); + __guest_set_pte(cpu, pgdir, vaddr, gpte); } } -- cgit From 8c6e50b0290c4c708a3e6462729e1e9151a9a7df Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:18 -0700 Subject: mm: introduce vm_ops->map_pages() Here's new version of faultaround patchset. It took a while to tune it and collect performance data. First patch adds new callback ->map_pages to vm_operations_struct. ->map_pages() is called when VM asks to map easy accessible pages. Filesystem should find and map pages associated with offsets from "pgoff" till "max_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup page table entry. Pointer to entry associated with offset "pgoff" is passed in "pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". Currently VM use ->map_pages only on read page fault path. We try to map FAULT_AROUND_PAGES a time. FAULT_AROUND_PAGES is 16 for now. Performance data for different FAULT_AROUND_ORDER is below. TODO: - implement ->map_pages() for shmem/tmpfs; - modify get_user_pages() to be able to use ->map_pages() and implement mmap(MAP_POPULATE|MAP_NONBLOCK) on top. ========================================================================= Tested on 4-socket machine (120 threads) with 128GiB of RAM. Few real-world workloads. The sweet spot for FAULT_AROUND_ORDER here is somewhere between 3 and 5. 
Let's say 4 :) Linux build (make -j60) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 283,301,572 247,151,987 212,215,789 204,772,882 199,568,944 194,703,779 193,381,485 time, seconds 151.227629483 153.920996480 151.356125472 150.863792049 150.879207877 151.150764954 151.450962358 Linux rebuild (make -j60) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 5,396,854 4,148,444 2,855,286 2,577,282 2,361,957 2,169,573 2,112,643 time, seconds 27.404543757 27.559725591 27.030057426 26.855045126 26.678618635 26.974523490 26.761320095 Git test suite (make -j60 test) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 129,591,823 99,200,751 66,106,718 57,606,410 51,510,808 45,776,813 44,085,515 time, seconds 66.087215026 64.784546905 64.401156567 65.282708668 66.034016829 66.793780811 67.237810413 Two synthetic tests: access every word in file in sequential/random order. It doesn't improve much after FAULT_AROUND_ORDER == 4. Sequential access 16GiB file FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 1 thread minor-faults 4,195,437 2,098,275 525,068 262,251 131,170 32,856 8,282 time, seconds 7.250461742 6.461711074 5.493859139 5.488488147 5.707213983 5.898510832 5.109232856 8 threads minor-faults 33,557,540 16,892,728 4,515,848 2,366,999 1,423,382 442,732 142,339 time, seconds 16.649304881 9.312555263 6.612490639 6.394316732 6.669827501 6.75078944 6.371900528 32 threads minor-faults 134,228,222 67,526,810 17,725,386 9,716,537 4,763,731 1,668,921 537,200 time, seconds 49.164430543 29.712060103 12.938649729 10.175151004 11.840094583 9.594081325 9.928461797 60 threads minor-faults 251,687,988 126,146,952 32,919,406 18,208,804 10,458,947 2,733,907 928,217 time, seconds 86.260656897 49.626551828 22.335007632 17.608243696 16.523119035 16.339489186 16.326390902 120 threads minor-faults 503,352,863 252,939,677 67,039,168 35,191,827 19,170,091 4,688,357 1,471,862 time, seconds 124.589206333 79.757867787 39.508707872 32.167281632 29.972989292 28.729834575 28.042251622 Random access 1GiB file 1 thread minor-faults 262,636 132,743 34,369 17,299 8,527 3,451 1,222 time, seconds 15.351890914 16.613802482 16.569227308 15.179220992 16.557356122 16.578247824 15.365266994 8 threads minor-faults 2,098,948 1,061,871 273,690 154,501 87,110 25,663 7,384 time, seconds 15.040026343 15.096933500 14.474757288 14.289129964 14.411537468 14.296316837 14.395635804 32 threads minor-faults 8,390,734 4,231,023 1,054,432 528,847 269,242 97,746 26,881 time, seconds 20.430433109 21.585235358 22.115062928 14.872878951 14.880856305 14.883370649 14.821261690 60 threads minor-faults 15,733,258 7,892,809 1,973,393 988,266 594,789 164,994 51,691 time, seconds 26.577302548 25.692397770 18.728863715 20.153026398 21.619101933 17.745086260 17.613215273 120 threads minor-faults 31,471,111 15,816,616 3,959,209 1,978,685 1,008,299 264,635 96,010 time, seconds 41.835322703 40.459786095 36.085306105 35.313894834 35.814445675 36.552633793 34.289210594 Touch only one page in page table in 16GiB file FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 1 thread minor-faults 8,372 8,324 8,270 8,260 8,249 8,239 8,237 time, seconds 0.039892712 0.045369149 0.051846126 0.063681685 0.079095975 0.17652406 0.541213386 8 threads minor-faults 65,731 65,681 65,628 65,620 65,608 65,599 65,596 time, seconds 0.124159196 0.488600638 0.156854426 0.191901957 0.242631486 0.543569456 1.677303984 32 threads minor-faults 262,388 262,341 262,285 262,276 262,266 262,257 263,183 time, seconds 0.452421421 0.488600638 0.565020946 0.648229739 0.789850823 1.651584361 5.000361559 60 threads 
minor-faults 491,822 491,792 491,723 491,711 491,701 491,691 491,825 time, seconds 0.763288616 0.869620515 0.980727360 1.161732354 1.466915814 3.04041448 9.308612938 120 threads minor-faults 983,466 983,655 983,366 983,372 983,363 984,083 984,164 time, seconds 1.595846553 1.667902182 2.008959376 2.425380942 2.941368804 5.977807890 18.401846125 This patch (of 2): Introduce new vm_ops callback ->map_pages() and uses it for mapping easy accessible pages around fault address. On read page fault, if filesystem provides ->map_pages(), we try to map up to FAULT_AROUND_PAGES pages around page fault address in hope to reduce number of minor page faults. We call ->map_pages first and use ->fault() as fallback if page by the offset is not ready to be mapped (cold page cache or something). Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 10 +++++ include/linux/mm.h | 8 ++++ mm/memory.c | 81 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f424e0e5b46b..efca5c1bbb10 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -529,6 +529,7 @@ locking rules: open: yes close: yes fault: yes can return with page locked +map_pages: yes page_mkwrite: yes can return with page locked access: yes @@ -540,6 +541,15 @@ the page, then ensure it is not already truncated (the page lock will block subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. + ->map_pages() is called when VM asks to map easy accessible pages. +Filesystem should find and map pages associated with offsets from "pgoff" +till "max_pgoff". ->map_pages() is called with page table locked and must +not block. If it's not possible to reach a page without blocking, +filesystem should skip it. Filesystem should use do_set_pte() to setup +page table entry. Pointer to entry associated with offset "pgoff" is +passed in "pte" field in vm_fault structure. Pointers to entries for other +offsets should be calculated relative to "pte". + ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are no truncate/invalidate races, and then return with the page locked. If diff --git a/include/linux/mm.h b/include/linux/mm.h index c270fa68a32b..f710d32291e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -213,6 +213,10 @@ struct vm_fault { * is set (which is also implied by * VM_FAULT_ERROR). 
*/ + /* for ->map_pages() only */ + pgoff_t max_pgoff; /* map pages for offset from pgoff till + * max_pgoff inclusive */ + pte_t *pte; /* pte entry associated with ->pgoff */ }; /* @@ -224,6 +228,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ @@ -584,6 +589,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) pte = pte_mkwrite(pte); return pte; } + +void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon); #endif /* diff --git a/mm/memory.c b/mm/memory.c index c6ee34d10fcc..4eefb7e31521 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3342,7 +3342,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } -static void do_set_pte(struct vm_area_struct *vma, unsigned long address, +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. + */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, struct page *page, pte_t *pte, bool write, bool anon) { pte_t entry; @@ -3366,6 +3381,52 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } +#define FAULT_AROUND_ORDER 4 +#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER) +#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1) + +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + BUILD_BUG_ON(FAULT_AROUND_PAGES > PTRS_PER_PTE); + + start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start); + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + pte -= off; + pgoff -= off; + + /* + * max_pgoff is either end of page table or end of vma + * or FAULT_AROUND_PAGES from pgoff, depending what is neast. 
+ */ + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + FAULT_AROUND_PAGES - 1); + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; + } + + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) @@ -3373,7 +3434,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *fault_page; spinlock_t *ptl; pte_t *pte; - int ret; + int ret = 0; + + /* + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). + */ + if (vma->vm_ops->map_pages) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; + pte_unmap_unlock(pte, ptl); + } ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3387,8 +3461,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } do_set_pte(vma, address, fault_page, pte, false, false); - pte_unmap_unlock(pte, ptl); unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); return ret; } -- cgit From f1820361f83d556a7f0a9f629100f3825e594328 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:19 -0700 Subject: mm: implement ->map_pages for page cache filemap_map_pages() is a generic implementation of ->map_pages() for filesystems that use the page cache. It should be safe to use filemap_map_pages() for ->map_pages() if the filesystem uses filemap_fault() for ->fault(). Signed-off-by: Kirill A. 
Shutemov Acked-by: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 2 ++ fs/btrfs/file.c | 1 + fs/cifs/file.c | 1 + fs/ext4/file.c | 1 + fs/f2fs/file.c | 1 + fs/fuse/file.c | 1 + fs/gfs2/file.c | 1 + fs/nfs/file.c | 1 + fs/nilfs2/file.c | 1 + fs/ubifs/file.c | 1 + fs/xfs/xfs_file.c | 1 + include/linux/mm.h | 1 + mm/filemap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/nommu.c | 6 +++++ 14 files changed, 93 insertions(+) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a16b0ff497ca..d8223209d4b1 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e1ffb1e22898..c660527af838 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2025,6 +2025,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 834fce759d80..216d7e99f921 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3113,6 +3113,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6db7f7db7777..4e508fc83dcf 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0dfcef53a6ed..129a3bdb05ca 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -84,6 +84,7 @@ out: static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 65df7d8be4f5..48992cac714b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2117,6 +2117,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6c794085abac..80d67253623c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -494,6 +494,7 @@ out: static const struct vm_operations_struct 
gfs2_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5bb790a69c71..284ca901fe16 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 08fdb77852ac..f3a82fbcae02 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 123c79b7261e..4f34dbae823d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1538,6 +1538,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7abff8c16ca..003c0051b62f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1483,6 +1483,7 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/mm.h b/include/linux/mm.h index f710d32291e8..9132faed1a41 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1847,6 +1847,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ diff --git a/mm/filemap.c b/mm/filemap.c index 21781f1fe52b..3f9b5fbb623f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -2064,6 +2065,78 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? 
*/ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1; + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; @@ -2093,6 +2166,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/mm/nommu.c b/mm/nommu.c index a554e5a451cd..e19482533ce3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1985,6 +1985,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); +} +EXPORT_SYMBOL(filemap_map_pages); + int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff) { -- cgit From 99e3e53f4e0fd807607bf381e14f6de8feedd383 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:21 -0700 Subject: mm: cleanup size checks in filemap_fault() and filemap_map_pages() Minor cleanups: - 'size' variable is now in bytes, not pages; - use round_up(): it should be easier to read. Signed-off-by: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3f9b5fbb623f..b952d99c827c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1953,11 +1953,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - pgoff_t size; + loff_t size; int ret = 0; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (offset >= size) + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) return VM_FAULT_SIGBUS; /* @@ -2006,8 +2006,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. 
*/ - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(offset >= size)) { + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; @@ -2111,8 +2111,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1; - if (page->index >= size >> PAGE_CACHE_SHIFT) + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) goto unlock; pte = vmf->pte + page->index - vmf->pgoff; -- cgit From 1592eef01505177ed50149795a1560ec5a139df1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:22 -0700 Subject: mm: add debugfs tunable for fault_around_order Let's allow people to tweak faultaround at runtime. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 4eefb7e31521..1b88da5c08b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -3382,8 +3383,63 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, } #define FAULT_AROUND_ORDER 4 -#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER) -#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1) + +#ifdef CONFIG_DEBUG_FS +static unsigned int fault_around_order = FAULT_AROUND_ORDER; + +static int fault_around_order_get(void *data, u64 *val) +{ + *val = fault_around_order; + return 0; +} + +static int fault_around_order_set(void *data, u64 val) +{ + BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); + if (1UL << val > PTRS_PER_PTE) + return -EINVAL; + fault_around_order = val; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, + fault_around_order_get, fault_around_order_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, + &fault_around_order_fops); + if (!ret) + pr_warn("Failed to create fault_around_order in debugfs"); + return 0; +} +late_initcall(fault_around_debugfs); + +static inline unsigned long fault_around_pages(void) +{ + return 1UL << fault_around_order; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); +} +#else +static inline unsigned long fault_around_pages(void) +{ + unsigned long nr_pages; + + nr_pages = 1UL << FAULT_AROUND_ORDER; + BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); + return nr_pages; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); +} +#endif static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) @@ -3393,21 +3449,19 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, struct vm_fault vmf; int off; - BUILD_BUG_ON(FAULT_AROUND_PAGES > PTRS_PER_PTE); - - start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start); + start_addr = max(address & fault_around_mask(), 
vma->vm_start); off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); pte -= off; pgoff -= off; /* * max_pgoff is either end of page table or end of vma - * or FAULT_AROUND_PAGES from pgoff, depending what is neast. + * or fault_around_pages() from pgoff, depending what is neast. */ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + FAULT_AROUND_PAGES - 1); + pgoff + fault_around_pages() - 1); /* Check if it makes any sense to call ->map_pages */ while (!pte_none(*pte)) { -- cgit From d7c1755179b82d954f593ca5285b9360f2f62e9c Mon Sep 17 00:00:00 2001 From: Ning Qu Date: Mon, 7 Apr 2014 15:37:24 -0700 Subject: mm: implement ->map_pages for shmem/tmpfs In shmem/tmpfs, we also use the generic filemap_map_pages, seems the additional checking is not worth a separate version of map_pages for it. Signed-off-by: Ning Qu Acked-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Rik van Riel Cc: Dave Hansen Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index a3ba988ec946..70709347a1e2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2723,6 +2723,7 @@ static const struct super_operations shmem_ops = { static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, -- cgit From 615d6e8756c87149f2d4c1b93d471bca002bd849 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:37:25 -0700 Subject: mm: per-thread vma caching This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, thus further comparison with other approaches were needed. There are two things to consider when dealing with this, the cache hit rate and the latency of find_vma(). Improving the hit-rate does not necessarily translate in finding the vma any faster, as the overhead of any fancy caching schemes can be too high to consider. We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250%, for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%. The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question. Concretely, the following results are seen on an 80 core, 8 socket x86-64 box: 1) System bootup: Most programs are single threaded, so the per-thread scheme does improve ~50% hit rate by just adding a few more slots to the cache. 
+----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 50.61% | 19.90 | | patched | 73.45% | 13.58 | +----------------+----------+------------------+ 2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality. +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 75.28% | 11.03 | | patched | 88.09% | 9.31 | +----------------+----------+------------------+ 3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload. +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 70.66% | 17.14 | | patched | 91.15% | 12.57 | +----------------+----------+------------------+ 4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while baseline is just about non-existent. The amounts of cycles can fluctuate between anywhere from ~60 to ~116 for the baseline scheme, but this approach reduces it considerably. For instance, with 80 threads: +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 1.06% | 91.54 | | patched | 99.97% | 14.18 | +----------------+----------+------------------+ [akpm@linux-foundation.org: fix nommu build, per Davidlohr] [akpm@linux-foundation.org: document vmacache_valid() logic] [akpm@linux-foundation.org: attempt to untangle header files] [akpm@linux-foundation.org: add vmacache_find() BUG_ON] [hughd@google.com: add vmacache_valid_mm() (from Oleg)] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: adjust and enhance comments] Signed-off-by: Davidlohr Bueso Reviewed-by: Rik van Riel Acked-by: Linus Torvalds Reviewed-by: Michel Lespinasse Cc: Oleg Nesterov Tested-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/mmu_context.h | 4 +- fs/exec.c | 5 +- fs/proc/task_mmu.c | 3 +- include/linux/mm_types.h | 4 +- include/linux/sched.h | 7 ++ include/linux/vmacache.h | 38 +++++++++++ kernel/debug/debug_core.c | 14 +++- kernel/fork.c | 7 +- mm/Makefile | 2 +- mm/mmap.c | 55 ++++++++------- mm/nommu.c | 24 ++++--- mm/vmacache.c | 112 +++++++++++++++++++++++++++++++ 12 files changed, 231 insertions(+), 44 deletions(-) create mode 100644 include/linux/vmacache.h create mode 100644 mm/vmacache.c diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index fb5e4c658f7a..ef470a7a3d0f 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include @@ -73,7 +75,7 @@ do { \ else \ mm->mmap = NULL; \ rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ - mm->mmap_cache = NULL; \ + vmacache_invalidate(mm); \ mm->map_count--; \ remove_vma(high_vma); \ } \ diff --git a/fs/exec.c b/fs/exec.c index 25dfeba6d55f..b60ccf969a8b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm; /* 
Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fb52b548080d..442177b1119a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 290901a8c1de..2b58d192ea24 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,9 +342,9 @@ struct mm_rss_stat { struct kioctx_table; struct mm_struct { - struct vm_area_struct * mmap; /* list of VMAs */ + struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; - struct vm_area_struct * mmap_cache; /* last find_vma result */ + u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/sched.h b/include/linux/sched.h index 7cb07fd26680..642477dd814a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -132,6 +132,10 @@ struct perf_event_context; struct blk_plug; struct filename; +#define VMACACHE_BITS 2 +#define VMACACHE_SIZE (1U << VMACACHE_BITS) +#define VMACACHE_MASK (VMACACHE_SIZE - 1) + /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. @@ -1235,6 +1239,9 @@ struct task_struct { #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif + /* per-thread vma caching */ + u32 vmacache_seqnum; + struct vm_area_struct *vmacache[VMACACHE_SIZE]; #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h new file mode 100644 index 000000000000..c3fa0fd43949 --- /dev/null +++ b/include/linux/vmacache.h @@ -0,0 +1,38 @@ +#ifndef __LINUX_VMACACHE_H +#define __LINUX_VMACACHE_H + +#include +#include + +/* + * Hash based on the page number. Provides a good hit rate for + * workloads with good locality and those with random accesses as well. 
+ */ +#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) + +static inline void vmacache_flush(struct task_struct *tsk) +{ + memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); +} + +extern void vmacache_flush_all(struct mm_struct *mm); +extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); +extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, + unsigned long addr); + +#ifndef CONFIG_MMU +extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end); +#endif + +static inline void vmacache_invalidate(struct mm_struct *mm) +{ + mm->vmacache_seqnum++; + + /* deal with overflows */ + if (unlikely(mm->vmacache_seqnum == 0)) + vmacache_flush_all(mm); +} + +#endif /* __LINUX_VMACACHE_H */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 99982a70ddad..2956c8da1605 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index e40c0a01d5a6..bc0e96b78dfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; diff --git a/mm/Makefile b/mm/Makefile index cdd741519ee0..23a6f7e23019 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o \ + compaction.o balloon_compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/mmap.c b/mm/mmap.c index 46433e137abc..b1202cf81f4b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node *rb_node; + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; - rb_node = mm->mm_rb.rb_node; - vma = NULL; + rb_node = mm->mm_rb.rb_node; + vma = NULL; - while (rb_node) { - struct vm_area_struct *vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. */ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* diff --git a/mm/nommu.c b/mm/nommu.c index e19482533ce3..5d3f3524bbdc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(curr->mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..d4224b397c0e --- 
/dev/null +++ b/mm/vmacache.c @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include +#include +#include + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. + */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start <= addr && vma->vm_end > addr) { + BUG_ON(vma->vm_mm != mm); + return vma; + } + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) + return vma; + } + + return NULL; +} +#endif -- cgit From 3b32123d734cb414e366b35a3b2142a995f9d1a0 Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Mon, 7 Apr 2014 15:37:26 -0700 Subject: mm: use macros from compiler.h instead of __attribute__((...)) To increase compiler portability there is <linux/compiler.h> which provides convenience macros for various gcc constructs. Eg: __weak for __attribute__((weak)). I've replaced all instances of gcc attributes with the right macro in the memory management (/mm) subsystem. 
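For readers who have not opened that header, the substitution is purely textual; a minimal sketch of the idea (the exact definitions live in compiler-gcc.h and are quoted here from memory, not from this patch) is:

	/* roughly what <linux/compiler.h> provides via compiler-gcc.h */
	#define __weak	__attribute__((weak))

	/* so a weak stub such as */
	void __attribute__((weak)) vmalloc_sync_all(void);
	/* can instead be written as */
	void __weak vmalloc_sync_all(void);

Because the macro expands to the same attribute, the hunks below change only how declarations are spelled, not their behaviour.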
[akpm@linux-foundation.org: while-we're-there consistency tweaks] Signed-off-by: Gideon Israel Dsouza Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- mm/nommu.c | 3 ++- mm/sparse.c | 4 +++- mm/util.c | 5 +++-- mm/vmalloc.c | 4 +++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed5072c64daa..c5aa43993364 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3521,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* Can be overriden by architectures */ -__attribute__((weak)) struct page * +struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write) { diff --git a/mm/nommu.c b/mm/nommu.c index 5d3f3524bbdc..e68deff6d447 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -460,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } diff --git a/mm/sparse.c b/mm/sparse.c index 38cad8fd7397..d1b48b691ac8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include #include + #include "internal.h" #include #include @@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) } #endif -void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +void __weak __meminit vmemmap_populate_print_last(void) { } diff --git a/mm/util.c b/mm/util.c index a24aa22f2473..d7813e6d4cc7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * If the architecture not support this function, simply return with no * page pinned */ -int __attribute__((weak)) __get_user_pages_fast(unsigned long start, +int __weak __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { return 0; @@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. */ -int __attribute__((weak)) get_user_pages_fast(unsigned long start, +int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..a7b522f4851d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -27,7 +27,9 @@ #include #include #include +#include #include + #include #include #include @@ -2181,7 +2183,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } -- cgit From 514ddb446c5c5a238eca32b7052b7a8accae4e93 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:27 -0700 Subject: fork: collapse copy_flags into copy_process copy_flags() does not use the clone_flags formal and can be collapsed into copy_process() for cleaner code. 
Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index bc0e96b78dfd..c777964c0662 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1080,15 +1080,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1238,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); -- cgit From 2a389610a7331d22344698f23ef2e8c55b2cde7b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:29 -0700 Subject: mm, mempolicy: rename slab_node for clarity slab_node() is actually a mempolicy function, so rename it to mempolicy_slab_node() to make it clearer that it used for processes with mempolicies. At the same time, cleanup its code by saving numa_mem_id() in a local variable (since we require a node with memory, not just any node) and remove an obsolete comment that assumes the mempolicy is actually passed into the function. Signed-off-by: David Rientjes Acked-by: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- mm/mempolicy.c | 15 ++++++--------- mm/slab.c | 4 ++-- mm/slub.c | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5f1ea756aace..cfe55dfca015 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -151,7 +151,7 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask); -extern unsigned slab_node(void); +extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3ab02822799..0ad0ba31979f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1782,21 +1782,18 @@ static unsigned interleave_nodes(struct mempolicy *policy) /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. - * @policy must be protected by freeing by the caller. If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy. The system default policy requires no - * such protection. 
*/ -unsigned slab_node(void) +unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; + int node = numa_mem_id(); if (in_interrupt()) - return numa_node_id(); + return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + return node; switch (policy->mode) { case MPOL_PREFERRED: @@ -1816,11 +1813,11 @@ unsigned slab_node(void) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + return zone ? zone->node : node; } default: diff --git a/mm/slab.c b/mm/slab.c index 9153c802e2fe..4b17f4c2e92d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) - nid_alloc = slab_node(); + nid_alloc = mempolicy_slab_node(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), flags); retry: /* diff --git a/mm/slub.c b/mm/slub.c index fe6d7be22ef0..5b05e4fe9a1a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1685,7 +1685,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; -- cgit From f0432d159601f96839f514f286eaa5b75c4112dc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:30 -0700 Subject: mm, mempolicy: remove per-process flag PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users. There's no significant performance degradation to checking current->mempolicy rather than current->flags & PF_MEMPOLICY in the allocation path, especially since this is considered unlikely(). Running TCP_RR with netperf-2.4.5 through localhost on 16 cpu machine with 64GB of memory and without a mempolicy: threads before after 16 1249409 1244487 32 1281786 1246783 48 1239175 1239138 64 1244642 1241841 80 1244346 1248918 96 1266436 1254316 112 1307398 1312135 128 1327607 1326502 Per-process flags are a scarce resource so we should free them up whenever possible and make them available. We'll be using it shortly for memcg oom reserves. 
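A hedged sketch of the hot-path test this enables (the types and the flag value are simplified placeholders, not the kernel's): testing the mempolicy pointer directly costs one extra load on an already-unlikely branch, and it removes the need to keep a shadow PF_ bit in sync with the pointer on every policy change.

    #include <stddef.h>

    #define PF_SPREAD_SLAB 0x02000000       /* placeholder value */

    struct mempolicy;
    struct task { unsigned long flags; struct mempolicy *mempolicy; };

    /* before: p->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)
     * after:  check the pointer itself, no duplicate state to maintain */
    static inline int wants_alternate_node(const struct task *p)
    {
            return p->mempolicy != NULL || (p->flags & PF_SPREAD_SLAB);
    }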
Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 - include/linux/sched.h | 1 - kernel/fork.c | 1 - mm/mempolicy.c | 31 ------------------------------- mm/slab.c | 4 ++-- 5 files changed, 2 insertions(+), 36 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index cfe55dfca015..3c1b968da0ca 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -143,7 +143,6 @@ extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); -extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, diff --git a/include/linux/sched.h b/include/linux/sched.h index 642477dd814a..6c70645eb3b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1851,7 +1851,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */ diff --git a/kernel/fork.c b/kernel/fork.c index c777964c0662..e905e9c6b224 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1276,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0ad0ba31979f..78e1472933ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. 
- */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) @@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); diff --git a/mm/slab.c b/mm/slab.c index 4b17f4c2e92d..3db4cb06e32e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3027,7 +3027,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. @@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; -- cgit From 539a13b47e462d28c48f076c63871580f694a366 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:32 -0700 Subject: res_counter: remove interface for locked charging and uncharging The res_counter_{charge,uncharge}_locked() variants are not used in the kernel outside of the resource counter code itself, so remove the interface. Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/resource_counter.txt | 12 ++---------- include/linux/res_counter.h | 6 +----- kernel/res_counter.c | 23 ++++++++++++----------- 3 files changed, 15 insertions(+), 26 deletions(-) diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index 5108afb3645c..762ca54eb929 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -76,15 +76,7 @@ to work with it. limit_fail_at parameter is set to the particular res_counter element where the charging failed. - d. int res_counter_charge_locked - (struct res_counter *rc, unsigned long val, bool force) - - The same as res_counter_charge(), but it must not acquire/release the - res_counter->lock internally (it must be called with res_counter->lock - held). The force parameter indicates whether we can bypass the limit. - - e. u64 res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) + d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val) When a resource is released (freed) it should be de-accounted from the resource counter it was accounted to. This is called @@ -93,7 +85,7 @@ to work with it. The _locked routines imply that the res_counter->lock is taken. - f. u64 res_counter_uncharge_until + e. 
u64 res_counter_uncharge_until (struct res_counter *rc, struct res_counter *top, unsigned long val) diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 201a69749659..56b7bc32db4f 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -104,15 +104,13 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); * units, e.g. numbers, bytes, Kbytes, etc * * returns 0 on success and <0 if the counter->usage will exceed the - * counter->limit _locked call expects the counter->lock to be taken + * counter->limit * * charge_nofail works the same, except that it charges the resource * counter unconditionally, and returns < 0 if the after the current * charge we are over limit. */ -int __must_check res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force); int __must_check res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at); int res_counter_charge_nofail(struct res_counter *counter, @@ -125,12 +123,10 @@ int res_counter_charge_nofail(struct res_counter *counter, * @val: the amount of the resource * * these calls check for usage underflow and show a warning on the console - * _locked call expects the counter->lock to be taken * * returns the total charges still present in @counter. */ -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); u64 res_counter_uncharge_until(struct res_counter *counter, diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a305aede..51dbac6a3633 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) -- cgit From da1c67a76f7cf2b3404823d24f9f10fa91aa5dc5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:34 -0700 Subject: mm, compaction: determine isolation mode only once The conditions that control the isolation mode in isolate_migratepages_range() do not change during the iteration, so extract them out and only define the value once. This actually does have an effect, gcc doesn't optimize it itself because of cc->sync. 
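The change is a plain loop-invariant hoist; a minimal standalone sketch of the pattern (the mode bits and the isolation test are made up for illustration; the real flags are ISOLATE_ASYNC_MIGRATE and ISOLATE_UNEVICTABLE, as in the hunk below):

    #include <stdbool.h>

    enum { MODE_ASYNC = 1 << 0, MODE_UNEVICTABLE = 1 << 1 };

    /* stand-in for the per-page isolation test */
    static int try_isolate(int pfn, int mode)
    {
            return (pfn ^ mode) & 1;
    }

    static int scan_range(bool sync, bool unevictable, int start, int end)
    {
            /* neither input can change while the range is walked, so the
             * mode is computed once and marked const instead of OR-ing the
             * same bits into it for every page */
            const int mode = (!sync ? MODE_ASYNC : 0) |
                             (unevictable ? MODE_UNEVICTABLE : 0);
            int nr_isolated = 0;

            for (int pfn = start; pfn < end; pfn++)
                    nr_isolated += try_isolate(pfn, mode);
            return nr_isolated;
    }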
Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Rik van Riel Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 054c28b51c75..37f976287068 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; bool skipped_async_unsuitable = false; + const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + (unevictable ? ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -608,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (!cc->sync) - mode |= ISOLATE_ASYNC_MIGRATE; - - if (unevictable) - mode |= ISOLATE_UNEVICTABLE; - lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ -- cgit From eb9a3c62a0b6064c7f7e5b961ce00c646d21cb78 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 7 Apr 2014 15:37:35 -0700 Subject: mempool: add unlikely and likely hints Add unlikely and likely hints to the function mempool_free. It lays out the code in such a way that the common path is executed straightforwardly and saves a cache line. Signed-off-by: Mikulas Patocka Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 659aa42bad16..905434f18c97 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool) * ensures that there will be frees which return elements to the * pool waking up the waiters.
*/ - if (pool->curr_nr < pool->min_nr) { + if (unlikely(pool->curr_nr < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); - if (pool->curr_nr < pool->min_nr) { + if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); -- cgit From ac7149045d9fcca1063e22e4c6f607bca8fce268 Mon Sep 17 00:00:00 2001 From: Choi Gi-yong Date: Mon, 7 Apr 2014 15:37:36 -0700 Subject: mm: fix 'ERROR: do not initialise globals to 0 or NULL' and coding style Signed-off-by: Choi Gi-yong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- mm/nommu.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c5aa43993364..27938c441b6f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2898,8 +2898,7 @@ retry: if (anon_rmap) { ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); - } - else + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); diff --git a/mm/nommu.c b/mm/nommu.c index e68deff6d447..85f8d6698d48 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -298,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; memcpy(addr, buf, count); - return(count); + return count; } /* @@ -1012,8 +1012,7 @@ static int validate_mmap_request(struct file *file, /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; - } - else { + } else { /* we're going to read the file into private memory we * allocate */ if (!(capabilities & BDI_CAP_MAP_COPY)) @@ -1044,23 +1043,20 @@ static int validate_mmap_request(struct file *file, if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (prot & PROT_EXEC) return -EPERM; - } - else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { if (capabilities & BDI_CAP_EXEC_MAP) prot |= PROT_EXEC; } - } - else if ((prot & PROT_READ) && + } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP) ) { /* backing file is not executable, try to copy */ capabilities &= ~BDI_CAP_MAP_DIRECT; } - } - else { + } else { /* anonymous mappings are always memory backed and can be * privately mapped */ @@ -1668,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - static int limit = 0; + static int limit; if (limit < 5) { printk(KERN_WARNING "munmap of memory not mmapped by process %d" -- cgit From 3643763834b935208b067db9f4a239aba9dbe28d Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 7 Apr 2014 15:37:37 -0700 Subject: mm/vmalloc.c: enhance vm_map_ram() comment vm_map_ram() has a fragmentation problem when it cannot purge a chunk(ie, 4M address space) if there is a pinning object in that addresss space. So it could consume all VMALLOC address space easily. We can fix the fragmentation problem by using vmap instead of vm_map_ram() but vmap() is known to be slow compared to vm_map_ram(). Minchan said vm_map_ram is 5 times faster than vmap in his tests. So I thought we should fix fragment problem of vm_map_ram because our proprietary GPU driver has used it heavily. 
On second thought, it's not easy, because we would have to reuse freed space to solve the problem, and that could mean more IPIs and bitmap operations while searching for holes. It would work against the API's goal, which is very fast mapping. And the fragmentation problem wouldn't even show up on a 64-bit machine. Another option is for the user to separate long-lived and short-lived objects, using vmap for the former and vm_map_ram for the latter. If we inform users about this characteristic of vm_map_ram(), they can choose between the two according to page lifetime. Let's add a notice about this to the function's comment. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Gioh Kim Reviewed-by: Zhang Yanfei Cc: Minchan Kim Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a7b522f4851d..bf233b283319 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1085,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); * @node: prefer to allocate data structures on this node * @prot: memory protection to use. PAGE_KERNEL for regular RAM * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. + * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) -- cgit From d230dec18dc9a581f153c14cb5371640cd62543b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:38 -0700 Subject: mm: use 'const char *' instead of 'char *' for reason in dump_page() I tried to use 'dump_page(page, __func__)' for debugging, but it triggers a warning: warning: passing argument 2 of `dump_page' discards `const' qualifier from pointer target type [enabled by default] Let's convert 'reason' to 'const char *' in dump_page() and friends: we shouldn't modify it anyway. Signed-off-by: Kirill A.
Shutemov Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 4 ++-- mm/page_alloc.c | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 5042c036dda9..2d57efa64cc1 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -3,8 +3,8 @@ struct page; -extern void dump_page(struct page *page, char *reason); -extern void dump_page_badflags(struct page *page, char *reason, +extern void dump_page(struct page *page, const char *reason); +extern void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags); #ifdef CONFIG_DEBUG_VM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 336ee925f756..73c25912c7c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, char *reason, unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason, + unsigned long bad_flags) { static unsigned long resume; static unsigned long nr_shown; @@ -623,7 +624,7 @@ out: static inline int free_pages_check(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -6545,7 +6546,8 @@ static void dump_page_flags(unsigned long flags) printk(")\n"); } -void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", @@ -6561,7 +6563,7 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) mem_cgroup_print_bad_page(page); } -void dump_page(struct page *page, char *reason) +void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); } -- cgit From 7af467e8e1881fb65924329f46998e6e0d801038 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:39 -0700 Subject: mm: memcg: remove unnecessary preemption disabling lock_page_cgroup() disables preemption, remove explicit preemption disabling for code paths holding this lock. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dcc8153a1681..6e0f781412a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, bool anon, int nr_pages) { - preempt_disable(); - /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. 
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, } __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - - preempt_enable(); } unsigned long @@ -3748,17 +3744,14 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline -void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, - struct mem_cgroup *to, - unsigned int nr_pages, - enum mem_cgroup_stat_index idx) +static void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, + struct mem_cgroup *to, + unsigned int nr_pages, + enum mem_cgroup_stat_index idx) { /* Update stat data for mem_cgroup */ - preempt_disable(); __this_cpu_sub(from->stat->count[idx], nr_pages); __this_cpu_add(to->stat->count[idx], nr_pages); - preempt_enable(); } /** -- cgit From 59d1d256e156bb232700836b79d1ead5027f7b1d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:40 -0700 Subject: mm: memcg: remove mem_cgroup_move_account_page_stat() It used to disable preemption and run sanity checks but now it's only taking a number out of one percpu counter and putting it into another. Do this directly in the callsite and save the indirection. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e0f781412a2..b9928230a14c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3744,16 +3744,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, - struct mem_cgroup *to, - unsigned int nr_pages, - enum mem_cgroup_stat_index idx) -{ - /* Update stat data for mem_cgroup */ - __this_cpu_sub(from->stat->count[idx], nr_pages); - __this_cpu_add(to->stat->count[idx], nr_pages); -} - /** * mem_cgroup_move_account - move account of the page * @page: the page @@ -3799,13 +3789,19 @@ static int mem_cgroup_move_account(struct page *page, move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_FILE_MAPPED); + if (!anon && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } - if (PageWriteback(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_WRITEBACK); + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } mem_cgroup_charge_statistics(from, page, anon, -nr_pages); -- cgit From 1bec6b333e241a9db47d3939fb08a4e174ece02f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:41 -0700 Subject: mm: memcg: inline mem_cgroup_charge_common() mem_cgroup_charge_common() is used by both cache and anon pages, but most of its body only applies to anon pages and the remainder is not worth having in a separate function. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b9928230a14c..5aab347a5b0b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3887,20 +3887,21 @@ out: return ret; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +int mem_cgroup_newpage_charge(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; bool oom = true; int ret; + if (mem_cgroup_disabled()) + return 0; + + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); + VM_BUG_ON(!mm); + if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -3914,22 +3915,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); + __mem_cgroup_commit_charge(memcg, page, nr_pages, + MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; } -int mem_cgroup_newpage_charge(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_disabled()) - return 0; - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - /* * While swap-in, try_charge -> commit or cancel, the page is locked. * And when try_charge() successfully returns, one refcnt to memcg without @@ -4047,9 +4037,11 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ + if (!PageSwapCache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); + if (ret != -ENOMEM) + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + } else { /* page is swapcache/shmem */ ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) -- cgit From 284f39afeaa4ab1409b8f43b29cdea3007960ee3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:41 -0700 Subject: mm: memcg: push !mm handling out to page cache charge function Only page cache charges can happen without an mm context, so push this special case out of the inner core and into the cache charge function. An ancient comment explains that the mm can also be NULL in case the task is currently being migrated, but that is not actually true with the current case, so just remove it. 
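Roughly, the shape of the change looks like this (names are illustrative, not the real memcg API): the fallback for a missing mm lives in the one entry point that can legitimately see it, e.g. page cache insertions during disk probing at boot, so the core charge path no longer has to reason about NULL.

    struct mem_group { long usage; };

    static struct mem_group root_group;

    /* core path: may now assume it is always handed a valid group */
    static int charge_core(struct mem_group *grp, unsigned int nr_pages)
    {
            grp->usage += nr_pages;
            return 0;
    }

    /* cache-charge entry point: the only caller that can see a NULL
     * task group, so the root fallback is handled here and nowhere else */
    static int charge_file(struct mem_group *task_grp, unsigned int nr_pages)
    {
            if (!task_grp)
                    task_grp = &root_group;
            return charge_core(task_grp, nr_pages);
    }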
Signed-off-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5aab347a5b0b..8d6cedd16f8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2703,15 +2703,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; - - /* - * We always charge the cgroup the mm_struct belongs to. - * The mm_struct's mem_cgroup changes on task migration if the - * thread group leader migrates. It's possible that mm is not - * set, if so charge the root memcg (happens for pagecache usage). - */ - if (!*ptr && !mm) - *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -4038,6 +4029,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (!PageSwapCache(page)) { + /* + * Page cache insertions can happen without an actual + * task context, e.g. during disk probing on boot. + */ + if (!mm) + memcg = root_mem_cgroup; ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); if (ret != -ENOMEM) __mem_cgroup_commit_charge(memcg, page, 1, type, false); -- cgit From 03583f1a631c0511dfd2f16e716d5b40f675de5a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:42 -0700 Subject: memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm() Users pass either a mm that has been established under task lock, or use a verified current->mm, which means the task can't be exiting. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8d6cedd16f8d..c3b674f9774f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1075,13 +1075,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; - if (!mm) - return NULL; - /* - * Because we have no locks, mm->owner's may be being moved to other - * cgroup. We use css_tryget() here even if this looks - * pessimistic (rather than adding locks here). - */ rcu_read_lock(); do { memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); -- cgit From df381975463996178d685f6ef7d3555c5f887201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:43 -0700 Subject: memcg: get_mem_cgroup_from_mm() Instead of returning NULL from try_get_mem_cgroup_from_mm() when the mm owner is exiting, just return root_mem_cgroup. This makes sense for all callsites and gets rid of some of them having to fallback manually. 
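A hedged sketch of the resulting lookup (simplified and single-threaded; a plain counter stands in for css_tryget() and for the RCU-protected owner dereference): by falling back to a root group instead of returning NULL, every caller gets back a usable, referenced group.

    struct mem_group { int refs; };

    static struct mem_group root_group = { .refs = 1 };
    static struct mem_group *current_owner;     /* stand-in for mm->owner's group */

    static int try_get(struct mem_group *grp)
    {
            if (grp->refs <= 0)
                    return 0;       /* group is being torn down, retry */
            grp->refs++;
            return 1;
    }

    static struct mem_group *get_group_from_mm(void)
    {
            struct mem_group *grp;

            /* the retry relies on current_owner being re-read each pass,
             * as the real code re-reads mm->owner under RCU */
            do {
                    grp = current_owner;
                    if (!grp)               /* owner exiting: use the root */
                            grp = &root_group;
            } while (!try_get(grp));
            return grp;
    }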
[fengguang.wu@intel.com: fix warnings] Signed-off-by: Johannes Weiner Signed-off-by: Fengguang Wu Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ mm/memcontrol.c | 18 ++++-------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eccfb4a4b379..134636f835f7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,7 +94,6 @@ bool task_in_mem_cgroup(struct task_struct *task, extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); @@ -294,11 +293,6 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return NULL; } -static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) -{ - return NULL; -} - static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3b674f9774f..87c3ec37dd26 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1071,7 +1071,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; @@ -1079,7 +1079,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) do { memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) - break; + memcg = root_mem_cgroup; } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; @@ -1475,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task, p = find_lock_task_mm(task); if (p) { - curr = try_get_mem_cgroup_from_mm(p->mm); + curr = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1489,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task, css_get(&curr->css); rcu_read_unlock(); } - if (!curr) - return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -3617,15 +3615,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) if (!current->mm || current->memcg_kmem_skip_account) return true; - memcg = try_get_mem_cgroup_from_mm(current->mm); - - /* - * very rare case described in mem_cgroup_from_task. Unfortunately there - * isn't much we can do without complicating this too much, and it would - * be gfp-dependent anyway. Just let it go - */ - if (unlikely(!memcg)) - return true; + memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_can_account_kmem(memcg)) { css_put(&memcg->css); -- cgit From b6b6cc72bc404c952968530d7df4c3a4ab82b65b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:44 -0700 Subject: memcg: do not replicate get_mem_cgroup_from_mm in __mem_cgroup_try_charge __mem_cgroup_try_charge duplicates get_mem_cgroup_from_mm for charges which came without a memcg. The only reason seems to be a tiny optimization when css_tryget is not called if the charge can be consumed from the stock. 
Nevertheless css_tryget is very cheap since it has been reworked to use per-cpu counting so this optimization doesn't give us anything these days. So let's drop the code duplication so that the code is more readable. Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 50 ++++++-------------------------------------------- 1 file changed, 6 insertions(+), 44 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 87c3ec37dd26..7480022d4655 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2697,52 +2697,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - if (mem_cgroup_is_root(memcg)) - goto done; - if (consume_stock(memcg, nr_pages)) - goto done; css_get(&memcg->css); } else { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(mm->owner); - /* - * Because we don't have task_lock(), "p" can exit. - * In that case, "memcg" can point to root or p can be NULL with - * race with swapoff. Then, we have small risk of mis-accouning. - * But such kind of mis-account by race always happens because - * we don't have cgroup_mutex(). It's overkill and we allo that - * small race, here. - * (*) swapoff at el will charge against mm-struct not against - * task-struct. So, mm->owner can be NULL. - */ - memcg = mem_cgroup_from_task(p); - if (!memcg) - memcg = root_mem_cgroup; - if (mem_cgroup_is_root(memcg)) { - rcu_read_unlock(); - goto done; - } - if (consume_stock(memcg, nr_pages)) { - /* - * It seems dagerous to access memcg without css_get(). - * But considering how consume_stok works, it's not - * necessary. If consume_stock success, some charges - * from this memcg are cached on this cpu. So, we - * don't need to call css_get()/css_tryget() before - * calling consume_stock(). - */ - rcu_read_unlock(); - goto done; - } - /* after here, we may be blocked. we need to get refcnt */ - if (!css_tryget(&memcg->css)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); + memcg = get_mem_cgroup_from_mm(mm); } + if (mem_cgroup_is_root(memcg)) + goto done; + if (consume_stock(memcg, nr_pages)) + goto done; do { bool invoke_oom = oom && !nr_oom_retries; @@ -2778,8 +2740,8 @@ again: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - css_put(&memcg->css); done: + css_put(&memcg->css); *ptr = memcg; return 0; nomem: -- cgit From 6d1fdc48938cd51a3964778d78f27cb26c8eb55d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:45 -0700 Subject: memcg: sanitize __mem_cgroup_try_charge() call protocol Some callsites pass a memcg directly, some callsites pass an mm that then has to be translated to a memcg. This makes for a terrible function interface. Just push the mm-to-memcg translation into the respective callsites and always pass a memcg to mem_cgroup_try_charge(). 
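The resulting call protocol can be sketched roughly as follows (illustrative names, no RCU or error codes; the helper actually added below is mem_cgroup_try_charge_mm()): the mm-to-group translation happens once, in a thin wrapper, and the core charge function only ever deals with an explicit group.

    #include <stddef.h>

    struct mem_group { long usage, limit; };

    /* core charge path: always takes an explicit group */
    static int try_charge(struct mem_group *grp, long nr_pages)
    {
            if (grp->usage + nr_pages > grp->limit)
                    return -1;
            grp->usage += nr_pages;
            return 0;
    }

    /* thin wrapper: resolve the "mm" to its group once, then charge it;
     * the lookup is faked here, the real code pins the group first */
    static struct mem_group *try_charge_mm(struct mem_group *mm_group,
                                           long nr_pages)
    {
            struct mem_group *grp = mm_group;   /* stand-in for the lookup */

            if (try_charge(grp, nr_pages))
                    return NULL;
            return grp;
    }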
[mhocko@suse.cz: add charge mm helper] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 207 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 102 insertions(+), 105 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7480022d4655..038b037f8d67 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2575,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, } -/* See __mem_cgroup_try_charge() for details */ +/* See mem_cgroup_try_charge() for details */ enum { CHARGE_OK, /* success */ CHARGE_RETRY, /* need to retry but retry is not bad */ @@ -2648,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_NOMEM; } -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update res_counter - * 3. call memory reclaim if necessary. - * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - * 0 ... on success, filling *ptr with a valid memcg pointer. - * -ENOMEM ... charge failure because of resource limits. - * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - struct mem_cgroup **ptr, - bool oom) +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *memcg = NULL; int ret; + if (mem_cgroup_is_root(memcg)) + goto done; /* - * Unlike gloval-vm's OOM-kill, we're not in memory shortage - * in system level. So, allow to go ahead dying process in addition to - * MEMDIE process. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. 
*/ - if (unlikely(test_thread_flag(TIF_MEMDIE) - || fatal_signal_pending(current))) + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current))) goto bypass; if (unlikely(task_in_memcg_oom(current))) @@ -2695,14 +2684,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; again: - if (*ptr) { /* css should be a valid one */ - memcg = *ptr; - css_get(&memcg->css); - } else { - memcg = get_mem_cgroup_from_mm(mm); - } - if (mem_cgroup_is_root(memcg)) - goto done; if (consume_stock(memcg, nr_pages)) goto done; @@ -2710,10 +2691,8 @@ again: bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ - if (fatal_signal_pending(current)) { - css_put(&memcg->css); + if (fatal_signal_pending(current)) goto bypass; - } ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, invoke_oom); @@ -2722,17 +2701,12 @@ again: break; case CHARGE_RETRY: /* not in OOM situation but retry */ batch = nr_pages; - css_put(&memcg->css); - memcg = NULL; goto again; case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) { - css_put(&memcg->css); + if (!oom || invoke_oom) goto nomem; - } nr_oom_retries--; break; } @@ -2741,19 +2715,43 @@ again: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: - css_put(&memcg->css); - *ptr = memcg; return 0; nomem: - if (!(gfp_mask & __GFP_NOFAIL)) { - *ptr = NULL; + if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; - } bypass: - *ptr = root_mem_cgroup; return -EINTR; } +/** + * mem_cgroup_try_charge_mm - try charging a mm + * @mm: mm_struct to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails + * + * Returns the charged mem_cgroup associated with the given mm_struct or + * NULL the charge failed. + */ +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) + +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + css_put(&memcg->css); + if (ret == -EINTR) + memcg = root_mem_cgroup; + else if (ret) + memcg = NULL; + + return memcg; +} + /* * Somemtimes we have to undo a charge we got by try_charge(). * This function is for that and do uncharge, put css's refcnt. @@ -2949,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; - struct mem_cgroup *_memcg; int ret = 0; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - _memcg = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, oom_gfp_allowed(gfp)); - + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, + oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* - * __mem_cgroup_try_charge() chosed to bypass to root due to + * mem_cgroup_try_charge() chosed to bypass to root due to * OOM kill or fatal signal. Since our only options are to * either fail the allocation or charge it to this cgroup, do * it as a temporary condition. But we can't fail. From a @@ -2972,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) * * This condition will only trigger if the task entered * memcg_charge_kmem in a sane state, but was OOM-killed during - * __mem_cgroup_try_charge() above. Tasks that were already + * mem_cgroup_try_charge() above. 
Tasks that were already * dying when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ @@ -3826,10 +3821,9 @@ out: int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; + struct mem_cgroup *memcg; bool oom = true; - int ret; if (mem_cgroup_disabled()) return 0; @@ -3848,9 +3842,9 @@ int mem_cgroup_newpage_charge(struct page *page, oom = false; } - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret == -ENOMEM) - return ret; + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + if (!memcg) + return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; @@ -3867,7 +3861,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, gfp_t mask, struct mem_cgroup **memcgp) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; int ret; @@ -3880,31 +3874,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, * in turn serializes uncharging. */ if (PageCgroupUsed(pc)) - return 0; - if (!do_swap_account) - goto charge_cur_mm; - memcg = try_get_mem_cgroup_from_page(page); + goto out; + if (do_swap_account) + memcg = try_get_mem_cgroup_from_page(page); if (!memcg) - goto charge_cur_mm; - *memcgp = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, mask, 1, true); css_put(&memcg->css); if (ret == -EINTR) - ret = 0; - return ret; -charge_cur_mm: - ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; + memcg = root_mem_cgroup; + else if (ret) + return ret; +out: + *memcgp = memcg; + return 0; } int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) { - *memcgp = NULL; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled()) { + *memcgp = NULL; return 0; + } /* * A racing thread's fault, or swapoff, may have already * updated the pte, and even removed page from swap cache: in @@ -3912,12 +3904,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, * there's also a KSM case which does need to charge the page. */ if (!PageSwapCache(page)) { - int ret; + struct mem_cgroup *memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; + *memcgp = memcg; + return 0; } return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); } @@ -3964,8 +3957,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + struct mem_cgroup *memcg; int ret; if (mem_cgroup_disabled()) @@ -3973,23 +3966,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (!PageSwapCache(page)) { - /* - * Page cache insertions can happen without an actual - * task context, e.g. during disk probing on boot. 
- */ - if (!mm) - memcg = root_mem_cgroup; - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); - if (ret != -ENOMEM) - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - } else { /* page is swapcache/shmem */ + if (PageSwapCache(page)) { /* shmem */ ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); - if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, type); + if (ret) + return ret; + __mem_cgroup_commit_charge_swapin(page, memcg, type); + return 0; } - return ret; + + /* + * Page cache insertions can happen without an actual mm + * context, e.g. during disk probing on boot. + */ + if (unlikely(!mm)) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; + } + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + return 0; } static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, @@ -6601,8 +6599,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, - GFP_KERNEL, 1, &memcg, false); + ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ return ret; -- cgit From d715ae08f2ff87508a081c4df78061bf4f7211d6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:46 -0700 Subject: memcg: rename high level charging functions mem_cgroup_newpage_charge is used only for charging anonymous memory so it is better to rename it to mem_cgroup_charge_anon. mem_cgroup_cache_charge is used for file backed memory so rename it to mem_cgroup_charge_file. Signed-off-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 4 ++-- include/linux/memcontrol.h | 8 ++++---- mm/filemap.c | 2 +- mm/huge_memory.c | 8 ++++---- mm/memcontrol.c | 4 ++-- mm/memory.c | 6 +++--- mm/shmem.c | 6 +++--- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index ce94a83a7d9a..80ac454704b8 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -24,7 +24,7 @@ Please note that implementation details can be changed. a page/swp_entry may be charged (usage += PAGE_SIZE) at - mem_cgroup_newpage_charge() + mem_cgroup_charge_anon() Called at new page fault and Copy-On-Write. mem_cgroup_try_charge_swapin() @@ -32,7 +32,7 @@ Please note that implementation details can be changed. Followed by charge-commit-cancel protocol. (With swap accounting) At commit, a charge recorded in swap_cgroup is removed. - mem_cgroup_cache_charge() + mem_cgroup_charge_file() Called at add_to_page_cache() mem_cgroup_cache_charge_swapin() diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 134636f835f7..96f3fc87ab96 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie { * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) 
*/ -extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, @@ -74,7 +74,7 @@ extern void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg); extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); -extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); @@ -233,13 +233,13 @@ void mem_cgroup_print_bad_page(struct page *page); #else /* CONFIG_MEMCG */ struct mem_cgroup; -static inline int mem_cgroup_newpage_charge(struct page *page, +static inline int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; } -static inline int mem_cgroup_cache_charge(struct page *page, +static inline int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; diff --git a/mm/filemap.c b/mm/filemap.c index b952d99c827c..27ebc0c9571b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -563,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) return error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f4981418fc..64635f5278ff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, + mem_cgroup_charge_anon(pages[i], mm, GFP_KERNEL))) { if (pages[i]) put_page(pages[i]); @@ -1101,7 +1101,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2359,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) return; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 038b037f8d67..e33b1d09eb1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3818,7 +3818,7 @@ out: return ret; } -int mem_cgroup_newpage_charge(struct page *page, +int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { unsigned int nr_pages = 1; @@ -3954,7 +3954,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, MEM_CGROUP_CHARGE_TYPE_ANON); } -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; diff --git a/mm/memory.c 
b/mm/memory.c index 1b88da5c08b3..854e4027719f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2828,7 +2828,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -3281,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3537,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { page_cache_release(new_page); return VM_FAULT_OOM; } diff --git a/mm/shmem.c b/mm/shmem.c index 70709347a1e2..70273f8df586 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1080,7 +1080,7 @@ repeat: goto failed; } - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1134,7 +1134,7 @@ repeat: SetPageSwapBacked(page); __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) goto decused; -- cgit From 3a025760fc158b3726eac89ee95d7f29599e9dfa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:48 -0700 Subject: mm: page_alloc: spill to remote nodes before waking kswapd On NUMA systems, a node may start thrashing cache or even swap anonymous pages while there are still free pages on remote nodes. This is a result of commits 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy") and fff4068cba48 ("mm: page_alloc: revert NUMA aspect of fair allocation policy"). Before those changes, the allocator would first try all allowed zones, including those on remote nodes, before waking any kswapds. But now, the allocator fastpath doubles as the fairness pass, which in turn can only consider the local node to prevent remote spilling based on exhausted fairness batches alone. Remote nodes are only considered in the slowpath, after the kswapds are woken up. But if remote nodes still have free memory, kswapd should not be woken to rebalance the local node or it may thrash cash or swap prematurely. Fix this by adding one more unfair pass over the zonelist that is allowed to spill to remote nodes after the local fairness pass fails but before entering the slowpath and waking the kswapds. This also gets rid of the GFP_THISNODE exemption from the fairness protocol because the unfair pass is no longer tied to kswapd, which GFP_THISNODE is not allowed to wake up. However, because remote spills can be more frequent now - we prefer them over local kswapd reclaim - the allocation batches on remote nodes could underflow more heavily. 
When resetting the batches, use atomic_long_read() directly instead of zone_page_state() to calculate the delta as the latter filters negative counter values. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 1 + mm/page_alloc.c | 89 +++++++++++++++++++++++++++++---------------------------- 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 29e1e761f9eb..3e910000fda4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73c25912c7c4..15d140755e71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; -} -#else -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return false; -} #endif /* @@ -1584,12 +1575,7 @@ again: get_pageblock_migratetype(page)); } - /* - * NOTE: GFP_THISNODE allocations do not partake in the kswapd - * aging protocol, so they can't be fair. - */ - if (!gfp_thisnode_allocation(gfp_flags)) - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1955,23 +1941,12 @@ zonelist_scan: * zone size to ensure fair page aging. The zone a * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. - * - * Try to stay in local zones in the fastpath. If - * that fails, the slowpath is entered, which will do - * another pass starting with the local zones, but - * ultimately fall back to remote zones that do not - * partake in the fairness round-robin cycle of this - * zonelist. - * - * NOTE: GFP_THISNODE allocations do not partake in - * the kswapd aging protocol, so they can't be fair. 
*/ - if ((alloc_flags & ALLOC_WMARK_LOW) && - !gfp_thisnode_allocation(gfp_mask)) { - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) continue; + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; } /* * When allocating a page cache page for writing, we @@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) +static void reset_alloc_batches(struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); /* * Only reset the batches of zones that were actually - * considered in the fast path, we don't want to - * thrash fairness information for zones that are not + * considered in the fairness pass, we don't want to + * trash fairness information for zones that are not * actually part of this zonelist's round-robin cycle. */ if (!zone_local(preferred_zone, zone)) continue; mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); } } +static void wake_all_kswapds(unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); +} + static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { @@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (gfp_thisnode_allocation(gfp_mask)) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: - prepare_slowpath(gfp_mask, order, zonelist, - high_zoneidx, preferred_zone); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2753,11 +2737,28 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif +retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if (unlikely(!page)) { + /* + * The first pass makes sure allocations are spread + * fairly within the local node. However, the local + * node might have free pages left after the fairness + * batches are exhausted, and remote zones haven't + * even been considered yet. 
Try once more without + * fairness, and include remote zones now, before + * entering the slowpath and waking kswapd: prefer + * spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + reset_alloc_batches(zonelist, high_zoneidx, + preferred_zone); + alloc_flags &= ~ALLOC_FAIR; + goto retry; + } /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not -- cgit From 57e68e9cd65b4b8eb4045a1e0d0746458502554c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 7 Apr 2014 15:37:50 -0700 Subject: mm: try_to_unmap_cluster() should lock_page() before mlocking A BUG_ON(!PageLocked) was triggered in mlock_vma_page() by Sasha Levin fuzzing with trinity. The call site try_to_unmap_cluster() does not lock the pages other than its check_page parameter (which is already locked). The BUG_ON in mlock_vma_page() is not documented and its purpose is somewhat unclear, but apparently it serializes against page migration, which could otherwise fail to transfer the PG_mlocked flag. This would not be fatal, as the page would be eventually encountered again, but NR_MLOCK accounting would become distorted nevertheless. This patch adds a comment to the BUG_ON in mlock_vma_page() and munlock_vma_page() to that effect. The call site try_to_unmap_cluster() is fixed so that for page != check_page, trylock_page() is attempted (to avoid possible deadlocks as we already have check_page locked) and mlock_vma_page() is performed only upon success. If the page lock cannot be obtained, the page is left without PG_mlocked, which is again not a problem in the whole unevictable memory design. Signed-off-by: Vlastimil Babka Signed-off-by: Bob Liu Reported-by: Sasha Levin Cc: Wanpeng Li Cc: Michel Lespinasse Cc: KOSAKI Motohiro Acked-by: Rik van Riel Cc: David Rientjes Cc: Mel Gorman Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 ++ mm/rmap.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 4e1a68162285..b1eb53634005 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page) */ void mlock_vma_page(struct page *page) { + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { @@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages; struct zone *zone = page_zone(page); + /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); /* diff --git a/mm/rmap.c b/mm/rmap.c index 11cf322f8133..9c3e77396d1a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, BUG_ON(!page || PageAnon(page)); if (locked_vma) { - mlock_vma_page(page); /* no-op if already mlocked */ - if (page == check_page) + if (page == check_page) { + /* we know we have check_page locked */ + mlock_vma_page(page); ret = SWAP_MLOCK; + } else if (trylock_page(page)) { + /* + * If we can lock the page, perform mlock. + * Otherwise leave the page alone, it will be + * eventually encountered again later. 
+ */ + mlock_vma_page(page); + unlock_page(page); + } continue; /* don't unmap */ } -- cgit From ed6d7c8e578331cad594ee70d60e2e146b5dce7b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Apr 2014 15:37:51 -0700 Subject: mm: remove unused arg of set_page_dirty_balance() There's only one caller of set_page_dirty_balance() and that will call it with page_mkwrite == 0. The page_mkwrite argument was unused since commit b827e496c893 "mm: close page_mkwrite races". Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 +- mm/memory.c | 2 +- mm/page-writeback.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 021b8a319b9e..5777c13849ba 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -178,7 +178,7 @@ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); -void set_page_dirty_balance(struct page *page, int page_mkwrite); +void set_page_dirty_balance(struct page *page); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/memory.c b/mm/memory.c index 854e4027719f..d0f0bef3be48 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2782,7 +2782,7 @@ reuse: */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + set_page_dirty_balance(dirty_page); /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1aca8e..ef413492a149 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1562,9 +1562,9 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page, int page_mkwrite) +void set_page_dirty_balance(struct page *page) { - if (set_page_dirty(page) || page_mkwrite) { + if (set_page_dirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping) -- cgit From 136199f0a67cd6bb3f0e8de0ad50f52879f82077 Mon Sep 17 00:00:00 2001 From: Emil Medve Date: Mon, 7 Apr 2014 15:37:52 -0700 Subject: memblock: use for_each_memblock() This is a small cleanup. 
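In outline, the cleanup replaces the open-coded index loops over memblock.memory with the existing iterator helper; a condensed before/after sketch (loop bodies elided, taken from the hunks below):

	struct memblock_region *r;
	unsigned long i;

	/* before: walk the region array by hand */
	for (i = 0; i < memblock.memory.cnt; i++) {
		r = &memblock.memory.regions[i];
		/* ... use r->base, r->size, r->nid ... */
	}

	/* after: let the helper do the walking */
	for_each_memblock(memory, r) {
		/* ... use r->base, r->size, r->nid ... */
	}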
Signed-off-by: Emil Medve Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 24 +++++++++++------------- mm/page_alloc.c | 10 +++++----- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 7fe5354e7552..c5c20c46f97e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - unsigned long i; phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + struct memblock_region *r; if (!limit) return; /* find out max address */ - for (i = 0; i < memblock.memory.cnt; i++) { - struct memblock_region *r = &memblock.memory.regions[i]; - + for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si void __init_memblock memblock_trim_memory(phys_addr_t align) { - int i; phys_addr_t start, end, orig_start, orig_end; - struct memblock_type *mem = &memblock.memory; + struct memblock_region *r; - for (i = 0; i < mem->cnt; i++) { - orig_start = mem->regions[i].base; - orig_end = mem->regions[i].base + mem->regions[i].size; + for_each_memblock(memory, r) { + orig_start = r->base; + orig_end = r->base + r->size; start = round_up(orig_start, align); end = round_down(orig_end, align); @@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align) continue; if (start < end) { - mem->regions[i].base = start; - mem->regions[i].size = end - start; + r->base = start; + r->size = end - start; } else { - memblock_remove_region(mem, i); - i--; + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; } } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15d140755e71..48427a7cfb45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5073,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); - struct memblock_type *type = &memblock.memory; + struct memblock_region *r; /* Need to find movable_zone earlier when movable_node is specified. */ find_usable_zone_for_movable(); @@ -5083,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for (i = 0; i < type->cnt; i++) { - if (!memblock_is_hotpluggable(&type->regions[i])) + for_each_memblock(memory, r) { + if (!memblock_is_hotpluggable(r)) continue; - nid = type->regions[i].nid; + nid = r->nid; - usable_startpfn = PFN_DOWN(type->regions[i].base); + usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; -- cgit From 167632303005670713603452a3c9ee5de4aa5828 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:37:53 -0700 Subject: mm/memblock.c: use PFN_PHYS() Replace ((phys_addr_t)(x) << PAGE_SHIFT) by pfn macro. 
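For reference, PFN_PHYS() is just the open-coded shift wrapped in a macro (it is defined along these lines in include/linux/pfn.h), so the conversion does not change behaviour:

	#define PFN_PHYS(x)	((phys_addr_t)(x) << PAGE_SHIFT)

	/* hence the two forms are equivalent: */
	return (phys_addr_t)pages << PAGE_SHIFT;	/* before */
	return PFN_PHYS(pages);				/* after */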
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index c5c20c46f97e..e9d6ca9a01a9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) pages += end_pfn - start_pfn; } - return (phys_addr_t)pages << PAGE_SHIFT; + return PFN_PHYS(pages); } /* lowest address */ @@ -1324,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { struct memblock_type *type = &memblock.memory; - int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) return -1; -- cgit From 55f67141a8927b2be3e51840da37b8a2320143ed Mon Sep 17 00:00:00 2001 From: "Mizuma, Masayoshi" Date: Mon, 7 Apr 2014 15:37:54 -0700 Subject: mm: hugetlb: fix softlockup when a large number of hugepages are freed. When I decrease the value of nr_hugepage in procfs a lot, softlockup happens. It is because there is no chance of context switch during this process. On the other hand, when I allocate a large number of hugepages, there is some chance of context switch. Hence softlockup doesn't happen during this process. So it's necessary to add the context switch in the freeing process as same as allocating process to avoid softlockup. When I freed 12 TB hugapages with kernel-2.6.32-358.el6, the freeing process occupied a CPU over 150 seconds and following softlockup message appeared twice or more. $ echo 6000000 > /proc/sys/vm/nr_hugepages $ cat /proc/sys/vm/nr_hugepages 6000000 $ grep ^Huge /proc/meminfo HugePages_Total: 6000000 HugePages_Free: 6000000 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB $ echo 0 > /proc/sys/vm/nr_hugepages BUG: soft lockup - CPU#16 stuck for 67s! [sh:12883] ... Pid: 12883, comm: sh Not tainted 2.6.32-358.el6.x86_64 #1 Call Trace: free_pool_huge_page+0xb8/0xd0 set_max_huge_pages+0x128/0x190 hugetlb_sysctl_handler_common+0x113/0x140 hugetlb_sysctl_handler+0x1e/0x20 proc_sys_call_handler+0x97/0xd0 proc_sys_write+0x14/0x20 vfs_write+0xb8/0x1a0 sys_write+0x51/0x90 __audit_syscall_exit+0x265/0x290 system_call_fastpath+0x16/0x1b I have not confirmed this problem with upstream kernels because I am not able to prepare the machine equipped with 12TB memory now. However I confirmed that the amount of decreasing hugepages was directly proportional to the amount of required time. I measured required times on a smaller machine. It showed 130-145 hugepages decreased in a millisecond. Amount of decreasing Required time Decreasing rate hugepages (msec) (pages/msec) ------------------------------------------------------------ 10,000 pages == 20GB 70 - 74 135-142 30,000 pages == 60GB 208 - 229 131-144 It means decrement of 6TB hugepages will trigger softlockup with the default threshold 20sec, in this decreasing rate. 
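Condensed from the one-line fix below, the freeing loop now gives the scheduler a chance on every iteration while still honouring hugetlb_lock:

	while (min_count < persistent_huge_pages(h)) {
		if (!free_pool_huge_page(h, nodes_allowed, 0))
			break;
		/* drop hugetlb_lock and reschedule if needed, then retake it */
		cond_resched_lock(&hugetlb_lock);
	}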
Signed-off-by: Masayoshi Mizuma Cc: Joonsoo Kim Cc: Michal Hocko Cc: Wanpeng Li Cc: Aneesh Kumar Cc: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27938c441b6f..dd30f22b35e0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1536,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) -- cgit From 29f175d125f0f3a9503af8a5596f93d714cceb08 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:37:55 -0700 Subject: mm/readahead.c: inline ra_submit Commit f9acc8c7b35a ("readahead: sanify file_ra_state names") left ra_submit with a single function call. Move ra_submit to internal.h and inline it to save some stack. Thanks to Andrew Morton for commenting different versions. Signed-off-by: Fabian Frederick Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- mm/internal.h | 15 +++++++++++++++ mm/readahead.c | 21 +++------------------ 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9132faed1a41..6edcea720ddd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1875,9 +1875,6 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, - struct file *filp); /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/internal.h b/mm/internal.h index 3e910000fda4..07b67361a40a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include #include void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. diff --git a/mm/readahead.c b/mm/readahead.c index 29c5e1af5a0c..0ca36a7770b1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,9 +8,7 @@ */ #include -#include #include -#include #include #include #include @@ -20,6 +18,8 @@ #include #include +#include "internal.h" + /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -149,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. 
*/ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -243,20 +242,6 @@ unsigned long max_sane_readahead(unsigned long nr) return min(nr, MAX_READAHEAD); } -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; -} - /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large -- cgit From 85892f196fd8fb22386436f86f2b74104d7005f8 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 7 Apr 2014 15:37:56 -0700 Subject: madvise: correct the comment of MADV_DODUMP flag s/MADV_NODUMP/MADV_DONTDUMP/ Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/mman-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 4164529a94f9..ddc3b36f1046 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -50,7 +50,7 @@ #define MADV_DONTDUMP 16 /* Explicity exclude from the core dump, overrides the coredump filter bits */ -#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ /* compatibility flags */ #define MAP_FILE 0 -- cgit From 834a964a098e7726fc296d7cd8f65ed3eeedd412 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 7 Apr 2014 15:37:57 -0700 Subject: numa: use LAST_CPUPID_SHIFT to calculate LAST_CPUPID_MASK LAST_CPUPID_MASK is calculated using LAST_CPUPID_WIDTH. However LAST_CPUPID_WIDTH itself can be 0. (when LAST_CPUPID_NOT_IN_PAGE_FLAGS is set). In such a case LAST_CPUPID_MASK turns out to be 0. But with recent commit 1ae71d0319: (mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS) if LAST_CPUPID_MASK is 0, page_cpupid_xchg_last() and page_cpupid_reset_last() causes page->_last_cpupid to be set to 0. This causes performance regression. Its almost as if numa_balancing is off. Fix LAST_CPUPID_MASK by using LAST_CPUPID_SHIFT instead of LAST_CPUPID_WIDTH. Some performance numbers and perf stats with and without the fix. 
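(Before the measurements, the arithmetic at the heart of the fix, restated; the zero-width case is the LAST_CPUPID_NOT_IN_PAGE_FLAGS configuration described above.)

	/*
	 * With LAST_CPUPID_NOT_IN_PAGE_FLAGS, LAST_CPUPID_WIDTH is 0 while
	 * LAST_CPUPID_SHIFT keeps its full value, so:
	 *
	 *   (1UL << LAST_CPUPID_WIDTH) - 1  ==  (1UL << 0) - 1  ==  0
	 *   (1UL << LAST_CPUPID_SHIFT) - 1  !=  0
	 *
	 * A zero mask makes page_cpupid_xchg_last() and
	 * page_cpupid_reset_last() store 0 into page->_last_cpupid, which is
	 * why the mask has to be derived from the shift instead.
	 */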
(3.14-rc6) ---------- numa01 Performance counter stats for '/usr/bin/time -f %e %S %U %c %w -o start_bench.out -a ./numa01': 12,27,462 cs [100.00%] 2,41,957 migrations [100.00%] 1,68,01,713 faults [100.00%] 7,99,35,29,041 cache-misses 98,808 migrate:mm_migrate_pages [100.00%] 1407.690148814 seconds time elapsed numa02 Performance counter stats for '/usr/bin/time -f %e %S %U %c %w -o start_bench.out -a ./numa02': 63,065 cs [100.00%] 14,364 migrations [100.00%] 2,08,118 faults [100.00%] 25,32,59,404 cache-misses 12 migrate:mm_migrate_pages [100.00%] 63.840827219 seconds time elapsed (3.14-rc6 with fix) ------------------- numa01 Performance counter stats for '/usr/bin/time -f %e %S %U %c %w -o start_bench.out -a ./numa01': 9,68,911 cs [100.00%] 1,01,414 migrations [100.00%] 88,38,697 faults [100.00%] 4,42,92,51,042 cache-misses 4,25,060 migrate:mm_migrate_pages [100.00%] 685.965331189 seconds time elapsed numa02 Performance counter stats for '/usr/bin/time -f %e %S %U %c %w -o start_bench.out -a ./numa02': 17,543 cs [100.00%] 2,962 migrations [100.00%] 1,17,843 faults [100.00%] 11,80,61,644 cache-misses 12,358 migrate:mm_migrate_pages [100.00%] 20.380132343 seconds time elapsed Signed-off-by: Srikar Dronamraju Cc: Liu Ping Fan Reviewed-by: Aneesh Kumar K.V Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6edcea720ddd..abc848412e3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -695,7 +695,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) -- cgit From ed12d845b5f528cc0846023862b9c448a36122ec Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 7 Apr 2014 15:37:59 -0700 Subject: mm/page_alloc.c: change mm debug routines back to EXPORT_SYMBOL A new dump_page() routine was recently added, and marked EXPORT_SYMBOL_GPL. dump_page() was also added to the VM_BUG_ON_PAGE() macro, and so the end result is that non-GPL code can no longer call get_page() and a few other routines. This only happens if the kernel was compiled with CONFIG_DEBUG_VM. Change dump_page() to be EXPORT_SYMBOL. Longer explanation: Prior to commit 309381feaee5 ("mm: dump page when hitting a VM_BUG_ON using VM_BUG_ON_PAGE") , it was possible to build MIT-licensed (non-GPL) drivers on Fedora. Fedora is semi-unique, in that it sets CONFIG_VM_DEBUG. Because Fedora sets CONFIG_VM_DEBUG, they end up pulling in dump_page(), via VM_BUG_ON_PAGE, via get_page(). As one of the authors of NVIDIA's new, open source, "UVM-Lite" kernel module, I originally choose to use the kernel's get_page() routine from within nvidia_uvm_page_cache.c, because get_page() has always seemed to be very clearly intended for use by non-GPL, driver code. So I'm hoping that making get_page() widely accessible again will not be too controversial. We did check with Fedora first, and they responded (https://bugzilla.redhat.com/show_bug.cgi?id=1074710#c3) that we should try to get upstream changed, before asking Fedora to change. 
Their reasoning seems beneficial to Linux: leaving CONFIG_DEBUG_VM set allows Fedora to help catch mm bugs. Signed-off-by: John Hubbard Cc: Sasha Levin Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48427a7cfb45..5dba2933c9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6568,4 +6568,4 @@ void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); } -EXPORT_SYMBOL_GPL(dump_page); +EXPORT_SYMBOL(dump_page); -- cgit From be2d1d56c82d8cf20e6c77515eb499f8e86eb5be Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:00 -0700 Subject: zram: drop `init_done' struct zram member Introduce init_done() helper function which allows us to drop `init_done' struct zram member. init_done() uses the fact that ->init_done == 1 equals to ->meta != NULL. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 21 +++++++++++---------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 51c557cfd92b..b3e6f072c19b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,11 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static inline int init_done(struct zram *zram) +{ + return zram->meta != NULL; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -60,7 +65,7 @@ static ssize_t initstate_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->init_done); + return sprintf(buf, "%u\n", init_done(zram)); } static ssize_t num_reads_show(struct device *dev, @@ -133,7 +138,7 @@ static ssize_t mem_used_total_show(struct device *dev, struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (zram->init_done) + if (init_done(zram)) val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); @@ -546,14 +551,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) struct zram_meta *meta; down_write(&zram->init_lock); - if (!zram->init_done) { + if (!init_done(zram)) { up_write(&zram->init_lock); return; } meta = zram->meta; - zram->init_done = 0; - /* Free all pages that are still in this zram device */ for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { unsigned long handle = meta->table[index].handle; @@ -594,8 +597,6 @@ static void zram_init_device(struct zram *zram, struct zram_meta *meta) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); zram->meta = meta; - zram->init_done = 1; - pr_debug("Initialization done!\n"); } @@ -615,7 +616,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; down_write(&zram->init_lock); - if (zram->init_done) { + if (init_done(zram)) { up_write(&zram->init_lock); zram_meta_free(meta); pr_info("Cannot change disksize for initialized device\n"); @@ -736,7 +737,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) struct zram *zram = queue->queuedata; down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) + if (unlikely(!init_done(zram))) goto error; if (!valid_io_request(zram, 
bio)) { @@ -859,7 +860,7 @@ static int create_device(struct zram *zram, int device_id) goto out_free_disk; } - zram->init_done = 0; + zram->meta = NULL; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ad8aa35bae00..e81e9cdf4147 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -95,7 +95,6 @@ struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; - int init_done; /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* -- cgit From be257c61306750d11c20d2ac567bf63304c696a3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:01 -0700 Subject: zram: do not pass rw argument to __zram_make_request() Do not pass rw argument down the __zram_make_request() -> zram_bvec_rw() chain, decode it in zram_bvec_rw() instead. Besides, this is the place where we distinguish READ and WRITE bio data directions, so account zram RW stats here, instead of __zram_make_request(). This also allows to account a real number of zram READ/WRITE operations, not just requests (single RW request may cause a number of zram RW ops with separate locking, compression/decompression, etc). Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b3e6f072c19b..3f0d6de36f74 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -533,14 +533,18 @@ out: } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) + int offset, struct bio *bio) { int ret; + int rw = bio_data_dir(bio); - if (rw == READ) + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); - else + } else { + atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); + } return ret; } @@ -672,22 +676,13 @@ out: return ret; } -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) +static void __zram_make_request(struct zram *zram, struct bio *bio) { int offset; u32 index; struct bio_vec bvec; struct bvec_iter iter; - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } - index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; @@ -706,16 +701,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio, rw) - < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) goto out; update_position(&index, &offset, &bvec); @@ -745,7 +739,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) goto error; } - __zram_make_request(zram, bio, 
bio_data_dir(bio)); + __zram_make_request(zram, bio); up_read(&zram->init_lock); return; -- cgit From b7cccf8b4009bf74df61f3c9d86b95fabd807c11 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:02 -0700 Subject: zram: remove good and bad compress stats Remove `good' and `bad' compressed sub-requests stats. RW request may cause a number of RW sub-requests. zram used to account `good' compressed sub-queries (with compressed size less than 50% of original size), `bad' compressed sub-queries (with compressed size greater that 75% of original size), leaving sub-requests with compression size between 50% and 75% of original size not accounted and not reported. zram already accounts each sub-request's compression size so we can calculate real device compression ratio. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 11 ----------- drivers/block/zram/zram_drv.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3f0d6de36f74..995304ef40bb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -293,7 +293,6 @@ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; if (unlikely(!handle)) { /* @@ -307,14 +306,8 @@ static void zram_free_page(struct zram *zram, size_t index) return; } - if (unlikely(size > max_zpage_size)) - atomic_dec(&zram->stats.bad_compress); - zs_free(meta->mem_pool, handle); - if (size <= PAGE_SIZE / 2) - atomic_dec(&zram->stats.good_compress); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); atomic_dec(&zram->stats.pages_stored); @@ -478,7 +471,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (unlikely(clen > max_zpage_size)) { - atomic_inc(&zram->stats.bad_compress); clen = PAGE_SIZE; src = NULL; if (is_partial_io(bvec)) @@ -518,9 +510,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); atomic_inc(&zram->stats.pages_stored); - if (clen <= PAGE_SIZE / 2) - atomic_inc(&zram->stats.good_compress); - out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e81e9cdf4147..2f173cb1fd0a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,8 +78,6 @@ struct zram_stats { atomic64_t notify_free; /* no. of swap slot free notifications */ atomic_t pages_zero; /* no. of zero filled pages */ atomic_t pages_stored; /* no. of pages currently stored */ - atomic_t good_compress; /* % of pages with compression ratio<=50% */ - atomic_t bad_compress; /* % of pages with compression ratio>=75% */ }; struct zram_meta { -- cgit From 90a7806ea9b9f7cb4751859cc2506e2d80e36ef1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:03 -0700 Subject: zram: use atomic64_t for all zram stats This is a preparation patch for stats code duplication removal. 1) use atomic64_t for `pages_zero' and `pages_stored' zram stats. 2) `compr_size' and `pages_zero' struct zram_stats members did not follow the existing device attr naming scheme: zram_stats.ATTR has ATTR_show() function. 
rename them: -- compr_size -> compr_data_size -- pages_zero -> zero_pages Minchan Kim's note: If we really have trouble with atomic stat operation, we could change it with percpu_counter so that it could solve atomic overhead and unnecessary memory space by introducing unsigned long instead of 64bit atomic_t. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 18 +++++++++--------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 995304ef40bb..1bf97b2f8f3f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -109,7 +109,7 @@ static ssize_t zero_pages_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero)); + return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); } static ssize_t orig_data_size_show(struct device *dev, @@ -118,7 +118,7 @@ static ssize_t orig_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT); + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } static ssize_t compr_data_size_show(struct device *dev, @@ -127,7 +127,7 @@ static ssize_t compr_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); + (u64)atomic64_read(&zram->stats.compr_data_size)); } static ssize_t mem_used_total_show(struct device *dev, @@ -301,15 +301,15 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - atomic_dec(&zram->stats.pages_zero); + atomic64_dec(&zram->stats.zero_pages); } return; } zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - atomic_dec(&zram->stats.pages_stored); + atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; @@ -452,7 +452,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zram_set_flag(meta, index, ZRAM_ZERO); write_unlock(&zram->meta->tb_lock); - atomic_inc(&zram->stats.pages_zero); + atomic64_inc(&zram->stats.zero_pages); ret = 0; goto out; } @@ -508,8 +508,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, write_unlock(&zram->meta->tb_lock); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - atomic_inc(&zram->stats.pages_stored); + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 2f173cb1fd0a..58d4ac537f65 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -69,15 +69,15 @@ struct table { } __aligned(4); struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ atomic64_t failed_reads; /* should NEVER! 
happen */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - atomic_t pages_zero; /* no. of zero filled pages */ - atomic_t pages_stored; /* no. of pages currently stored */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. of pages currently stored */ }; struct zram_meta { -- cgit From a68eb3b65e658406d386bebef02277f4007b2f45 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:04 -0700 Subject: zram: remove zram stats code duplication Introduce ZRAM_ATTR_RO macro that generates device_attribute and default ATTR show() function for existing atomic64_t zram stats. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 82 ++++++++++++------------------------------- 1 file changed, 23 insertions(+), 59 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1bf97b2f8f3f..29c35119e82a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,17 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +#define ZRAM_ATTR_RO(name) \ +static ssize_t zram_attr_##name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + return sprintf(b, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static struct device_attribute dev_attr_##name = \ + __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); + static inline int init_done(struct zram *zram) { return zram->meta != NULL; @@ -63,53 +74,14 @@ static ssize_t disksize_show(struct device *dev, static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { + u32 val; struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", init_done(zram)); -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); + return sprintf(buf, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -121,15 +93,6 @@ static ssize_t orig_data_size_show(struct device *dev, (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } -static ssize_t 
compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_data_size)); -} - static ssize_t mem_used_total_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -762,15 +725,16 @@ static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, disksize_show, disksize_store); static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, -- cgit From 6444724939db5de7390c90f7b4a657159b3b4465 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:05 -0700 Subject: zram: report failed read and write stats zram accounted but did not report numbers of failed read and write queries. make these stats available as failed_reads and failed_writes attrs. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 29c35119e82a..c06d7fba4237 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -730,6 +730,8 @@ static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); ZRAM_ATTR_RO(invalid_io); ZRAM_ATTR_RO(notify_free); ZRAM_ATTR_RO(zero_pages); @@ -741,6 +743,8 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_reset.attr, &dev_attr_num_reads.attr, &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, &dev_attr_zero_pages.attr, -- cgit From 59fc86a4922f1a1c0f69eac758a7e2b2b138aab4 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:06 -0700 Subject: zram: drop not used table `count' member struct table `count' member is not used. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 58d4ac537f65..1d5b1f5786a8 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -64,7 +64,6 @@ enum zram_pageflags { struct table { unsigned long handle; u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ u8 flags; } __aligned(4); -- cgit From e64cd51d2fa87733176246101df871a8ac5c7c20 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:07 -0700 Subject: zram: move zram size warning to documentation Move zram warning about disksize and size of memory correlation to zram documentation. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 5 +++++ drivers/block/zram/zram_drv.c | 15 --------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 2eccddffa6c8..393541be1ec0 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -33,6 +33,11 @@ Following shows a typical sequence of steps for using zram. echo 512M > /sys/block/zram0/disksize echo 1G > /sys/block/zram0/disksize +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + 3) Activate: mkswap /dev/zram0 swapon /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c06d7fba4237..21aee3edcb25 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -535,23 +535,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) static void zram_init_device(struct zram *zram, struct zram_meta *meta) { - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; pr_debug("Initialization done!\n"); } -- cgit From 8dd1d3247e6c00b50ef83934ea8b22a1590015de Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:08 -0700 Subject: zram: document failed_reads, failed_writes stats Document `failed_reads' and `failed_writes' device attributes. Remove info about `discard' - there is no such zram attr. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 24 +++++++++++++++--------- Documentation/blockdev/zram.txt | 3 ++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 3f0b9ae61d8c..8aa046841625 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -43,6 +43,21 @@ Description: The invalid_io file is read-only and specifies the number of non-page-size-aligned I/O requests issued to this device. +What: /sys/block/zram/failed_reads +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The failed_reads file is read-only and specifies the number of + failed reads happened on this device. + + +What: /sys/block/zram/failed_writes +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The failed_writes file is read-only and specifies the number of + failed writes happened on this device. + What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta @@ -53,15 +68,6 @@ Description: is freed. This statistic is applicable only when this disk is being used as a swap disk. -What: /sys/block/zram/discard -Date: August 2010 -Contact: Nitin Gupta -Description: - The discard file is read-only and specifies the number of - discard requests received by this device. These requests - provide information to block device regarding blocks which are - no longer used by filesystem. - What: /sys/block/zram/zero_pages Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 393541be1ec0..b31ac5e5d4b9 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -51,9 +51,10 @@ size of the disk when not in use so a huge zram is wasteful. disksize num_reads num_writes + failed_reads + failed_writes invalid_io notify_free - discard zero_pages orig_data_size compr_data_size -- cgit From b67d1ec189ffb92cdad9b2bd29475fb1e0166983 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:09 -0700 Subject: zram: delete zram_init_device() allocate new `zram_meta' in disksize_store() only for uninitialised zram device, saving a number of allocations and deallocations in case if disksize_store() was called on currently used device. at the same time zram_meta stack variable is not necessary, because we can set ->meta directly. there is also no need in setting QUEUE_FLAG_NONROT queue on every disksize_store(), set it once during device creation. 
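Condensed from the hunks below, disksize_store() now allocates the new meta up front and simply frees it again if the device turns out to be initialised already:

	meta = zram_meta_alloc(disksize);
	if (!meta)
		return -ENOMEM;

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		zram_meta_free(meta);
		up_write(&zram->init_lock);
		pr_info("Cannot change disksize for initialized device\n");
		return -EBUSY;
	}

	zram->meta = meta;
	zram->disksize = disksize;
	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
	up_write(&zram->init_lock);
	return len;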
[minchan@kernel.org: handle zram->meta alloc fail case] [minchan@kernel.org: prevent lockdep spew of init_lock] Signed-off-by: Sergey Senozhatsky Signed-off-by: Minchan Kim Acked-by: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 21aee3edcb25..76ba67673a90 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -533,14 +533,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) up_write(&zram->init_lock); } -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; - pr_debug("Initialization done!\n"); -} - static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -556,17 +548,18 @@ static ssize_t disksize_store(struct device *dev, meta = zram_meta_alloc(disksize); if (!meta) return -ENOMEM; + down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); zram_meta_free(meta); + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); return -EBUSY; } + zram->meta = meta; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); up_write(&zram->init_lock); return len; @@ -776,7 +769,8 @@ static int create_device(struct zram *zram, int device_id) /* Actual capacity set using syfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. -- cgit From e7e1ef439d18f9a21521116ea9f2b976d7230e54 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:11 -0700 Subject: zram: introduce compressing backend abstraction ZRAM performs direct LZO compression algorithm calls, making it the one and only option. While LZO is generally performs well, LZ4 algorithm tends to have a faster decompression (see http://code.google.com/p/lz4/ for full report) Name Ratio C.speed D.speed MB/s MB/s LZ4 (r101) 2.084 422 1820 LZO 2.06 2.106 414 600 Thus, users who have mostly read (decompress) usage scenarious or mixed workflow (writes with relatively high read ops number) will benefit from using LZ4 compression backend. Introduce compressing backend abstraction zcomp in order to support multiple compression algorithms with the following set of operations: .create .destroy .compress .decompress Schematically zram write() usually contains the following steps: 0) preparation (decompression of partioal IO, etc.) 1) lock buffer_lock mutex (protects meta compress buffers) 2) compress (using meta compress buffers) 3) alloc and map zs_pool object 4) copy compressed data (from meta compress buffers) to object allocated by 3) 5) free previous pool page, assign a new one 6) unlock buffer_lock mutex As we can see, compressing buffers must remain untouched from 1) to 4), because, otherwise, concurrent write() can overwrite data. At the same time, zram_meta must be aware of a) specific compression algorithm memory requirements and b) necessary locking to protect compression buffers. 
To remove requirement a) new struct zcomp_strm introduced, which contains a compress/decompress `buffer' and compression algorithm `private' part. While struct zcomp implements zcomp_strm stream handling and locking and removes requirement b) from zram meta. zcomp ->create() and ->destroy(), respectively, allocate and deallocate algorithm specific zcomp_strm `private' part. Every zcomp has zcomp stream and mutex to protect its compression stream. Stream usage semantics remains the same -- only one write can hold stream lock and use its buffers. zcomp_strm_find() turns caller into exclusive user of a stream (holding stream mutex until zram release stream), and zcomp_strm_release() makes zcomp stream available (unlock the stream mutex). Hence no concurrent write (compression) operations possible at the moment. iozone -t 3 -R -r 16K -s 60M -I +Z test base patched -------------------------------------------------- Initial write 597992.91 591660.58 Rewrite 609674.34 616054.97 Read 2404771.75 2452909.12 Re-read 2459216.81 2470074.44 Reverse Read 1652769.66 1589128.66 Stride read 2202441.81 2202173.31 Random read 2236311.47 2276565.31 Mixed workload 1423760.41 1709760.06 Random write 579584.08 615933.86 Pwrite 597550.02 594933.70 Pread 1703672.53 1718126.72 Fwrite 1330497.06 1461054.00 Fread 3922851.00 3957242.62 Usage examples: comp = zcomp_create(NAME) /* NAME e.g. "lzo" */ which initialises compressing backend if requested algorithm is supported. Compress: zstrm = zcomp_strm_find(comp) zcomp_compress(comp, zstrm, src, &dst_len) [..] /* copy compressed data */ zcomp_strm_release(comp, zstrm) Decompress: zcomp_decompress(comp, src, src_len, dst); Free compessing backend and its zcomp stream: zcomp_destroy(comp) Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 115 +++++++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 58 +++++++++++++++++++++ drivers/block/zram/zcomp_lzo.c | 47 +++++++++++++++++ drivers/block/zram/zcomp_lzo.h | 17 ++++++ 4 files changed, 237 insertions(+) create mode 100644 drivers/block/zram/zcomp.c create mode 100644 drivers/block/zram/zcomp.h create mode 100644 drivers/block/zram/zcomp_lzo.c create mode 100644 drivers/block/zram/zcomp_lzo.h diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000000..22f4ae235660 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" + +static struct zcomp_backend *find_backend(const char *compress) +{ + if (strncmp(compress, "lzo", 3) == 0) + return &zcomp_lzo; + return NULL; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + mutex_lock(&comp->strm_lock); + return comp->zstrm; +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + mutex_unlock(&comp->strm_lock); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +void zcomp_destroy(struct zcomp *comp) +{ + zcomp_strm_free(comp, comp->zstrm); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return NULL + * if requested algorithm is not supported or in case + * of init error + */ +struct zcomp *zcomp_create(const char *compress) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + + backend = find_backend(compress); + if (!backend) + return NULL; + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return NULL; + + comp->backend = backend; + mutex_init(&comp->strm_lock); + + comp->zstrm = zcomp_strm_alloc(comp); + if (!comp->zstrm) { + kfree(comp); + return NULL; + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000000..c9a98e1317fe --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +#include + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. 
compression algorithm + * working memory) + */ + void *private; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(void); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + struct mutex strm_lock; + struct zcomp_strm *zstrm; + struct zcomp_backend *backend; +}; + +struct zcomp *zcomp_create(const char *comp); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000000..da1bc47d588e --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(void) +{ + return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +} + +static void lzo_destroy(void *private) +{ + kfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000000..128c5807fa14 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ -- cgit From b7ca232ee7e85ed3b18e39eb20a7f458ee1d6047 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:12 -0700 Subject: zram: use zcomp compressing backends Do not perform direct LZO compress/decompress calls, initialise and use zcomp LZO backend (single compression stream) instead. 
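In other words, the per-write sequence in zram_bvec_write() becomes roughly this (condensed from the hunks below; error handling and the partial-IO path are omitted):

        zstrm = zcomp_strm_find(zram->comp);            /* was: mutex_lock(&meta->buffer_lock) */
        ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
        src = zstrm->buffer;                            /* compressed data lives in the stream's buffer */
        /* zs_malloc() + zs_map_object() + memcpy(cmem, src, clen) as before */
        zcomp_strm_release(zram->comp, zstrm);          /* was: mutex_unlock(&meta->buffer_lock) */

and zram_decompress_page() calls zcomp_decompress(zram->comp, cmem, size, mem) in place of lzo1x_decompress_safe().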
[akpm@linux-foundation.org: resolve conflicts with zram-delete-zram_init_device-fix.patch] Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Makefile | 2 +- drivers/block/zram/zram_drv.c | 69 +++++++++++++++++++------------------------ drivers/block/zram/zram_drv.h | 8 ++--- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index cb0f9ced6a93..757c6a5cadff 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,3 @@ -zram-y := zram_drv.o +zram-y := zcomp_lzo.o zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76ba67673a90..98823f9ca8b1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -38,6 +37,7 @@ /* Globals */ static int zram_major; static struct zram *zram_devices; +static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; @@ -160,8 +160,6 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) static void zram_meta_free(struct zram_meta *meta) { zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); vfree(meta->table); kfree(meta); } @@ -173,22 +171,11 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) if (!meta) goto out; - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - num_pages = disksize >> PAGE_SHIFT; meta->table = vzalloc(num_pages * sizeof(*meta->table)); if (!meta->table) { pr_err("Error allocating zram address table\n"); - goto free_buffer; + goto free_meta; } meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); @@ -198,15 +185,10 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) } rwlock_init(&meta->tb_lock); - mutex_init(&meta->buffer_lock); return meta; free_table: vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); free_meta: kfree(meta); meta = NULL; @@ -280,8 +262,7 @@ static void zram_free_page(struct zram *zram, size_t index) static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; + int ret = 0; unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; @@ -301,12 +282,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = lzo1x_decompress_safe(cmem, size, mem, &clen); + ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); atomic64_inc(&zram->stats.failed_reads); return ret; @@ -349,7 +330,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, ret = zram_decompress_page(zram, uncmem, index); /* Should NEVER happen. 
Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) + if (unlikely(ret)) goto out_cleanup; if (is_partial_io(bvec)) @@ -374,11 +355,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm; bool locked = false; page = bvec->bv_page; - src = meta->compress_buffer; - if (is_partial_io(bvec)) { /* * This is a partial IO. We need to read the full page @@ -394,7 +374,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - mutex_lock(&meta->buffer_lock); + zstrm = zcomp_strm_find(zram->comp); locked = true; user_mem = kmap_atomic(page); @@ -420,22 +400,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; uncmem = NULL; } - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); goto out; } - + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; - src = NULL; if (is_partial_io(bvec)) src = uncmem; } @@ -457,6 +435,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, memcpy(cmem, src, clen); } + zcomp_strm_release(zram->comp, zstrm); + locked = false; zs_unmap_object(meta->mem_pool, handle); /* @@ -475,10 +455,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_inc(&zram->stats.pages_stored); out: if (locked) - mutex_unlock(&meta->buffer_lock); + zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) atomic64_inc(&zram->stats.failed_writes); return ret; @@ -522,6 +501,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) zs_free(meta->mem_pool, handle); } + zcomp_destroy(zram->comp); zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -539,6 +519,7 @@ static ssize_t disksize_store(struct device *dev, u64 disksize; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -551,10 +532,17 @@ static ssize_t disksize_store(struct device *dev, down_write(&zram->init_lock); if (init_done(zram)) { - zram_meta_free(meta); - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; + err = -EBUSY; + goto out_free_meta; + } + + zram->comp = zcomp_create(default_compressor); + if (!zram->comp) { + pr_info("Cannot initialise %s compressing backend\n", + default_compressor); + err = -EINVAL; + goto out_free_meta; } zram->meta = meta; @@ -563,6 +551,11 @@ static ssize_t disksize_store(struct device *dev, up_write(&zram->init_lock); return len; + +out_free_meta: + up_write(&zram->init_lock); + zram_meta_free(meta); + return err; } static ssize_t reset_store(struct device *dev, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1d5b1f5786a8..45e04f7b713f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -16,9 +16,10 @@ #define _ZRAM_DRV_H_ #include -#include #include +#include "zcomp.h" + /* * Some arbitrary value. This is just to catch * invalid value for num_devices module parameter. 
@@ -81,17 +82,16 @@ struct zram_stats { struct zram_meta { rwlock_t tb_lock; /* protect table */ - void *compress_workmem; - void *compress_buffer; struct table *table; struct zs_pool *mem_pool; - struct mutex buffer_lock; /* protect compress buffers */ }; struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; + struct zcomp *comp; + /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* -- cgit From 9cc97529a180b369fcb7e5265771b6ba7e01f05b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:13 -0700 Subject: zram: factor out single stream compression This is preparation patch to add multi stream support to zcomp. Introduce struct zcomp_strm_single and a set of functions to manage zcomp_strm stream access. zcomp_strm_single implements single compession stream, same way as current zcomp implementation. This moves zcomp_strm stream control and locking from zcomp, so compressing backend zcomp is not aware of required locking. Single and multi streams require different locking schemes. Minchan Kim reported that spinlock-based locking scheme (which is used in multi stream implementation) has demonstrated a severe perfomance regression for single compression stream case, comparing to mutex-based. see https://lkml.org/lkml/2014/2/18/16 The following set of functions added: - zcomp_strm_single_find()/zcomp_strm_single_release() find and release a compression stream, implement required locking - zcomp_strm_single_create()/zcomp_strm_single_destroy() create and destroy zcomp_strm_single New ->strm_find() and ->strm_release() callbacks added to zcomp, which are set to zcomp_strm_single_find() and zcomp_strm_single_release() during initialisation. Instead of direct locking and zcomp_strm access from zcomp_strm_find() and zcomp_strm_release(), zcomp now calls ->strm_find() and ->strm_release() correspondingly. 
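Condensed, the indirection introduced here looks like this (taken from the diff below):

        /* single zcomp_strm backend: one pre-allocated stream guarded by a mutex */
        struct zcomp_strm_single {
                struct mutex strm_lock;
                struct zcomp_strm *zstrm;
        };

        struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
        {
                return comp->strm_find(comp);   /* zcomp_strm_single_find(): mutex_lock(), return zs->zstrm */
        }

        void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
        {
                comp->strm_release(comp, zstrm);        /* zcomp_strm_single_release(): mutex_unlock() */
        }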
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 62 ++++++++++++++++++++++++++++++++++++++++------ drivers/block/zram/zcomp.h | 7 ++++-- 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 22f4ae235660..72e8071f9d73 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -16,6 +16,14 @@ #include "zcomp.h" #include "zcomp_lzo.h" +/* + * single zcomp_strm backend + */ +struct zcomp_strm_single { + struct mutex strm_lock; + struct zcomp_strm *zstrm; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -54,15 +62,56 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_lock(&zs->strm_lock); + return zs->zstrm; +} + +static void zcomp_strm_single_release(struct zcomp *comp, + struct zcomp_strm *zstrm) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_unlock(&zs->strm_lock); +} + +static void zcomp_strm_single_destroy(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + zcomp_strm_free(comp, zs->zstrm); + kfree(zs); +} + +static int zcomp_strm_single_create(struct zcomp *comp) +{ + struct zcomp_strm_single *zs; + + comp->destroy = zcomp_strm_single_destroy; + comp->strm_find = zcomp_strm_single_find; + comp->strm_release = zcomp_strm_single_release; + zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + mutex_init(&zs->strm_lock); + zs->zstrm = zcomp_strm_alloc(comp); + if (!zs->zstrm) { + kfree(zs); + return -ENOMEM; + } + return 0; +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - mutex_lock(&comp->strm_lock); - return comp->zstrm; + return comp->strm_find(comp); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - mutex_unlock(&comp->strm_lock); + comp->strm_release(comp, zstrm); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -80,7 +129,7 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, void zcomp_destroy(struct zcomp *comp) { - zcomp_strm_free(comp, comp->zstrm); + comp->destroy(comp); kfree(comp); } @@ -104,10 +153,7 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - mutex_init(&comp->strm_lock); - - comp->zstrm = zcomp_strm_alloc(comp); - if (!comp->zstrm) { + if (zcomp_strm_single_create(comp) != 0) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index c9a98e1317fe..dc3500d842a3 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -39,9 +39,12 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - struct mutex strm_lock; - struct zcomp_strm *zstrm; + void *stream; struct zcomp_backend *backend; + + struct zcomp_strm *(*strm_find)(struct zcomp *comp); + void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + void (*destroy)(struct zcomp *comp); }; struct zcomp *zcomp_create(const char *comp); -- cgit From beca3ec71fe5490ee9237dc42400f50402baf83e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:14 -0700 Subject: zram: add multi stream functionality Existing zram (zcomp) implementation has only one 
compression stream (buffer and algorithm private part), so in order to prevent data corruption only one write (compress operation) can use this compression stream, forcing all concurrent write operations to wait for stream lock to be released. This patch changes zcomp to keep a compression streams list of user-defined size (via sysfs device attr). Each write operation still exclusively holds compression stream, the difference is that we can have N write operations (depending on size of streams list) executing in parallel. See TEST section later in commit message for performance data. Introduce struct zcomp_strm_multi and a set of functions to manage zcomp_strm stream access. zcomp_strm_multi has a list of idle zcomp_strm structs, spinlock to protect idle list and wait queue, making it possible to perform parallel compressions. The following set of functions added: - zcomp_strm_multi_find()/zcomp_strm_multi_release() find and release a compression stream, implement required locking - zcomp_strm_multi_create()/zcomp_strm_multi_destroy() create and destroy zcomp_strm_multi zcomp ->strm_find() and ->strm_release() callbacks are set during initialisation to zcomp_strm_multi_find()/zcomp_strm_multi_release() correspondingly. Each time zcomp issues a zcomp_strm_multi_find() call, the following set of operations performed: - spin lock strm_lock - if idle list is not empty, remove zcomp_strm from idle list, spin unlock and return zcomp stream pointer to caller - if idle list is empty, current adds itself to wait queue. it will be awaken by zcomp_strm_multi_release() caller. zcomp_strm_multi_release(): - spin lock strm_lock - add zcomp stream to idle list - spin unlock, wake up sleeper Minchan Kim reported that spinlock-based locking scheme has demonstrated a severe perfomance regression for single compression stream case, comparing to mutex-based (see https://lkml.org/lkml/2014/2/18/16) base spinlock mutex ==Initial write ==Initial write ==Initial write records: 5 records: 5 records: 5 avg: 1642424.35 avg: 699610.40 avg: 1655583.71 std: 39890.95(2.43%) std: 232014.19(33.16%) std: 52293.96 max: 1690170.94 max: 1163473.45 max: 1697164.75 min: 1568669.52 min: 573429.88 min: 1553410.23 ==Rewrite ==Rewrite ==Rewrite records: 5 records: 5 records: 5 avg: 1611775.39 avg: 501406.64 avg: 1684419.11 std: 17144.58(1.06%) std: 15354.41(3.06%) std: 18367.42 max: 1641800.95 max: 531356.78 max: 1706445.84 min: 1593515.27 min: 488817.78 min: 1655335.73 When only one compression stream available, mutex with spin on owner tends to perform much better than frequent wait_event()/wake_up(). This is why single stream implemented as a special case with mutex locking. Introduce and document zram device attribute max_comp_streams. This attr shows and stores current zcomp's max number of zcomp streams (max_strm). Extend zcomp's zcomp_create() with `max_strm' parameter. `max_strm' limits the number of zcomp_strm structs in compression backend's idle list (max_comp_streams). max_comp_streams used during initialisation as follows: -- passing to zcomp_create() max_strm equals to 1 will initialise zcomp using single compression stream zcomp_strm_single (mutex-based locking). -- passing to zcomp_create() max_strm greater than 1 will initialise zcomp using multi compression stream zcomp_strm_multi (spinlock-based locking). default max_comp_streams value is 1, meaning that zram with single stream will be initialised. Later patch will introduce configuration knob to change max_comp_streams on already initialised and used zcomp. 
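Condensed, the stream lookup described above boils down to the following loop (trimmed from the zcomp_strm_multi_find() hunk further down):

        while (1) {
                spin_lock(&zs->strm_lock);
                if (!list_empty(&zs->idle_strm)) {
                        zstrm = list_entry(zs->idle_strm.next,
                                        struct zcomp_strm, list);
                        list_del(&zstrm->list);
                        spin_unlock(&zs->strm_lock);
                        return zstrm;                   /* reuse an idle stream */
                }
                if (zs->avail_strm >= zs->max_strm) {
                        spin_unlock(&zs->strm_lock);
                        wait_event(zs->strm_wait,       /* limit reached, wait for a release */
                                   !list_empty(&zs->idle_strm));
                        continue;
                }
                zs->avail_strm++;                       /* below the limit, grow by one stream */
                spin_unlock(&zs->strm_lock);
                zstrm = zcomp_strm_alloc(comp);
                if (!zstrm) {
                        spin_lock(&zs->strm_lock);
                        zs->avail_strm--;               /* allocation failed, give the slot back */
                        spin_unlock(&zs->strm_lock);
                        wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
                        continue;
                }
                break;
        }
        return zstrm;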
TEST iozone -t 3 -R -r 16K -s 60M -I +Z test base 1 strm (mutex) 3 strm (spinlock) ----------------------------------------------------------------------- Initial write 589286.78 583518.39 718011.05 Rewrite 604837.97 596776.38 1515125.72 Random write 584120.11 595714.58 1388850.25 Pwrite 535731.17 541117.38 739295.27 Fwrite 1418083.88 1478612.72 1484927.06 Usage example: set max_comp_streams to 4 echo 4 > /sys/block/zram0/max_comp_streams show current max_comp_streams (default value is 1). cat /sys/block/zram0/max_comp_streams Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 9 ++- Documentation/blockdev/zram.txt | 31 ++++++-- drivers/block/zram/zcomp.c | 124 ++++++++++++++++++++++++++++- drivers/block/zram/zcomp.h | 4 +- drivers/block/zram/zram_drv.c | 42 +++++++++- drivers/block/zram/zram_drv.h | 2 +- 6 files changed, 201 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 8aa046841625..0da9ed6b82ea 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -50,7 +50,6 @@ Description: The failed_reads file is read-only and specifies the number of failed reads happened on this device. - What: /sys/block/zram/failed_writes Date: February 2014 Contact: Sergey Senozhatsky @@ -58,6 +57,14 @@ Description: The failed_writes file is read-only and specifies the number of failed writes happened on this device. +What: /sys/block/zram/max_comp_streams +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The max_comp_streams file is read-write and specifies the + number of backend's zcomp_strm compression streams (number of + concurrent compress operations). + What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index b31ac5e5d4b9..aadfe60391b7 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -21,7 +21,28 @@ Following shows a typical sequence of steps for using zram. This creates 4 devices: /dev/zram{0,1,2,3} (num_devices parameter is optional. Default: 1) -2) Set Disksize +2) Set max number of compression streams + Compression backend may use up to max_comp_streams compression streams, + thus allowing up to max_comp_streams concurrent compression operations. + By default, compression backend uses single compression stream. + + Examples: + #show max compression streams number + cat /sys/block/zram0/max_comp_streams + + #set max compression streams number to 3 + echo 3 > /sys/block/zram0/max_comp_streams + +Note: +In order to enable compression backend's multi stream support max_comp_streams +must be initially set to desired concurrency level before ZRAM device +initialisation. Once the device initialised as a single stream compression +backend (max_comp_streams equals to 0) changing the value of max_comp_streams +will not take any effect, because single stream compression backend implemented +as a special case and does not support dynamic max_comp_streams. Only multi +stream backend supports dynamic max_comp_streams adjustment. + +3) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. 
Examples: @@ -38,14 +59,14 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -3) Activate: +4) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -4) Stats: +5) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -60,11 +81,11 @@ size of the disk when not in use so a huge zram is wasteful. compr_data_size mem_used_total -5) Deactivate: +6) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -6) Reset: +7) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 72e8071f9d73..c06f75f54718 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -24,6 +24,21 @@ struct zcomp_strm_single { struct zcomp_strm *zstrm; }; +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -62,6 +77,107 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = 
zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -139,7 +255,7 @@ void zcomp_destroy(struct zcomp *comp) * if requested algorithm is not supported or in case * of init error */ -struct zcomp *zcomp_create(const char *compress) +struct zcomp *zcomp_create(const char *compress, int max_strm) { struct zcomp *comp; struct zcomp_backend *backend; @@ -153,7 +269,11 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - if (zcomp_strm_single_create(comp) != 0) { + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index dc3500d842a3..2a3684446160 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -21,6 +21,8 @@ struct zcomp_strm { * working memory) */ void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; }; /* static compression backend */ @@ -47,7 +49,7 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; -struct zcomp *zcomp_create(const char *comp); +struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 98823f9ca8b1..bdc7eb8c6df7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -108,6 +108,40 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return sprintf(buf, "%d\n", val); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int num; + struct zram *zram = dev_to_zram(dev); + + if (kstrtoint(buf, 0, &num)) + return -EINVAL; + if (num < 1) + return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't set max_comp_streams for initialized device\n"); + return -EBUSY; + } + zram->max_comp_streams = num; + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -502,6 +536,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) } zcomp_destroy(zram->comp); + zram->max_comp_streams = 1; + zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -537,7 +573,7 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor); + zram->comp = zcomp_create(default_compressor, 
zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", default_compressor); @@ -698,6 +734,8 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, + max_comp_streams_show, max_comp_streams_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -722,6 +760,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_max_comp_streams.attr, NULL, }; @@ -784,6 +823,7 @@ static int create_device(struct zram *zram, int device_id) } zram->meta = NULL; + zram->max_comp_streams = 1; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 45e04f7b713f..ccf36d11755a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -99,7 +99,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - + int max_comp_streams; struct zram_stats stats; }; #endif -- cgit From fe8eb122c82b2049c460fc6df6e8583a2f935cff Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:15 -0700 Subject: zram: add set_max_streams knob This patch allows to change max_comp_streams on initialised zcomp. Introduce zcomp set_max_streams() knob, zcomp_strm_multi_set_max_streams() and zcomp_strm_single_set_max_streams() callbacks to change streams limit for zcomp_strm_multi and zcomp_strm_single, accordingly. set_max_streams for single steam zcomp does nothing. If user has lowered the limit, then zcomp_strm_multi_set_max_streams() attempts to immediately free extra streams (as much as it can, depending on idle streams availability). Note, this patch does not allow to change stream 'policy' from single to multi stream (or vice versa) on already initialised compression backend. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 3 +++ drivers/block/zram/zram_drv.c | 5 ++--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index c06f75f54718..ac276f79f21c 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -136,6 +136,29 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr zcomp_strm_free(comp, zstrm); } +/* change max_strm limit */ +static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + spin_lock(&zs->strm_lock); + zs->max_strm = num_strm; + /* + * if user has lowered the limit and there are idle streams, + * immediately free as much streams (and memory) as we can. 
+ */ + while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + zs->avail_strm--; + } + spin_unlock(&zs->strm_lock); + return 0; +} + static void zcomp_strm_multi_destroy(struct zcomp *comp) { struct zcomp_strm_multi *zs = comp->stream; @@ -158,6 +181,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) comp->destroy = zcomp_strm_multi_destroy; comp->strm_find = zcomp_strm_multi_find; comp->strm_release = zcomp_strm_multi_release; + comp->set_max_streams = zcomp_strm_multi_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -192,6 +216,12 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } +static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +{ + /* zcomp_strm_single support only max_comp_streams == 1 */ + return -ENOTSUPP; +} + static void zcomp_strm_single_destroy(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -206,6 +236,7 @@ static int zcomp_strm_single_create(struct zcomp *comp) comp->destroy = zcomp_strm_single_destroy; comp->strm_find = zcomp_strm_single_find; comp->strm_release = zcomp_strm_single_release; + comp->set_max_streams = zcomp_strm_single_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -220,6 +251,11 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +{ + return comp->set_max_streams(comp, num_strm); +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { return comp->strm_find(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 2a3684446160..bd11d59c5dd1 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,6 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + int (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -60,4 +61,6 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); + +int zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bdc7eb8c6df7..3a5f24c341dc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -133,9 +133,8 @@ static ssize_t max_comp_streams_store(struct device *dev, return -EINVAL; down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); - pr_info("Can't set max_comp_streams for initialized device\n"); - return -EBUSY; + if (zcomp_set_max_streams(zram->comp, num)) + pr_info("Cannot change max compression streams\n"); } zram->max_comp_streams = num; up_write(&zram->init_lock); -- cgit From e46b8a030d76d3c94156c545c3f4c3676d813435 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:17 -0700 Subject: zram: make compression algorithm selection possible Add and document `comp_algorithm' device attribute. 
This attribute allows to show supported compression and currently selected compression algorithms: cat /sys/block/zram0/comp_algorithm [lzo] lz4 and change selected compression algorithm: echo lzo > /sys/block/zram0/comp_algorithm Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-block-zram | 8 +++++++ Documentation/blockdev/zram.txt | 24 +++++++++++++++---- drivers/block/zram/zcomp.c | 32 +++++++++++++++++++++++--- drivers/block/zram/zcomp.h | 2 ++ drivers/block/zram/zram_drv.c | 37 +++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 1 + 6 files changed, 93 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 0da9ed6b82ea..70ec992514d0 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -65,6 +65,14 @@ Description: number of backend's zcomp_strm compression streams (number of concurrent compress operations). +What: /sys/block/zram/comp_algorithm +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The comp_algorithm file is read-write and lets to show + available and selected compression algorithms, change + compression algorithm selection. + What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index aadfe60391b7..2604ffed51db 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -42,7 +42,21 @@ will not take any effect, because single stream compression backend implemented as a special case and does not support dynamic max_comp_streams. Only multi stream backend supports dynamic max_comp_streams adjustment. -3) Set Disksize +3) Select compression algorithm + Using comp_algorithm device attribute one can see available and + currently selected (shown in square brackets) compression algortithms, + change selected compression algorithm (once the device is initialised + there is no way to change compression algorithm). + + Examples: + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +4) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. Examples: @@ -59,14 +73,14 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -4) Activate: +5) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -5) Stats: +6) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -81,11 +95,11 @@ size of the disk when not in use so a huge zram is wasteful. 
compr_data_size mem_used_total -6) Deactivate: +7) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -7) Reset: +8) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index ac276f79f21c..aad533a8bc55 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -39,11 +39,20 @@ struct zcomp_strm_multi { wait_queue_head_t strm_wait; }; +static struct zcomp_backend *backends[] = { + &zcomp_lzo, + NULL +}; + static struct zcomp_backend *find_backend(const char *compress) { - if (strncmp(compress, "lzo", 3) == 0) - return &zcomp_lzo; - return NULL; + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; } static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) @@ -251,6 +260,23 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + else + sz += sprintf(buf + sz, "%s ", backends[i]->name); + i++; + } + sz += sprintf(buf + sz, "\n"); + return sz; +} + int zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index bd11d59c5dd1..8b8997f8613b 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -50,6 +50,8 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; +ssize_t zcomp_available_show(const char *comp, char *buf); + struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3a5f24c341dc..15d46f2e158c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -141,6 +141,34 @@ static ssize_t max_comp_streams_store(struct device *dev, return len; } +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -572,10 +600,10 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); + zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", - default_compressor); + zram->compressor); err = -EINVAL; goto out_free_meta; } @@ -735,6 +763,8 @@ static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, 
mem_used_total_show, NULL); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); +static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, + comp_algorithm_show, comp_algorithm_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -760,6 +790,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, &dev_attr_max_comp_streams.attr, + &dev_attr_comp_algorithm.attr, NULL, }; @@ -820,7 +851,7 @@ static int create_device(struct zram *zram, int device_id) pr_warn("Error creating sysfs group"); goto out_free_disk; } - + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; return 0; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ccf36d11755a..7f21c145e317 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -101,5 +101,6 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + char compressor[10]; }; #endif -- cgit From 6e76668e415adf799839f0ab205142ad7002d260 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:18 -0700 Subject: zram: add lz4 algorithm backend Introduce LZ4 compression backend and make it available for selection. LZ4 support is optional and requires user to set ZRAM_LZ4_COMPRESS config option. The default compression backend is LZO. TEST (x86_64, core i5, 2 cores + 2 hyperthreading, zram disk size 1G, ext4 file system, 3 compression streams) iozone -t 3 -R -r 16K -s 60M -I +Z Test LZO LZ4 ---------------------------------------------- Initial write 1642744.62 1317005.09 Rewrite 2498980.88 1800645.16 Read 3957026.38 5877043.75 Re-read 3950997.38 5861847.00 Reverse Read 2937114.56 5047384.00 Stride read 2948163.19 4929587.38 Random read 3292692.69 4880793.62 Mixed workload 1545602.62 3502940.38 Random write 2448039.75 1758786.25 Pwrite 1670051.03 1338329.69 Pread 2530682.00 5097177.62 Fwrite 3232085.62 3275942.56 Fread 6306880.25 6645271.12 So on my system LZ4 is slower in write-only tests, while it performs better in read-only and mixed (reads + writes) tests. Official LZ4 benchmarks available here http://code.google.com/p/lz4/ (linux kernel uses revision r90). Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/Kconfig | 10 +++++++++ drivers/block/zram/Makefile | 2 ++ drivers/block/zram/zcomp.c | 6 ++++++ drivers/block/zram/zcomp_lz4.c | 47 ++++++++++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp_lz4.h | 17 +++++++++++++++ 5 files changed, 82 insertions(+) create mode 100644 drivers/block/zram/zcomp_lz4.c create mode 100644 drivers/block/zram/zcomp_lz4.h diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 3450be850399..6489c0fd0ea6 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -15,6 +15,16 @@ config ZRAM See zram.txt for more information. +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. 
+ config ZRAM_DEBUG bool "Compressed RAM block device debug support" depends on ZRAM diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index 757c6a5cadff..be0763ff57a2 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,5 @@ zram-y := zcomp_lzo.o zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index aad533a8bc55..d5919031ca8b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -15,6 +15,9 @@ #include "zcomp.h" #include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif /* * single zcomp_strm backend @@ -41,6 +44,9 @@ struct zcomp_strm_multi { static struct zcomp_backend *backends[] = { &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif NULL }; diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000000..f2afb7e988c3 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(void) +{ + return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); +} + +static void zcomp_lz4_destroy(void *private) +{ + kfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000000..60613fb29dd8 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ -- cgit From d61f98c70e8b0d324e8e83be2ed546d6295e63f3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:19 -0700 Subject: zram: move comp allocation out of init_lock While fixing lockdep spew of ->init_lock reported by Sasha Levin [1], Minchan Kim noted [2] that it's better to move compression backend allocation (using GPF_KERNEL) out of the ->init_lock lock, same way as with zram_meta_alloc(), in order to prevent the same lockdep spew. 
[1] https://lkml.org/lkml/2014/2/27/337 [2] https://lkml.org/lkml/2014/3/3/32 Signed-off-by: Sergey Senozhatsky Reported-by: Minchan Kim Acked-by: Minchan Kim Cc: Sasha Levin Acked-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15d46f2e158c..e4d536b485e6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -580,9 +580,10 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u64 disksize; + struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err; + int err = -EINVAL; disksize = memparse(buf, NULL); if (!disksize) @@ -593,30 +594,32 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (!comp) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + goto out_cleanup; + } + down_write(&zram->init_lock); if (init_done(zram)) { + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_free_meta; - } - - zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!zram->comp) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = -EINVAL; - goto out_free_meta; + goto out_cleanup; } zram->meta = meta; + zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); return len; -out_free_meta: - up_write(&zram->init_lock); +out_cleanup: + if (comp) + zcomp_destroy(comp); zram_meta_free(meta); return err; } -- cgit From fcfa8d95cacf5cbbe6dee6b8d229fe86142266e0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:20 -0700 Subject: zram: return error-valued pointer from zcomp_create() Instead of returning just NULL, return ERR_PTR from zcomp_create() if compressing backend creation has failed. ERR_PTR(-EINVAL) for unsupported compression algorithm request, ERR_PTR(-ENOMEM) for allocation (zcomp or compression stream) error. Perform IS_ERR() check of returned from zcomp_create() value in disksize_store() and set return code to PTR_ERR(). Change suggested by Jerome Marchand. [akpm@linux-foundation.org: clean up error recovery flow] Signed-off-by: Sergey Senozhatsky Reported-by: Jerome Marchand Cc: Minchan Kim Cc: Nitin Gupta Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 14 ++++++++------ drivers/block/zram/zram_drv.c | 19 ++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index d5919031ca8b..5647d8fe1dc1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -319,9 +320,10 @@ void zcomp_destroy(struct zcomp *comp) /* * search available compressors for requested algorithm. - * allocate new zcomp and initialize it. return NULL - * if requested algorithm is not supported or in case - * of init error + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. 
ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error. */ struct zcomp *zcomp_create(const char *compress, int max_strm) { @@ -330,11 +332,11 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) backend = find_backend(compress); if (!backend) - return NULL; + return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) - return NULL; + return ERR_PTR(-ENOMEM); comp->backend = backend; if (max_strm > 1) @@ -343,7 +345,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) zcomp_strm_single_create(comp); if (!comp->stream) { kfree(comp); - return NULL; + return ERR_PTR(-ENOMEM); } return comp; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e4d536b485e6..6b462d27e7d7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -583,7 +584,7 @@ static ssize_t disksize_store(struct device *dev, struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err = -EINVAL; + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -595,18 +596,18 @@ static ssize_t disksize_store(struct device *dev, return -ENOMEM; comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!comp) { + if (IS_ERR(comp)) { pr_info("Cannot initialise %s compressing backend\n", zram->compressor); - goto out_cleanup; + err = PTR_ERR(comp); + goto out_free_meta; } down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_cleanup; + goto out_destroy_comp; } zram->meta = meta; @@ -614,12 +615,12 @@ static ssize_t disksize_store(struct device *dev, zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); - return len; -out_cleanup: - if (comp) - zcomp_destroy(comp); +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: zram_meta_free(meta); return err; } -- cgit From 60a726e33375a1096e85399cfa1327081b4c38be Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:21 -0700 Subject: zram: propagate error to user When we initialized zcomp with single, we couldn't change max_comp_streams without zram reset but current interface doesn't show any error to user and even it changes max_comp_streams's value without any effect so it would make user very confusing. This patch prevents max_comp_streams's change when zcomp was initialized as single zcomp and emit the error to user(ex, echo). 
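A hedged sketch of the sysfs store pattern this moves to: reject the write with a negative errno instead of silently ignoring it. The names are illustrative and example_set_streams() is a hypothetical backend hook, not a zram function:

	#include <linux/kernel.h>
	#include <linux/device.h>

	/* hypothetical backend hook: returns false if the limit cannot be changed */
	extern bool example_set_streams(struct device *dev, int num);

	static ssize_t streams_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf, size_t len)
	{
		int num, ret;

		ret = kstrtoint(buf, 0, &num);
		if (ret)
			return ret;		/* e.g. "echo: write error: Invalid argument" */
		if (num < 1)
			return -EINVAL;
		if (!example_set_streams(dev, num))
			return -EINVAL;		/* single-stream backend: refuse the change */
		return len;			/* success: the whole write was consumed */
	}
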
[akpm@linux-foundation.org: don't return with the lock held, per Sergey] [fengguang.wu@intel.com: fix coccinelle warnings] Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 9 +++++---- drivers/block/zram/zcomp.c | 10 +++++----- drivers/block/zram/zcomp.h | 4 ++-- drivers/block/zram/zram_drv.c | 17 +++++++++++++---- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 2604ffed51db..0595c3f56ccf 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -37,10 +37,11 @@ Note: In order to enable compression backend's multi stream support max_comp_streams must be initially set to desired concurrency level before ZRAM device initialisation. Once the device initialised as a single stream compression -backend (max_comp_streams equals to 0) changing the value of max_comp_streams -will not take any effect, because single stream compression backend implemented -as a special case and does not support dynamic max_comp_streams. Only multi -stream backend supports dynamic max_comp_streams adjustment. +backend (max_comp_streams equals to 1), you will see error if you try to change +the value of max_comp_streams because single stream compression backend +implemented as a special case by lock overhead issue and does not support +dynamic max_comp_streams. Only multi stream backend supports dynamic +max_comp_streams adjustment. 3) Select compression algorithm Using comp_algorithm device attribute one can see available and diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5647d8fe1dc1..b0e7592c44d8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -153,7 +153,7 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr } /* change max_strm limit */ -static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) { struct zcomp_strm_multi *zs = comp->stream; struct zcomp_strm *zstrm; @@ -172,7 +172,7 @@ static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) zs->avail_strm--; } spin_unlock(&zs->strm_lock); - return 0; + return true; } static void zcomp_strm_multi_destroy(struct zcomp *comp) @@ -232,10 +232,10 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } -static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) { /* zcomp_strm_single support only max_comp_streams == 1 */ - return -ENOTSUPP; + return false; } static void zcomp_strm_single_destroy(struct zcomp *comp) @@ -284,7 +284,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } -int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 8b8997f8613b..c59d1fca72c0 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,7 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - int (*set_max_streams)(struct 
zcomp *comp, int num_strm); + bool (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -64,5 +64,5 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); -int zcomp_set_max_streams(struct zcomp *comp, int num_strm); +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6b462d27e7d7..80a1cfca1bf0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -127,19 +127,28 @@ static ssize_t max_comp_streams_store(struct device *dev, { int num; struct zram *zram = dev_to_zram(dev); + int ret; - if (kstrtoint(buf, 0, &num)) - return -EINVAL; + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; if (num < 1) return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { - if (zcomp_set_max_streams(zram->comp, num)) + if (!zcomp_set_max_streams(zram->comp, num)) { pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } } + zram->max_comp_streams = num; + ret = len; +out: up_write(&zram->init_lock); - return len; + return ret; } static ssize_t comp_algorithm_show(struct device *dev, -- cgit From 56b4e8cb85827a2ccc4752a2a7148e56b62b7e96 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:22 -0700 Subject: zram: use scnprintf() in attrs show() methods sysfs.txt documentation lists the following requirements: - The buffer will always be PAGE_SIZE bytes in length. On i386, this is 4096. - show() methods should return the number of bytes printed into the buffer. This is the return value of scnprintf(). - show() should always use scnprintf(). Use scnprintf() in show() functions. 
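The resulting show() shape, as a minimal sketch with an illustrative attribute (not one of the zram attributes): scnprintf() is bounded by the PAGE_SIZE buffer that sysfs provides and returns the number of bytes actually written.

	#include <linux/kernel.h>
	#include <linux/device.h>
	#include <linux/mm.h>

	static ssize_t value_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
	{
		/* sysfs hands show() a PAGE_SIZE buffer; scnprintf() never overruns it
		 * and returns the number of characters written, excluding the NUL */
		return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
	}
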
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 8 +++++--- drivers/block/zram/zram_drv.c | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b0e7592c44d8..f1ff39a3d1c1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -275,12 +275,14 @@ ssize_t zcomp_available_show(const char *comp, char *buf) while (backends[i]) { if (sysfs_streq(comp, backends[i]->name)) - sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); else - sz += sprintf(buf + sz, "%s ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); i++; } - sz += sprintf(buf + sz, "\n"); + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 80a1cfca1bf0..15e7b8e64cbb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -48,7 +48,7 @@ static ssize_t zram_attr_##name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ - return sprintf(b, "%llu\n", \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ static struct device_attribute dev_attr_##name = \ @@ -69,7 +69,7 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", zram->disksize); + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } static ssize_t initstate_show(struct device *dev, @@ -82,7 +82,7 @@ static ssize_t initstate_show(struct device *dev, val = init_done(zram); up_read(&zram->init_lock); - return sprintf(buf, "%u\n", val); + return scnprintf(buf, PAGE_SIZE, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -90,7 +90,7 @@ static ssize_t orig_data_size_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } @@ -106,7 +106,7 @@ static ssize_t mem_used_total_show(struct device *dev, val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t max_comp_streams_show(struct device *dev, @@ -119,7 +119,7 @@ static ssize_t max_comp_streams_show(struct device *dev, val = zram->max_comp_streams; up_read(&zram->init_lock); - return sprintf(buf, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t max_comp_streams_store(struct device *dev, -- cgit From f4659d8e620d08bd1a84a8aec5d2f5294a242764 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:38:24 -0700 Subject: zram: support REQ_DISCARD zram is ram based block device and can be used by backend of filesystem. When filesystem deletes a file, it normally doesn't do anything on data block of that file. It just marks on metadata of that file. This behavior has no problem on disk based block device, but has problems on ram based block device, since we can't free memory used for data block. To overcome this disadvantage, there is REQ_DISCARD functionality. 
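(As a rough sketch of what handling such a request can look like in a 3.14-era bio-based ram-disk driver; the helpers and names here are illustrative and the real change follows below:)

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	struct example_dev;				/* stand-in for the driver's device */
	extern void example_free_page(struct example_dev *dev, u32 index);

	static void example_make_request(struct example_dev *dev, struct bio *bio)
	{
		if (unlikely(bio->bi_rw & REQ_DISCARD)) {
			/* 512-byte sectors converted to PAGE_SIZE units */
			u32 index = bio->bi_iter.bi_sector >> (PAGE_SHIFT - 9);
			size_t n = bio->bi_iter.bi_size;

			while (n >= PAGE_SIZE) {
				example_free_page(dev, index++);
				n -= PAGE_SIZE;
			}
			bio_endio(bio, 0);		/* 3.14-era two-argument form */
			return;
		}
		/* ... normal read/write path ... */
	}
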
If block device support REQ_DISCARD and filesystem is mounted with discard option, filesystem sends REQ_DISCARD to block device whenever some data blocks are discarded. All we have to do is to handle this request. This patch implements to flag up QUEUE_FLAG_DISCARD and handle this REQ_DISCARD request. With it, we can free memory used by zram if it isn't used. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Joonsoo Kim Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 15e7b8e64cbb..9849b5233bf4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -551,6 +551,47 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_iter.bi_size; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n < offset) + return; + + n -= offset; + index++; + } + + while (n >= PAGE_SIZE) { + /* + * Discard request can be large so the lock hold times could be + * lengthy. So take the lock once per page. + */ + write_lock(&zram->meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&zram->meta->tb_lock); + index++; + n -= PAGE_SIZE; + } +} + static void zram_reset_device(struct zram *zram, bool reset_capacity) { size_t index; @@ -686,6 +727,12 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -855,6 +902,21 @@ static int create_device(struct zram *zram, int device_id) ZRAM_LOGICAL_BLOCK_SIZE); blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. 
+ */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); add_disk(zram->disk); -- cgit From 6b4525164e247e29f48b3a69e3d35f60fab50ae5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Apr 2014 15:38:25 -0700 Subject: mm/zswap.c: fix trivial typo and arrange indentation Signed-off-by: SeongJae Park Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index e55bab9dc41f..5b2245324715 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -160,14 +160,14 @@ static void zswap_comp_exit(void) * rbnode - links the entry into red-black tree for the appropriate swap type * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code - * concurent calls to load, invalidate, and writeback. The lock + * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. * handle - zsmalloc allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression */ struct zswap_entry { struct rb_node rbnode; -- cgit From 6335b19344cc263724ae49a76ed930b21a659055 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Apr 2014 15:38:26 -0700 Subject: mm/zswap.c: update zsmalloc in comment to zbud zswap used zsmalloc before and now using zbud. But, some comments saying it use zsmalloc yet. Fix the trivial problems. Signed-off-by: SeongJae Park Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5b2245324715..25312eb373a0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -165,7 +165,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zsmalloc allocation handle that stores the compressed page data + * handle - zbud allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during * decompression */ @@ -282,7 +282,7 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * Carries out the common pattern of freeing and entry's zbud allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_tree *tree, -- cgit From 60105e1248f571aa3b895cd63bef072ed9d90c77 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:27 -0700 Subject: mm/zswap: support multiple swap devices Cai Liu reporeted that now zbud pool pages counting has a problem when multiple swap is used because it just counts only one swap intead of all of swap so zswap cannot control writeback properly. 
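(The accounting problem boils down to checking fullness against one pool shared by all swap devices rather than whichever pool the current device happens to own. A minimal 3.14-era sketch of that check, with illustrative names:)

	#include <linux/mm.h>
	#include <linux/zbud.h>

	/* one pool shared by every swap device, so the limit reflects the total
	 * compressed footprint and LRU order is kept across devices */
	static struct zbud_pool *shared_pool;
	static unsigned int max_pool_percent = 20;

	static bool pool_is_full(void)
	{
		return totalram_pages * max_pool_percent / 100 <
			zbud_get_pool_size(shared_pool);
	}
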
The result is unnecessary writeback or no writeback when we should really writeback. IOW, it made zswap crazy. Another problem in zswap is: For example, let's assume we use two swap A and B with different priority and A already has charged 19% long time ago and let's assume that A swap is full now so VM start to use B so that B has charged 1% recently. It menas zswap charged (19% + 1%) is full by default. Then, if VM want to swap out more pages into B, zbud_reclaim_page would be evict one of pages in B's pool and it would be repeated continuously. It's totally LRU reverse problem and swap thrashing in B would happen. This patch makes zswap consider mutliple swap by creating *a* zbud pool which will be shared by multiple swap so all of zswap pages in multiple swap keep order by LRU so it can prevent above two problems. Signed-off-by: Minchan Kim Reported-by: Cai Liu Suggested-by: Weijie Yang Cc: Seth Jennings Reviewed-by: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 64 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 25312eb373a0..c0c9b7c80c05 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* zbud_pool is shared by all of zswap backend */ +static struct zbud_pool *zswap_pool; + /********************************* * compression functions **********************************/ @@ -189,7 +192,6 @@ struct zswap_header { struct zswap_tree { struct rb_root rbroot; spinlock_t lock; - struct zbud_pool *pool; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; @@ -285,13 +287,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) * Carries out the common pattern of freeing and entry's zbud allocation, * freeing the entry itself, and decrementing the number of stored pages. 
*/ -static void zswap_free_entry(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(tree->pool, entry->handle); + zbud_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_pages = zbud_get_pool_size(zswap_pool); } /* caller must hold the tree lock */ @@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree, BUG_ON(refcount < 0); if (refcount == 0) { zswap_rb_erase(&tree->rbroot, entry); - zswap_free_entry(tree, entry); + zswap_free_entry(entry); } } @@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) zbud_unmap(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); - BUG_ON(pool != tree->pool); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + + src = (u8 *)zbud_map(zswap_pool, entry->handle) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zbud_unmap(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(tree->pool, 8)) { + if (zbud_reclaim_page(zswap_pool, 8)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(tree->pool, handle); + zhdr = zbud_map(zswap_pool, handle); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(tree->pool, handle); + zbud_unmap(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_pages = zbud_get_pool_size(zswap_pool); return 0; @@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + + src = (u8 *)zbud_map(zswap_pool, entry->handle) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zbud_unmap(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type) /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(tree, entry); + 
zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); - - zbud_destroy_pool(tree->pool); kfree(tree); zswap_trees[type] = NULL; } @@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type) struct zswap_tree *tree; tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); - if (!tree) - goto err; - tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); - if (!tree->pool) - goto freetree; + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; - return; - -freetree: - kfree(tree); -err: - pr_err("alloc failed, zswap disabled for swap type %d\n", type); } static struct frontswap_ops zswap_frontswap_ops = { @@ -907,9 +899,16 @@ static int __init init_zswap(void) return 0; pr_info("loading zswap\n"); + + zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + if (!zswap_pool) { + pr_err("zbud pool creation failed\n"); + goto error; + } + if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); - goto error; + goto cachefail; } if (zswap_comp_init()) { pr_err("compressor initialization failed\n"); @@ -919,6 +918,7 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto pcpufail; } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); @@ -927,6 +927,8 @@ pcpufail: zswap_comp_exit(); compfail: zswap_entry_cache_destory(); +cachefail: + zbud_destroy_pool(zswap_pool); error: return -ENOMEM; } -- cgit From 5d2d42de185b77b4bc60b5ac77386c4099d71bf3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Apr 2014 15:38:28 -0700 Subject: mm/zswap.c: remove unnecessary parentheses Fix following trivial checkpatch error: ERROR: return is not a function, parentheses are not required Signed-off-by: SeongJae Park Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index c0c9b7c80c05..34b75cc70827 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -204,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache; static int zswap_entry_cache_create(void) { zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); - return (zswap_entry_cache == NULL); + return zswap_entry_cache == NULL; } static void zswap_entry_cache_destory(void) @@ -408,8 +408,8 @@ cleanup: **********************************/ static bool zswap_is_full(void) { - return (totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages); + return totalram_pages * zswap_max_pool_percent / 100 < + zswap_pool_pages; } /********************************* -- cgit From c39df5fa37b0623589508c95515b4aa1531c524e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:29 -0700 Subject: exit: call disassociate_ctty() before exit_task_namespaces() Commit 8aac62706ada ("move exit_task_namespaces() outside of exit_notify()") breaks pppd and the exiting service crashes the kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: ppp_register_channel+0x13/0x20 [ppp_generic] Call Trace: ppp_asynctty_open+0x12b/0x170 [ppp_async] tty_ldisc_open.isra.2+0x27/0x60 tty_ldisc_hangup+0x1e3/0x220 __tty_hangup+0x2c4/0x440 disassociate_ctty+0x61/0x270 do_exit+0x7f2/0xa50 ppp_register_channel() needs ->net_ns and current->nsproxy == NULL. 
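(In other words, the constraint is purely one of ordering inside do_exit(). A simplified sketch of the requirement, not the actual exit path:)

	#include <linux/sched.h>
	#include <linux/tty.h>
	#include <linux/nsproxy.h>

	/* simplified: the tty hangup triggered by disassociate_ctty() may re-open
	 * a line discipline that dereferences current->nsproxy->net_ns, so the
	 * namespaces must still be alive at that point */
	static void exit_ordering_sketch(struct task_struct *tsk, bool group_dead)
	{
		if (group_dead)
			disassociate_ctty(1);	/* may reach ppp_register_channel() */
		exit_task_namespaces(tsk);	/* only now does ->nsproxy go away */
	}
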
Move disassociate_ctty() before exit_task_namespaces(), it doesn't make sense to delay it after perf_event_exit_task() or cgroup_exit(). This also allows to use task_work_add() inside the (nontrivial) code paths in disassociate_ctty(). Investigated by Peter Hurley. Signed-off-by: Oleg Nesterov Reported-by: Sree Harsha Totakura Cc: Peter Hurley Cc: Sree Harsha Totakura Cc: "Eric W. Biederman" Cc: Jeff Dike Cc: Ingo Molnar Cc: Andrey Vagin Cc: Al Viro Cc: [v3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 6480d1c85d7a..11f9e39a7368 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -784,6 +784,8 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); @@ -799,13 +801,9 @@ void do_exit(long code) cgroup_exit(tsk); - if (group_dead) - disassociate_ctty(1); - module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ -- cgit From 4bcb8232cf4eb061b086c10f56b6808adcdb5a93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:30 -0700 Subject: exit: move check_stack_usage() to the end of do_exit() It is not clear why check_stack_usage() is called so early and thus it never checks the stack usage in, say, exit_notify() or flush_ptrace_hw_breakpoint() or other functions which are only called by do_exit(). Move the callsite down to the last preempt_disable/schedule. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index 11f9e39a7368..171c9a9d7b00 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -788,7 +788,6 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -842,6 +841,7 @@ void do_exit(long code) validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); -- cgit From ef9823939e5acd5d323ff61fbc427ef998dd203e Mon Sep 17 00:00:00 2001 From: Guillaume Morin Date: Mon, 7 Apr 2014 15:38:31 -0700 Subject: kernel/exit.c: call proc_exit_connector() after exit_state is set The process events connector delivers a notification when a process exits. This is really convenient for a process that spawns and wants to monitor its children through an epoll-able() interface. Unfortunately, there is a small window between when the event is delivered and the child become wait()-able. This is creates a race if the parent wants to make sure that it knows about the exit, e.g pid_t pid = fork(); if (pid > 0) { register_interest_for_pid(pid); if (waitpid(pid, NULL, WNOHANG) > 0) { /* We might have raced with exit() */ } return; } /* Child */ execve(...) register_interest_for_pid() would be telling the the connector socket reader to pay attention to events related to pid. Though this is not a bug, I think it would make the connector a bit more usable if this race was closed by simply moving the call to proc_exit_connector() from just before exit_notify() to right after. Oleg said: : Even with this patch the code above is still "racy" if the child is : multi-threaded. Plus it should obviously filter-out subthreads. 
And : afaics there is no way to make it reliable, even if you change the code : above so that waitpid() is called only after the last thread exits WNOHANG : still can fail. Signed-off-by: Guillaume Morin Cc: Matt Helsley Cc: Oleg Nesterov Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index 171c9a9d7b00..decf648574f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -802,13 +802,13 @@ void do_exit(long code) module_put(task_thread_info(tsk)->exec_domain->module); - proc_exit_connector(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); -- cgit From f0b5664ba770190fa528ba72f8f6294cca797103 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 7 Apr 2014 15:38:32 -0700 Subject: fs/proc/meminfo: meminfo_proc_show(): fix typo in comment It should read "reclaimable slab" and not "reclaimable swap". Signed-off-by: Luiz Capitulino Reviewed-by: Rik van Riel Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 136e548d9567..7445af0b1aa3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) available += pagecache; /* - * Part of the reclaimable swap consists of items that are in use, + * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. */ available += global_page_state(NR_SLAB_RECLAIMABLE) - -- cgit From 49d063cb353265c3af701bab215ac438ca7df36d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 7 Apr 2014 15:38:34 -0700 Subject: proc: show mnt_id in /proc/pid/fdinfo Currently we don't have a way how to determing from which mount point file has been opened. This information is required for proper dumping and restoring file descriptos due to presence of mount namespaces. It's possible, that two file descriptors are opened using the same paths, but one fd references mount point from one namespace while the other fd -- from other namespace. $ ls -l /proc/1/fd/1 lrwx------ 1 root root 64 Mar 19 23:54 /proc/1/fd/1 -> /dev/null $ cat /proc/1/fdinfo/1 pos: 0 flags: 0100002 mnt_id: 16 $ cat /proc/1/mountinfo | grep ^16 16 32 0:4 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=1013356k,nr_inodes=253339,mode=755 Signed-off-by: Andrey Vagin Acked-by: Pavel Emelyanov Acked-by: Cyrill Gorcunov Cc: Rob Landley Cc: Al Viro Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 17 ++++++++++++----- fs/proc/fd.c | 6 ++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index f00bee144add..8b9cd8eb3f91 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1648,18 +1648,21 @@ pids, so one need to either stop or freeze processes being inspected if precise results are needed. 
-3.7 /proc//fdinfo/ - Information about opened file +3.8 /proc//fdinfo/ - Information about opened file --------------------------------------------------------------- This file provides information associated with an opened file. The regular -files have at least two fields -- 'pos' and 'flags'. The 'pos' represents -the current offset of the opened file in decimal form [see lseek(2) for -details] and 'flags' denotes the octal O_xxx mask the file has been -created with [see open(2) for details]. +files have at least three fields -- 'pos', 'flags' and mnt_id. The 'pos' +represents the current offset of the opened file in decimal form [see lseek(2) +for details], 'flags' denotes the octal O_xxx mask the file has been +created with [see open(2) for details] and 'mnt_id' represents mount ID of +the file system containing the opened file [see 3.5 /proc//mountinfo +for details]. A typical output is pos: 0 flags: 0100002 + mnt_id: 19 The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags pair provide additional information particular to the objects they represent. @@ -1668,6 +1671,7 @@ pair provide additional information particular to the objects they represent. ~~~~~~~~~~~~~ pos: 0 flags: 04002 + mnt_id: 9 eventfd-count: 5a where 'eventfd-count' is hex value of a counter. @@ -1676,6 +1680,7 @@ pair provide additional information particular to the objects they represent. ~~~~~~~~~~~~~~ pos: 0 flags: 04002 + mnt_id: 9 sigmask: 0000000000000200 where 'sigmask' is hex value of the signal mask associated @@ -1685,6 +1690,7 @@ pair provide additional information particular to the objects they represent. ~~~~~~~~~~~ pos: 0 flags: 02 + mnt_id: 9 tfd: 5 events: 1d data: ffffffffffffffff where 'tfd' is a target file descriptor number in decimal form, @@ -1718,6 +1724,7 @@ pair provide additional information particular to the objects they represent. pos: 0 flags: 02 + mnt_id: 9 fanotify flags:10 event-flags:0 fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003 fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4 diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 985ea881b5bc..0788d093f5d8 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -11,6 +11,7 @@ #include +#include "../mount.h" #include "internal.h" #include "fd.h" @@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v) } if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", - (long long)file->f_pos, f_flags); + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); if (file->f_op->show_fdinfo) ret = file->f_op->show_fdinfo(m, file); fput(file); -- cgit From 1c44dbc82f75aabc5de95da92b304393a94751fc Mon Sep 17 00:00:00 2001 From: Monam Agarwal Date: Mon, 7 Apr 2014 15:38:35 -0700 Subject: fs/proc/inode.c: use RCU_INIT_POINTER(x, NULL) Replace rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL) The rcu_assign_pointer() ensures that the initialization of a structure is carried out before storing a pointer to that structure. And in the case of the NULL pointer, there is no structure to initialize. So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL) Signed-off-by: Monam Agarwal Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8f20e3404fd2..0adbc02d60e3 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode) pde_put(de); head = PROC_I(inode)->sysctl; if (head) { - rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } /* Release any associated namespace */ -- cgit From 35a35046e4f9d8849e727b0e0f6edac0ece4ca6e Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 7 Apr 2014 15:38:36 -0700 Subject: procfs: make /proc/*/{stack,syscall,personality} 0400 These procfs files contain sensitive information and currently their mode is 0444. Change this to 0400, so the VFS will be able to block unprivileged processes from getting file descriptors on arbitrary privileged /proc/*/{stack,syscall,personality} files. This reduces the scope of ASLR leaking and bypasses by protecting already running processes. Signed-off-by: Djalal Harouni Acked-by: Kees Cook Acked-by: Andy Lutomirski Cc: Eric W. Biederman Cc: Al Viro Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index b9760628e1fd..a08c92289357 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2588,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), @@ -2598,7 +2598,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), @@ -2626,7 +2626,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), @@ -2927,14 +2927,14 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), @@ -2964,7 +2964,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), 
#endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), -- cgit From 32ed74a4b968a4faff7aaaff557035ce5d5e70ab Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 7 Apr 2014 15:38:38 -0700 Subject: procfs: make /proc/*/pagemap 0400 The /proc/*/pagemap contain sensitive information and currently its mode is 0444. Change this to 0400, so the VFS will prevent unprivileged processes from getting file descriptors on arbitrary privileged /proc/*/pagemap files. This reduces the scope of address space leaking and bypasses by protecting already running processes. Signed-off-by: Djalal Harouni Acked-by: Kees Cook Acked-by: Andy Lutomirski Cc: Eric W. Biederman Cc: Al Viro Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index a08c92289357..8da60e768b42 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2617,7 +2617,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2955,7 +2955,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), -- cgit From 23aebe1691a3d98a79676db6c0fd813e16478804 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:39 -0700 Subject: exec: kill bprm->tcomm[], simplify the "basename" logic Starting from commit c4ad8f98bef7 ("execve: use 'struct filename *' for executable name passing") bprm->filename can not go away after flush_old_exec(), so we do not need to save the binary name in bprm->tcomm[] added by 96e02d158678 ("exec: fix use-after-free bug in setup_new_exec()"). And there was never need for filename_to_taskname-like code, we can simply do set_task_comm(kbasename(filename). This patch has to change set_task_comm() and trace_task_rename() to accept "const char *", but I think this change is also good. 
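As a small aside on the helper involved: kbasename() (from linux/string.h) returns a pointer into the same string just past the last '/', and set_task_comm() already truncates to TASK_COMM_LEN, so no intermediate buffer is needed. A trivial sketch:

	#include <linux/string.h>
	#include <linux/printk.h>

	static void kbasename_sketch(void)
	{
		const char *path = "/usr/sbin/pppd";
		const char *comm = kbasename(path);	/* points at "pppd" */

		pr_info("comm would be %s\n", comm);
	}
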
Signed-off-by: Oleg Nesterov Cc: Heiko Carstens Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 ++------------------- include/linux/binfmts.h | 1 - include/linux/sched.h | 2 +- include/trace/events/task.h | 2 +- 4 files changed, 4 insertions(+), 22 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b60ccf969a8b..9e81c630dfa7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1046,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, char *buf) +void set_task_comm(struct task_struct *tsk, const char *buf) { task_lock(tsk); trace_task_rename(tsk, buf); @@ -1055,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf) perf_event_comm(tsk); } -static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) -{ - int i, ch; - - /* Copies the binary name from after last slash */ - for (i = 0; (ch = *(fn++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < len - 1) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; -} - int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1083,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; set_mm_exe_file(bprm->mm, bprm->file); - - filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1127,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - set_task_comm(current, bprm->tcomm); + set_task_comm(current, kbasename(bprm->filename)); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b4a745d7d9a9..61f29e5ea840 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -44,7 +44,6 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; - char tcomm[TASK_COMM_LEN]; }; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 diff --git a/include/linux/sched.h b/include/linux/sched.h index 6c70645eb3b6..f8497059f88c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2357,7 +2357,7 @@ extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, i struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern void set_task_comm(struct task_struct *tsk, char *from); +extern void set_task_comm(struct task_struct *tsk, const char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP diff --git a/include/trace/events/task.h b/include/trace/events/task.h index 102a646e1996..dee3bb1d5a6b 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -32,7 +32,7 @@ TRACE_EVENT(task_newtask, TRACE_EVENT(task_rename, - TP_PROTO(struct task_struct *task, char *comm), + TP_PROTO(struct task_struct *task, const char *comm), TP_ARGS(task, comm), -- cgit From dfccbb5e49a621c1b21a62527d61fc4305617aca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:41 -0700 Subject: wait: fix reparent_leader() vs EXIT_DEAD->EXIT_ZOMBIE race wait_task_zombie() first does EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. 
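(The follow-up commits below converge on claiming the zombie with a single cmpxchg so that exactly one waiter owns the final transition. A minimal sketch of that idiom, assuming the 3.14 task_struct layout:)

	#include <linux/sched.h>
	#include <linux/atomic.h>

	/* whoever wins the cmpxchg owns p's final state; everyone else backs off */
	static bool claim_zombie(struct task_struct *p, int new_state)
	{
		return cmpxchg(&p->exit_state, EXIT_ZOMBIE, new_state) == EXIT_ZOMBIE;
	}
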
The last transition is racy, this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error but we can't rely on ptrace_reparented(), debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which were missed before: this transition can also race with reparent_leader() which doesn't reset >exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. Change reparent_leader() to update ->exit_signal even if EXIT_DEAD. Note: this is the simple temporary hack for -stable, it doesn't try to solve all problems, it will be reverted by the next changes. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index decf648574f6..e354cbb13a9b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,9 +560,6 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -570,9 +567,19 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* + * We don't want people slaying init. + * + * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() + * can change ->exit_state to EXIT_ZOMBIE. If this is the final + * state, do_notify_parent() was already called and ->exit_signal + * doesn't matter. + */ p->exit_signal = SIGCHLD; + if (p->exit_state == EXIT_DEAD) + return; + /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { -- cgit From abd50b39e783e1b6c75c7534c37f1eb2d94a89cd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:42 -0700 Subject: wait: introduce EXIT_TRACE to avoid the racy EXIT_DEAD->EXIT_ZOMBIE transition wait_task_zombie() first does EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. The last transition is racy, this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error but we can't rely on ptrace_reparented(), debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which were missed before: this transition can also race with reparent_leader() which doesn't reset >exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. This was fixed by the previous commit, but it was the temporary hack. 1. 
Add the new exit_state, EXIT_TRACE. It means that the task is the traced zombie, debugger is going to detach and notify its natural parent. This new state is actually EXIT_ZOMBIE | EXIT_DEAD. This way we can avoid the changes in proc/kgdb code, get_task_state() still reports "X (dead)" in this case. Note: with or without this change userspace can see Z -> X -> Z transition. Not really bad, but probably makes sense to fix. 2. Change wait_task_zombie() to use EXIT_TRACE instead of EXIT_DEAD if we need to notify the ->real_parent. 3. Revert the previous hack in reparent_leader(), now that EXIT_DEAD is always the final state we can safely ignore such a task. 4. Change wait_consider_task() to check EXIT_TRACE separately and kill the racy and no longer needed ptrace_reparented() case. If ptrace == T an EXIT_TRACE thread should be simply ignored, the owner of this state is going to ptrace_unlink() this task. We can pretend that it was already removed from ->ptraced list. Otherwise we should skip this thread too but clear ->notask_error, we must be the natural parent and debugger is going to untrace and notify us. IOW, this doesn't differ from "EXIT_ZOMBIE && p->ptrace" even if the task was already untraced. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/exit.c | 50 +++++++++++++++++++++----------------------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f8497059f88c..7781de5e5e7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -212,6 +212,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); /* in tsk->exit_state */ #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* in tsk->state again */ #define TASK_DEAD 64 #define TASK_WAKEKILL 128 diff --git a/kernel/exit.c b/kernel/exit.c index e354cbb13a9b..022a0ff17318 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,6 +560,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); + + if (p->exit_state == EXIT_DEAD) + return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -567,19 +570,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* - * We don't want people slaying init. - * - * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() - * can change ->exit_state to EXIT_ZOMBIE. If this is the final - * state, do_notify_parent() was already called and ->exit_signal - * doesn't matter. - */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; - if (p->exit_state == EXIT_DEAD) - return; - /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { @@ -1043,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return wait_noreap_copyout(wo, p, pid, uid, why, status); } + traced = ptrace_reparented(p); /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. 
*/ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = traced ? EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } - - traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check * thread_group_leader() to filter out sub-threads. @@ -1114,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. + * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); @@ -1159,14 +1148,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * If this is not a sub-thread, notify the parent. * If parent wants a zombie, don't release it now. */ + state = EXIT_DEAD; if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + !do_notify_parent(p, p->exit_signal)) + state = EXIT_ZOMBIE; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1362,12 +1351,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + if (unlikely(p->exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } -- cgit From b436069059fede30ca31d4bf439cc86436ff5b1d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:43 -0700 Subject: wait: use EXIT_TRACE only if thread_group_leader(zombie) wait_task_zombie() always uses EXIT_TRACE/ptrace_unlink() if ptrace_reparented(). This is suboptimal and a bit confusing: we do not need do_notify_parent(p) if !thread_group_leader(p) and in this case we also do not need ptrace_unlink(), we can rely on ptrace_release_task(). Change wait_task_zombie() to check thread_group_leader() along with ptrace_reparented() and simplify the final p->exit_state transition. Signed-off-by: Oleg Nesterov Tested-by: Michal Schmidt Cc: Jan Kratochvil Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 022a0ff17318..4773ed990907 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1040,7 +1040,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced ? EXIT_TRACE : EXIT_DEAD; + state = traced && thread_group_leader(p) ? 
EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* @@ -1140,18 +1140,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - state = EXIT_DEAD; - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) - state = EXIT_ZOMBIE; + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } -- cgit From b3ab03160dfaf8ab78d476b670de319f4c1a5685 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:45 -0700 Subject: wait: completely ignore the EXIT_DEAD tasks Now that EXIT_DEAD is the terminal state it doesn't make sense to call eligible_child() or security_task_wait() if the task is really dead. Signed-off-by: Oleg Nesterov Tested-by: Michal Schmidt Cc: Jan Kratochvil Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 4773ed990907..33cf8dba0a61 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1329,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + int ret; + + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, p); if (!ret) return ret; @@ -1347,10 +1352,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) - return 0; - if (unlikely(p->exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case -- cgit From ad86622b478eaafdc25b74237df82b10fce6326d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:46 -0700 Subject: wait: swap EXIT_ZOMBIE and EXIT_DEAD to hide EXIT_TRACE from user-space get_task_state() uses the most significant bit to report the state to user-space, this means that EXIT_ZOMBIE->EXIT_TRACE->EXIT_DEAD transition can be noticed via /proc as Z -> X -> Z change. Note that this was possible even before EXIT_TRACE was introduced. This is not really bad but imho it make sense to hide EXIT_TRACE from user-space completely. So the patch simply swaps EXIT_ZOMBIE and EXIT_DEAD, this way EXIT_TRACE will be seen as EXIT_ZOMBIE by user-space. 
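A minimal userspace sketch of the reporting rule this relies on (illustrative only; the real selection is done by get_task_state() indexing task_state_array, and the constants below simply mirror the patched sched.h values):

/* The most significant set bit wins, so EXIT_TRACE (= EXIT_ZOMBIE | EXIT_DEAD)
 * is reported as whichever flag has the larger value.  After the swap that is
 * EXIT_ZOMBIE, so user space keeps seeing a plain "Z". */
#include <stdio.h>

#define EXIT_DEAD	16	/* swapped: was 32 */
#define EXIT_ZOMBIE	32	/* swapped: was 16 */
#define EXIT_TRACE	(EXIT_ZOMBIE | EXIT_DEAD)

static const char *report(unsigned int exit_state)
{
	if (exit_state & EXIT_ZOMBIE)		/* higher bit checked first */
		return "Z (zombie)";
	if (exit_state & EXIT_DEAD)
		return "X (dead)";
	return "R (running)";
}

int main(void)
{
	printf("EXIT_TRACE reports as: %s\n", report(EXIT_TRACE));
	return 0;
}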
Signed-off-by: Oleg Nesterov Cc: Jan Kratochvil Cc: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++-- include/linux/sched.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 656e401794de..64db2bceac59 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -138,8 +138,8 @@ static const char * const task_state_array[] = { "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ + "X (dead)", /* 16 */ + "Z (zombie)", /* 32 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7781de5e5e7b..075b3056c0c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -210,8 +210,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #define __TASK_STOPPED 4 #define __TASK_TRACED 8 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_DEAD 16 +#define EXIT_ZOMBIE 32 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* in tsk->state again */ #define TASK_DEAD 64 -- cgit From 377d75dafa07ee0da64223c9169f4e17b26c2b9a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:47 -0700 Subject: wait: WSTOPPED|WCONTINUED hangs if a zombie child is traced by real_parent "A zombie is only visible to its ptracer" logic in wait_consider_task() is very wrong. Trivial test-case: #include #include #include #include int main(void) { int child = fork(); if (!child) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); return 0x23; } assert(waitid(P_ALL, child, NULL, WEXITED | WNOWAIT) == 0); assert(waitid(P_ALL, 0, NULL, WSTOPPED) == -1); return 0; } it hangs in waitpid(WSTOPPED) despite the fact it has a single zombie child. This is because wait_consider_task(ptrace => 0) sees p->ptrace and cleares ->notask_error assuming that the debugger should detach and notify us. Change wait_consider_task(ptrace => 0) to pretend that ptrace == T if the child is traced by us. This really simplifies the logic and allows us to do more fixes, see the next changes. This also hides the unwanted group stop state automatically, we can remove another ptrace_reparented() check. Unfortunately, this adds the following behavioural changes: 1. Before this patch wait(WEXITED | __WNOTHREAD) does not reap a natural child if it is traced by the caller's sub-thread. Hopefully nobody will ever notice this change, and I think that nobody should rely on this behaviour anyway. 2. SIGNAL_STOP_CONTINUED is no longer hidden from debugger if it is real parent. While this change comes as a side effect, I think it is good by itself. The group continued state can not be consumed by another process in this case, it doesn't depend on ptrace, it doesn't make sense to hide it from real parent. Perhaps we should add the thread_group_leader() check before wait_task_continued()? May be, but this shouldn't depend on ptrace_reparented(). 
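For reference, a self-contained build of the test-case above; the original #include lines are not preserved in this log, so the header list here is an assumption, the program body is unchanged:

#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>
#include <assert.h>
#include <stddef.h>

int main(void)
{
	int child = fork();

	if (!child) {
		assert(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
		return 0x23;
	}

	assert(waitid(P_ALL, child, NULL, WEXITED | WNOWAIT) == 0);
	assert(waitid(P_ALL, 0, NULL, WSTOPPED) == -1);	/* hangs without this fix */
	return 0;
}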
Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Jan Kratochvil Cc: Lennart Poettering Cc: Michal Schmidt Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 33cf8dba0a61..92d38d4da4b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1362,6 +1362,22 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. + */ + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { /* @@ -1402,19 +1418,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { - /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. -- cgit From 7c733eb3eac0e3d091aaf37c183d2175eeebfb2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:49 -0700 Subject: wait: WSTOPPED|WCONTINUED doesn't work if a zombie leader is traced by another process Even if the main thread is dead the process still can stop/continue. However, if the leader is ptraced wait_consider_task(ptrace => false) always skips wait_task_stopped/wait_task_continued, so WSTOPPED or WCONTINUED can never work for the natural parent in this case. Move the "A zombie ptracee is only visible to its ptracer" check into the "if (!delay_group_leader(p))" block. ->notask_error is cleared by the "fall through" code below. This depends on the previous change, wait_task_stopped/continued must be avoided if !delay_group_leader() and the tracer is ->real_parent. Otherwise WSTOPPED|WEXITED could wrongly report "stopped" when the child is already dead (single-threaded or not). If it is traced by another task then the "stopped" state is fine until the debugger detaches and reveals a zombie state. 
Stupid test-case: void *tfunc(void *arg) { sleep(1); // wait for zombie leader raise(SIGSTOP); exit(0x13); return NULL; } int run_child(void) { pthread_t thread; if (!fork()) { int tracee = getppid(); assert(ptrace(PTRACE_ATTACH, tracee, 0,0) == 0); do ptrace(PTRACE_CONT, tracee, 0,0); while (wait(NULL) > 0); return 0; } sleep(1); // wait for PTRACE_ATTACH assert(pthread_create(&thread, NULL, tfunc, NULL) == 0); pthread_exit(NULL); } int main(void) { int child, stat; child = fork(); if (!child) return run_child(); assert(child == waitpid(-1, &stat, WSTOPPED)); assert(stat == 0x137f); kill(child, SIGCONT); assert(child == waitpid(-1, &stat, WCONTINUED)); assert(stat == 0xffff); assert(child == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Without this patch it hangs in waitpid(WSTOPPED), wait_task_stopped() is never called. Note: this doesn't fix all problems with a zombie delay_group_leader(), WCONTINUED | WEXITED check is not exactly right. debugger can't assume it will be notified if another thread reaps the whole thread group. Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Jan Kratochvil Cc: Lennart Poettering Cc: Michal Schmidt Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 92d38d4da4b1..6ed6a1d552b5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1380,20 +1380,16 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { - /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. - */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } - /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. 
+ */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by -- cgit From 82e0703b6ca8b549952c1e4f04746f27eaec012d Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Mon, 7 Apr 2014 15:38:50 -0700 Subject: include/linux/crash_dump.h: add vmcore_cleanup() prototype Eliminate the following warning in proc/vmcore.c: fs/proc/vmcore.c:1088:6: warning: no previous prototype for `vmcore_cleanup' [-Wmissing-prototypes] [akpm@linux-foundation.org: clean up powerpc, remove unneeded EXPORT_SYMBOL] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/fadump.h | 1 - fs/proc/vmcore.c | 1 - include/linux/crash_dump.h | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 88dbf9659185..a6774560afe3 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -210,7 +210,6 @@ extern int is_fadump_active(void); extern void crash_fadump(struct pt_regs *, const char *); extern void fadump_cleanup(void); -extern void vmcore_cleanup(void); #else /* CONFIG_FA_DUMP */ static inline int is_fadump_active(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 88d4585b30f1..ab852715916d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1118,4 +1118,3 @@ void vmcore_cleanup(void) } free_elfcorebuf(); } -EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 7032518f8542..72ab536ad3de 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -25,6 +25,7 @@ extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); +void vmcore_cleanup(void); /* Architecture code defines this if there are other possible ELF * machine types, e.g. on bi-arch capable hardware. */ -- cgit From c4082f36fa3eeb5d4fadc50241b6e3a388561f80 Mon Sep 17 00:00:00 2001 From: WANG Chao Date: Mon, 7 Apr 2014 15:38:51 -0700 Subject: vmcore: continue vmcore initialization if PT_NOTE is found empty Currently when an empty PT_NOTE is detected, vmcore initialization fails. It sounds too harsh. Because PT_NOTE could be empty, for example, one offlined a cpu but never restarted kdump service, and after crash, PT_NOTE program header is there but no data contains. It's better to warn about the empty PT_NOTE and continue to initialise vmcore. And ultimately the multiple PT_NOTE are merged into a single one, all empty PT_NOTE are discarded naturally during the merge. So empty PT_NOTE is not visible to user space and vmcore is as good as expected. 
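A small behavioural sketch of the relaxed check (illustrative name and userspace I/O only, not the fs/proc/vmcore.c function): an empty segment now just warns, because the later merge of note headers discards it anyway:

#include <stdio.h>

static int update_note_header_size(unsigned long real_sz)
{
	if (real_sz == 0)
		fprintf(stderr, "Warning: Zero PT_NOTE entries found\n");
	return 0;	/* previously: return -EINVAL, aborting vmcore init */
}

int main(void)
{
	return update_note_header_size(0);	/* initialization continues */
}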
Signed-off-by: WANG Chao Cc: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Greg Pearson Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index ab852715916d..6a8e785b29da 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) phdr_ptr->p_memsz = real_sz; if (real_sz == 0) { pr_warn("Warning: Zero PT_NOTE entries found\n"); - return -EINVAL; } } @@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) phdr_ptr->p_memsz = real_sz; if (real_sz == 0) { pr_warn("Warning: Zero PT_NOTE entries found\n"); - return -EINVAL; } } -- cgit From 90ae3ae539246984d36e43b0e23554bca941476f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 7 Apr 2014 15:38:52 -0700 Subject: idr: remove dead code Remove no longer used deprecated code, and make local functions static. Signed-off-by: Stephen Hemminger Acked-by: Jean Delvare Acked-by: Tejun Heo Cc: Jeff Layton Cc: Philipp Reisner Cc: Jens Axboe Cc: George Spelvin Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 63 ----------------------------------------------------- lib/idr.c | 20 ++--------------- 2 files changed, 2 insertions(+), 81 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index f669585c4fc5..6af3400b9b2f 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -132,69 +132,6 @@ static inline void *idr_find(struct idr *idr, int id) #define idr_for_each_entry(idp, entry, id) \ for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) -/* - * Don't use the following functions. These exist only to suppress - * deprecated warnings on EXPORT_SYMBOL()s. - */ -int __idr_pre_get(struct idr *idp, gfp_t gfp_mask); -int __idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); -void __idr_remove_all(struct idr *idp); - -/** - * idr_pre_get - reserve resources for idr allocation - * @idp: idr handle - * @gfp_mask: memory allocation flags - * - * Part of old alloc interface. This is going away. Use - * idr_preload[_end]() and idr_alloc() instead. - */ -static inline int __deprecated idr_pre_get(struct idr *idp, gfp_t gfp_mask) -{ - return __idr_pre_get(idp, gfp_mask); -} - -/** - * idr_get_new_above - allocate new idr entry above or equal to a start id - * @idp: idr handle - * @ptr: pointer you want associated with the id - * @starting_id: id to start search at - * @id: pointer to the allocated handle - * - * Part of old alloc interface. This is going away. Use - * idr_preload[_end]() and idr_alloc() instead. - */ -static inline int __deprecated idr_get_new_above(struct idr *idp, void *ptr, - int starting_id, int *id) -{ - return __idr_get_new_above(idp, ptr, starting_id, id); -} - -/** - * idr_get_new - allocate new idr entry - * @idp: idr handle - * @ptr: pointer you want associated with the id - * @id: pointer to the allocated handle - * - * Part of old alloc interface. This is going away. Use - * idr_preload[_end]() and idr_alloc() instead. - */ -static inline int __deprecated idr_get_new(struct idr *idp, void *ptr, int *id) -{ - return __idr_get_new_above(idp, ptr, 0, id); -} - -/** - * idr_remove_all - remove all ids from the given idr tree - * @idp: idr handle - * - * If you're trying to destroy @idp, calling idr_destroy() is enough. - * This is going away. Don't use. 
- */ -static inline void __deprecated idr_remove_all(struct idr *idp) -{ - __idr_remove_all(idp); -} - /* * IDA - IDR based id allocator, use when translation from id to * pointer isn't necessary. diff --git a/lib/idr.c b/lib/idr.c index 1ba4956bfbff..ba2df393c3b0 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -196,7 +196,7 @@ static void idr_mark_full(struct idr_layer **pa, int id) } } -int __idr_pre_get(struct idr *idp, gfp_t gfp_mask) +static int __idr_pre_get(struct idr *idp, gfp_t gfp_mask) { while (idp->id_free_cnt < MAX_IDR_FREE) { struct idr_layer *new; @@ -207,7 +207,6 @@ int __idr_pre_get(struct idr *idp, gfp_t gfp_mask) } return 1; } -EXPORT_SYMBOL(__idr_pre_get); /** * sub_alloc - try to allocate an id without growing the tree depth @@ -374,20 +373,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id, idr_mark_full(pa, id); } -int __idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id) -{ - struct idr_layer *pa[MAX_IDR_LEVEL + 1]; - int rv; - - rv = idr_get_empty_slot(idp, starting_id, pa, 0, idp); - if (rv < 0) - return rv == -ENOMEM ? -EAGAIN : rv; - - idr_fill_slot(idp, ptr, rv, pa); - *id = rv; - return 0; -} -EXPORT_SYMBOL(__idr_get_new_above); /** * idr_preload - preload for idr_alloc() @@ -607,7 +592,7 @@ void idr_remove(struct idr *idp, int id) } EXPORT_SYMBOL(idr_remove); -void __idr_remove_all(struct idr *idp) +static void __idr_remove_all(struct idr *idp) { int n, id, max; int bt_mask; @@ -640,7 +625,6 @@ void __idr_remove_all(struct idr *idp) } idp->layers = 0; } -EXPORT_SYMBOL(__idr_remove_all); /** * idr_destroy - release all cached layers within an idr tree -- cgit From 3f59b067c5140766591a64a3117d86978c57509b Mon Sep 17 00:00:00 2001 From: Monam Agarwal Date: Mon, 7 Apr 2014 15:38:54 -0700 Subject: lib/idr.c: use RCU_INIT_POINTER(x, NULL) Replace rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL) The rcu_assign_pointer() ensures that the initialization of a structure is carried out before storing a pointer to that structure. And in the case of the NULL pointer, there is no structure to initialize. So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL) Signed-off-by: Monam Agarwal Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index ba2df393c3b0..2642fa8e424d 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -533,7 +533,7 @@ static void sub_remove(struct idr *idp, int shift, int id) n = id & IDR_MASK; if (likely(p != NULL && test_bit(n, p->bitmap))) { __clear_bit(n, p->bitmap); - rcu_assign_pointer(p->ary[n], NULL); + RCU_INIT_POINTER(p->ary[n], NULL); to_free = NULL; while(*paa && ! --((**paa)->count)){ if (to_free) @@ -602,7 +602,7 @@ static void __idr_remove_all(struct idr *idp) n = idp->layers * IDR_BITS; p = idp->top; - rcu_assign_pointer(idp->top, NULL); + RCU_INIT_POINTER(idp->top, NULL); max = idr_max(idp->layers); id = 0; -- cgit From 40f847baf50debfd42ad66f862bfcfea069ffbe7 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Mon, 7 Apr 2014 15:38:55 -0700 Subject: drivers/rapidio/devices/tsi721_dma.c: optimize use of BDMA descriptors Combine SG entries describing single contiguous memory block into one Tsi721 BDMA descriptor. This reduces number of hardware descriptors required for large data transfers and improves performance on the PCIe side by reducing number of descriptor fetch requests. 
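A compact userspace model of the merge rule (types, names and the byte-count limit are illustrative; the driver itself works on struct scatterlist and TSI721_BDMA_MAX_BCOUNT):

/* Fold scatter-gather entries that form one contiguous block into a single
 * descriptor, as long as the combined byte count still fits. */
#include <stdint.h>
#include <stdio.h>

#define MAX_BCOUNT (1u << 24)	/* assumed per-descriptor limit, for illustration */

struct sg_ent { uint64_t addr; uint32_t len; };
struct bdma_desc { uint64_t addr; uint32_t bcount; };

static size_t build_descs(const struct sg_ent *sg, size_t n, struct bdma_desc *out)
{
	size_t nd = 0;

	for (size_t i = 0; i < n; i++) {
		if (nd && out[nd - 1].addr + out[nd - 1].bcount == sg[i].addr &&
		    out[nd - 1].bcount + sg[i].len <= MAX_BCOUNT) {
			out[nd - 1].bcount += sg[i].len;	/* merge into previous */
			continue;
		}
		out[nd].addr = sg[i].addr;
		out[nd].bcount = sg[i].len;
		nd++;
	}
	return nd;
}

int main(void)
{
	struct sg_ent sg[] = { { 0x1000, 0x1000 }, { 0x2000, 0x1000 }, { 0x8000, 0x100 } };
	struct bdma_desc d[3];

	printf("%zu descriptors\n", build_descs(sg, 3, d));	/* two: entries 0 and 1 merge */
	return 0;
}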
Signed-off-by: Alexandre Bounine Cc: Matt Porter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.h | 4 ++ drivers/rapidio/devices/tsi721_dma.c | 111 ++++++++++++++++++++++++----------- 2 files changed, 82 insertions(+), 33 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 7061ac0ad428..0305675270ee 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -644,6 +644,9 @@ enum tsi721_smsg_int_flag { #ifdef CONFIG_RAPIDIO_DMA_ENGINE +#define TSI721_BDMA_BD_RING_SZ 128 +#define TSI721_BDMA_MAX_BCOUNT (TSI721_DMAD_BCOUNT1 + 1) + struct tsi721_tx_desc { struct dma_async_tx_descriptor txd; struct tsi721_dma_desc *hw_desc; @@ -652,6 +655,7 @@ struct tsi721_tx_desc { u64 rio_addr; /* upper 2-bits of 66-bit RIO address */ u8 rio_addr_u; + u32 bcount; bool interrupt; struct list_head desc_node; struct list_head tx_list; diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 91245f5dbe81..9b60b1f3261c 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -304,35 +304,17 @@ struct tsi721_tx_desc *tsi721_desc_get(struct tsi721_bdma_chan *bdma_chan) } static int -tsi721_fill_desc(struct tsi721_bdma_chan *bdma_chan, - struct tsi721_tx_desc *desc, struct scatterlist *sg, +tsi721_desc_fill_init(struct tsi721_tx_desc *desc, struct scatterlist *sg, enum dma_rtype rtype, u32 sys_size) { struct tsi721_dma_desc *bd_ptr = desc->hw_desc; u64 rio_addr; - if (sg_dma_len(sg) > TSI721_DMAD_BCOUNT1 + 1) { - dev_err(bdma_chan->dchan.device->dev, - "SG element is too large\n"); - return -EINVAL; - } - - dev_dbg(bdma_chan->dchan.device->dev, - "desc: 0x%llx, addr: 0x%llx len: 0x%x\n", - (u64)desc->txd.phys, (unsigned long long)sg_dma_address(sg), - sg_dma_len(sg)); - - dev_dbg(bdma_chan->dchan.device->dev, - "bd_ptr = %p did=%d raddr=0x%llx\n", - bd_ptr, desc->destid, desc->rio_addr); - /* Initialize DMA descriptor */ bd_ptr->type_id = cpu_to_le32((DTYPE1 << 29) | (rtype << 19) | desc->destid); - if (desc->interrupt) - bd_ptr->type_id |= cpu_to_le32(TSI721_DMAD_IOF); bd_ptr->bcount = cpu_to_le32(((desc->rio_addr & 0x3) << 30) | - (sys_size << 26) | sg_dma_len(sg)); + (sys_size << 26)); rio_addr = (desc->rio_addr >> 2) | ((u64)(desc->rio_addr_u & 0x3) << 62); bd_ptr->raddr_lo = cpu_to_le32(rio_addr & 0xffffffff); @@ -346,6 +328,20 @@ tsi721_fill_desc(struct tsi721_bdma_chan *bdma_chan, return 0; } +static int +tsi721_desc_fill_end(struct tsi721_tx_desc *desc) +{ + struct tsi721_dma_desc *bd_ptr = desc->hw_desc; + + /* Update DMA descriptor */ + if (desc->interrupt) + bd_ptr->type_id |= cpu_to_le32(TSI721_DMAD_IOF); + bd_ptr->bcount |= cpu_to_le32(desc->bcount & TSI721_DMAD_BCOUNT1); + + return 0; +} + + static void tsi721_dma_chain_complete(struct tsi721_bdma_chan *bdma_chan, struct tsi721_tx_desc *desc) { @@ -674,6 +670,7 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, unsigned int i; u32 sys_size = dma_to_mport(dchan->device)->sys_size; enum dma_rtype rtype; + dma_addr_t next_addr = -1; if (!sgl || !sg_len) { dev_err(dchan->device->dev, "%s: No SG list\n", __func__); @@ -704,36 +701,84 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, for_each_sg(sgl, sg, sg_len, i) { int err; - dev_dbg(dchan->device->dev, "%s: sg #%d\n", __func__, i); + if (sg_dma_len(sg) > TSI721_BDMA_MAX_BCOUNT) { + dev_err(dchan->device->dev, + "%s: SG entry %d is too large\n", __func__, 
i); + goto err_desc_put; + } + + /* + * If this sg entry forms contiguous block with previous one, + * try to merge it into existing DMA descriptor + */ + if (desc) { + if (next_addr == sg_dma_address(sg) && + desc->bcount + sg_dma_len(sg) <= + TSI721_BDMA_MAX_BCOUNT) { + /* Adjust byte count of the descriptor */ + desc->bcount += sg_dma_len(sg); + goto entry_done; + } + + /* + * Finalize this descriptor using total + * byte count value. + */ + tsi721_desc_fill_end(desc); + dev_dbg(dchan->device->dev, "%s: desc final len: %d\n", + __func__, desc->bcount); + } + + /* + * Obtain and initialize a new descriptor + */ desc = tsi721_desc_get(bdma_chan); if (!desc) { dev_err(dchan->device->dev, - "Not enough descriptors available\n"); - goto err_desc_get; + "%s: Failed to get new descriptor for SG %d\n", + __func__, i); + goto err_desc_put; } - if (sg_is_last(sg)) - desc->interrupt = (flags & DMA_PREP_INTERRUPT) != 0; - else - desc->interrupt = false; - desc->destid = rext->destid; desc->rio_addr = rio_addr; desc->rio_addr_u = 0; + desc->bcount = sg_dma_len(sg); + + dev_dbg(dchan->device->dev, + "sg%d desc: 0x%llx, addr: 0x%llx len: %d\n", + i, (u64)desc->txd.phys, + (unsigned long long)sg_dma_address(sg), + sg_dma_len(sg)); + + dev_dbg(dchan->device->dev, + "bd_ptr = %p did=%d raddr=0x%llx\n", + desc->hw_desc, desc->destid, desc->rio_addr); - err = tsi721_fill_desc(bdma_chan, desc, sg, rtype, sys_size); + err = tsi721_desc_fill_init(desc, sg, rtype, sys_size); if (err) { dev_err(dchan->device->dev, "Failed to build desc: %d\n", err); - goto err_desc_get; + goto err_desc_put; } - rio_addr += sg_dma_len(sg); + next_addr = sg_dma_address(sg); if (!first) first = desc; else list_add_tail(&desc->desc_node, &first->tx_list); + +entry_done: + if (sg_is_last(sg)) { + desc->interrupt = (flags & DMA_PREP_INTERRUPT) != 0; + tsi721_desc_fill_end(desc); + dev_dbg(dchan->device->dev, "%s: desc final len: %d\n", + __func__, desc->bcount); + } else { + rio_addr += sg_dma_len(sg); + next_addr += sg_dma_len(sg); + } } first->txd.cookie = -EBUSY; @@ -741,7 +786,7 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, return &first->txd; -err_desc_get: +err_desc_put: tsi721_desc_put(bdma_chan, first); return NULL; } @@ -792,7 +837,7 @@ int tsi721_register_dma(struct tsi721_device *priv) if (i == TSI721_DMACH_MAINT) continue; - bdma_chan->bd_num = 64; + bdma_chan->bd_num = TSI721_BDMA_BD_RING_SZ; bdma_chan->regs = priv->regs + TSI721_DMAC_BASE(i); bdma_chan->dchan.device = &mport->dma; -- cgit From 2aaf308b95b24649a6dcfed89cd956e972089b2a Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Mon, 7 Apr 2014 15:38:56 -0700 Subject: rapidio: rework device hierarchy and introduce mport class of devices This patch removes an artificial RapidIO bus root device and establishes actual device hierarchy by providing reference to real parent devices. It also introduces device class for RapidIO controller devices (on-chip or an eternal bridge, known as "mport"). Existing implementation was sufficient for SoC-based platforms that have a single RapidIO controller. With introduction of devices using multiple RapidIO controllers and PCIe-to-RapidIO bridges the old scheme is very limiting or does not work at all. The implemented changes allow to properly reference platform's local RapidIO mport devices and provide device details needed for upper layers. This change to RapidIO device hierarchy does not break any known existing kernel or user space interfaces. 
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Li Yang Cc: Kumar Gala Cc: Andre van Herk Cc: Stef van Os Cc: Jerry Jacobs Cc: Arno Tiemersma Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/sysfs.txt | 66 +++++++++++++++++++++++++++++++++++----- arch/powerpc/sysdev/fsl_rio.c | 1 + drivers/net/rionet.c | 1 + drivers/rapidio/devices/tsi721.c | 1 + drivers/rapidio/rio-driver.c | 22 +++++++++----- drivers/rapidio/rio-scan.c | 1 + drivers/rapidio/rio-sysfs.c | 40 ++++++++++++++++++++++++ drivers/rapidio/rio.c | 11 +++++++ drivers/rapidio/rio.h | 1 + include/linux/rio.h | 5 ++- 10 files changed, 133 insertions(+), 16 deletions(-) diff --git a/Documentation/rapidio/sysfs.txt b/Documentation/rapidio/sysfs.txt index 271438c0617f..47ce9a5336e1 100644 --- a/Documentation/rapidio/sysfs.txt +++ b/Documentation/rapidio/sysfs.txt @@ -2,8 +2,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1. Device Subdirectories ------------------------- +1. RapidIO Device Subdirectories +-------------------------------- For each RapidIO device, the RapidIO subsystem creates files in an individual subdirectory with the following name, /sys/bus/rapidio/devices/. @@ -25,8 +25,8 @@ seen by the enumerating host (destID = 1): NOTE: An enumerating or discovering endpoint does not create a sysfs entry for itself, this is why an endpoint with destID=1 is not shown in the list. -2. Attributes Common for All Devices ------------------------------------- +2. Attributes Common for All RapidIO Devices +-------------------------------------------- Each device subdirectory contains the following informational read-only files: @@ -52,16 +52,16 @@ This attribute is similar in behavior to the "config" attribute of PCI devices and provides an access to the RapidIO device registers using standard file read and write operations. -3. Endpoint Device Attributes ------------------------------ +3. RapidIO Endpoint Device Attributes +------------------------------------- Currently Linux RapidIO subsystem does not create any endpoint specific sysfs attributes. It is possible that RapidIO master port drivers and endpoint device drivers will add their device-specific sysfs attributes but such attributes are outside the scope of this document. -4. Switch Device Attributes ---------------------------- +4. RapidIO Switch Device Attributes +----------------------------------- RapidIO switches have additional attributes in sysfs. RapidIO subsystem supports common and device-specific sysfs attributes for switches. Because switches are @@ -106,3 +106,53 @@ attribute: for that controller always will be 0. To initiate RapidIO enumeration/discovery on all available mports a user must write '-1' (or RIO_MPORT_ANY) into this attribute file. + + +6. RapidIO Bus Controllers/Ports +-------------------------------- + +On-chip RapidIO controllers and PCIe-to-RapidIO bridges (referenced as +"Master Port" or "mport") are presented in sysfs as the special class of +devices: "rapidio_port". + +The /sys/class/rapidio_port subdirectory contains individual subdirectories +named as "rapidioN" where N = mport ID registered with RapidIO subsystem. + +NOTE: An mport ID is not a RapidIO destination ID assigned to a given local +mport device. + +Each mport device subdirectory in addition to standard entries contains the +following device-specific attributes: + + port_destid - reports RapidIO destination ID assigned to the given RapidIO + mport device. 
If value 0xFFFFFFFF is returned this means that + no valid destination ID have been assigned to the mport (yet). + Normally, before enumeration/discovery have been executed only + fabric enumerating mports have a valid destination ID assigned + to them using "hdid=..." rapidio module parameter. + sys_size - reports RapidIO common transport system size: + 0 = small (8-bit destination ID, max. 256 devices), + 1 = large (16-bit destination ID, max. 65536 devices). + +After enumeration or discovery was performed for a given mport device, +the corresponding subdirectory will also contain subdirectories for each +child RapidIO device connected to the mport. Naming conventions for RapidIO +devices are described in Section 1 above. + +The example below shows mport device subdirectory with several child RapidIO +devices attached to it. + +[rio@rapidio ~]$ ls /sys/class/rapidio_port/rapidio0/ -l +total 0 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:e:0001 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:e:0004 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:e:0007 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:s:0002 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:s:0003 +drwxr-xr-x 3 root root 0 Feb 11 15:10 00:s:0005 +lrwxrwxrwx 1 root root 0 Feb 11 15:11 device -> ../../../0000:01:00.0 +-r--r--r-- 1 root root 4096 Feb 11 15:11 port_destid +drwxr-xr-x 2 root root 0 Feb 11 15:11 power +lrwxrwxrwx 1 root root 0 Feb 11 15:04 subsystem -> ../../../../../../class/rapidio_port +-r--r--r-- 1 root root 4096 Feb 11 15:11 sys_size +-rw-r--r-- 1 root root 4096 Feb 11 15:04 uevent diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 95dd892e9904..cf2b0840a672 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -531,6 +531,7 @@ int fsl_rio_setup(struct platform_device *dev) sprintf(port->name, "RIO mport %d", i); priv->dev = &dev->dev; + port->dev.parent = &dev->dev; port->ops = ops; port->priv = priv; port->phys_efptr = 0x100; diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 6d1f6ed3113f..a8497183ff8b 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -493,6 +493,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) ndev->netdev_ops = &rionet_netdev_ops; ndev->mtu = RIO_MAX_MSG_SIZE - 14; ndev->features = NETIF_F_LLTX; + SET_NETDEV_DEV(ndev, &mport->dev); SET_ETHTOOL_OPS(ndev, &rionet_ethtool_ops); spin_lock_init(&rnet->lock); diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index ff7cbf2d28e3..1753dc693c15 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2256,6 +2256,7 @@ static int tsi721_setup_mport(struct tsi721_device *priv) mport->phy_type = RIO_PHY_SERIAL; mport->priv = (void *)priv; mport->phys_efptr = 0x100; + mport->dev.parent = &pdev->dev; priv->mport = mport; INIT_LIST_HEAD(&mport->dbells); diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index c9ae692d3451..f301f059bb85 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -167,7 +167,6 @@ void rio_unregister_driver(struct rio_driver *rdrv) void rio_attach_device(struct rio_dev *rdev) { rdev->dev.bus = &rio_bus_type; - rdev->dev.parent = &rio_bus; } EXPORT_SYMBOL_GPL(rio_attach_device); @@ -216,9 +215,12 @@ static int rio_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -struct device rio_bus = { - .init_name = "rapidio", +struct class rio_mport_class = { + .name = "rapidio_port", + .owner = THIS_MODULE, + .dev_groups = 
rio_mport_groups, }; +EXPORT_SYMBOL_GPL(rio_mport_class); struct bus_type rio_bus_type = { .name = "rapidio", @@ -233,14 +235,20 @@ struct bus_type rio_bus_type = { /** * rio_bus_init - Register the RapidIO bus with the device model * - * Registers the RIO bus device and RIO bus type with the Linux + * Registers the RIO mport device class and RIO bus type with the Linux * device model. */ static int __init rio_bus_init(void) { - if (device_register(&rio_bus) < 0) - printk("RIO: failed to register RIO bus device\n"); - return bus_register(&rio_bus_type); + int ret; + + ret = class_register(&rio_mport_class); + if (!ret) { + ret = bus_register(&rio_bus_type); + if (ret) + class_unregister(&rio_mport_class); + } + return ret; } postcore_initcall(rio_bus_init); diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index d3a6539a77cc..47a1b2ea76c4 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -461,6 +461,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->comp_tag & RIO_CTAG_UDEVID); } + rdev->dev.parent = &port->dev; rio_attach_device(rdev); device_initialize(&rdev->dev); diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index e0221c6d0cc2..cdb005c0094d 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -341,3 +341,43 @@ const struct attribute_group *rio_bus_groups[] = { &rio_bus_group, NULL, }; + +static ssize_t +port_destid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rio_mport *mport = to_rio_mport(dev); + + if (mport) + return sprintf(buf, "0x%04x\n", mport->host_deviceid); + else + return -ENODEV; +} +static DEVICE_ATTR_RO(port_destid); + +static ssize_t sys_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct rio_mport *mport = to_rio_mport(dev); + + if (mport) + return sprintf(buf, "%u\n", mport->sys_size); + else + return -ENODEV; +} +static DEVICE_ATTR_RO(sys_size); + +static struct attribute *rio_mport_attrs[] = { + &dev_attr_port_destid.attr, + &dev_attr_sys_size.attr, + NULL, +}; + +static const struct attribute_group rio_mport_group = { + .attrs = rio_mport_attrs, +}; + +const struct attribute_group *rio_mport_groups[] = { + &rio_mport_group, + NULL, +}; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 2e8a20cac588..a54ba0494dd3 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -1884,6 +1884,7 @@ static int rio_get_hdid(int index) int rio_register_mport(struct rio_mport *port) { struct rio_scan_node *scan = NULL; + int res = 0; if (next_portid >= RIO_MAX_MPORTS) { pr_err("RIO: reached specified max number of mports\n"); @@ -1894,6 +1895,16 @@ int rio_register_mport(struct rio_mport *port) port->host_deviceid = rio_get_hdid(port->id); port->nscan = NULL; + dev_set_name(&port->dev, "rapidio%d", port->id); + port->dev.class = &rio_mport_class; + + res = device_register(&port->dev); + if (res) + dev_err(&port->dev, "RIO: mport%d registration failed ERR=%d\n", + port->id, res); + else + dev_dbg(&port->dev, "RIO: mport%d registered\n", port->id); + mutex_lock(&rio_mport_list_lock); list_add_tail(&port->node, &rio_mports); diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index 5f99d22ad0b0..2d0550e08ea2 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -50,6 +50,7 @@ extern int rio_mport_scan(int mport_id); /* Structures internal to the RIO core code */ extern const struct attribute_group *rio_dev_groups[]; extern const struct attribute_group 
*rio_bus_groups[]; +extern const struct attribute_group *rio_mport_groups[]; #define RIO_GET_DID(size, x) (size ? (x & 0xffff) : ((x & 0x00ff0000) >> 16)) #define RIO_SET_DID(size, x) (size ? (x & 0xffff) : ((x & 0x000000ff) << 16)) diff --git a/include/linux/rio.h b/include/linux/rio.h index b71d5738e683..6bda06f21930 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -83,7 +83,7 @@ #define RIO_CTAG_UDEVID 0x0001ffff /* Unique device identifier */ extern struct bus_type rio_bus_type; -extern struct device rio_bus; +extern struct class rio_mport_class; struct rio_mport; struct rio_dev; @@ -201,6 +201,7 @@ struct rio_dev { #define rio_dev_f(n) list_entry(n, struct rio_dev, net_list) #define to_rio_dev(n) container_of(n, struct rio_dev, dev) #define sw_to_rio_dev(n) container_of(n, struct rio_dev, rswitch[0]) +#define to_rio_mport(n) container_of(n, struct rio_mport, dev) /** * struct rio_msg - RIO message event @@ -248,6 +249,7 @@ enum rio_phy_type { * @phy_type: RapidIO phy type * @phys_efptr: RIO port extended features pointer * @name: Port name string + * @dev: device structure associated with an mport * @priv: Master port private data * @dma: DMA device associated with mport * @nscan: RapidIO network enumeration/discovery operations @@ -272,6 +274,7 @@ struct rio_mport { enum rio_phy_type phy_type; /* RapidIO phy type */ u32 phys_efptr; unsigned char name[RIO_MAX_MPORT_NAME]; + struct device dev; void *priv; /* Master port private data */ #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_device dma; -- cgit From 80df28476505ed4e6701c3448c63c9229a50c655 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Mon, 7 Apr 2014 15:38:57 -0700 Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec" As sysctl_hung_task_timeout_sec is unsigned long, when this value is larger then LONG_MAX/HZ, the function schedule_timeout_interruptible in watchdog will return immediately without sleep and with print : schedule_timeout: wrong timeout value ffffffffffffff83 and then the funtion watchdog will call schedule_timeout_interruptible again and again. The screen will be filled with "schedule_timeout: wrong timeout value ffffffffffffff83" This patch does some check and correction in sysctl, to let the function schedule_timeout_interruptible allways get the valid parameter. Signed-off-by: Liu Hua Tested-by: Satoru Takeuchi Cc: [3.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + kernel/sysctl.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 271a09db6629..9886c3d57fc2 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -317,6 +317,7 @@ for more than this value report a warning. This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. 0: means infinite timeout - no checking done. +Possible values to set are in range {0..LONG_MAX/HZ}. 
============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c14b547882e..74f5b580fe34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include #endif @@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", -- cgit From 894122db49135090043df90bc105a82e16a68373 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:38:58 -0700 Subject: fs/adfs/super.c: add __init to init_inodecache() init_inodecache is only called by __init init_adfs_fs. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/adfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 952aeb048349..9852bdf34d76 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), -- cgit From adbd319e5ae90b346e6dbb692862318faa7a7808 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:38:59 -0700 Subject: affs: add __init to init_inodecache () init_inodecache is only called by __init init_affs_fs Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/affs/super.c b/fs/affs/super.c index 307453086c3f..4fad16adbe7b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -128,7 +128,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), -- cgit From d40c4d46eaa2295b5d1ee08f594b023245db87a4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:00 -0700 Subject: fs/affs/dir.c: unlock/brelse dir on failure + code clean-up Commit 0edf977d2ae3 ("[readdir] convert affs") returns directly -EIO without unlocking dir inode and releasing dir bh when second affs_bread sequence fails. This patch restores initial behaviour. It also fixes pr_debug and affs_error to fit in 80 columns + removes reference to filldir (replaced by dir_emit in the commit above). 
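A generic, self-contained sketch of the unwind pattern the fix restores (the stub helpers stand in for affs_lock_dir/affs_bread/affs_brelse; this is not the affs code itself): every exit path taken after a resource is acquired must release it again, in reverse order:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static void lock_dir(void)     { puts("lock dir"); }
static void unlock_dir(void)   { puts("unlock dir"); }
static bool bread_dir(void)    { puts("read dir block"); return true; }
static void brelse_dir(void)   { puts("release dir block"); }
static bool bread_header(void) { return false; }	/* simulate the failing read */

static int do_readdir(void)
{
	int error = 0;

	lock_dir();
	if (!bread_dir())
		goto out_unlock_dir;

	if (!bread_header()) {
		error = -EIO;		/* the old code returned -EIO right here, */
		goto out_brelse_dir;	/* leaking both the lock and the buffer   */
	}

	/* ... emit directory entries ... */

out_brelse_dir:
	brelse_dir();
out_unlock_dir:
	unlock_dir();
	return error;
}

int main(void)
{
	printf("do_readdir() = %d\n", do_readdir());
	return 0;
}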
Signed-off-by: Fabian Frederick Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/dir.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1eba8c3644e..cbbda476a805 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx) int hash_pos; int chain_pos; u32 ino; + int error = 0; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n", + inode->i_ino, (unsigned long)ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) } dir_bh = affs_bread(sb, inode->i_ino); if (!dir_bh) - goto readdir_out; + goto out_unlock_dir; /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. @@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", i); - return -EIO; + error = -EIO; + goto out_brelse_dir; } ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); @@ -107,29 +110,34 @@ inside: do { fh_bh = affs_bread(sb, ino); if (!fh_bh) { - affs_error(sb, "readdir","Cannot read block %d", ino); + affs_error(sb, "readdir", + "Cannot read block %d", ino); break; } namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", + pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", " + "ino=%u), hash=%d, f_pos=%x\n", namelen, name, ino, hash_pos, (u32)ctx->pos); + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) - goto readdir_done; + goto done; ctx->pos++; ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); fh_bh = NULL; } while (ino); } -readdir_done: +done: file->f_version = inode->i_version; file->private_data = (void *)(long)ino; + affs_brelse(fh_bh); -readdir_out: +out_brelse_dir: affs_brelse(dir_bh); - affs_brelse(fh_bh); + +out_unlock_dir: affs_unlock_dir(inode); - return 0; + return error; } -- cgit From 8ca577223f75230a746a06f4566c53943f78d5d0 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:01 -0700 Subject: affs: add mount option to avoid filename truncates Normal behavior for filenames exceeding specific filesystem limits is to refuse operation. AFFS standard name length being only 30 characters against 255 for usual Linux filesystems, original implementation does filename truncate by default with a define value AFFS_NO_TRUNCATE which can be enabled but needs module compilation. This patch adds 'nofilenametruncate' mount option so that user can easily activate that feature and avoid a lot of problems (eg overwrite files ...) 
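A userspace model of the new behaviour (constants and names are illustrative, not the affs_check_name() code): with the option set, over-long names fail with ENAMETOOLONG instead of being silently cut to AFFS's 30-character limit:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define AFFS_MAX_NAMELEN 30

static int check_name_len(size_t len, bool notruncate, size_t *effective)
{
	if (len > AFFS_MAX_NAMELEN) {
		if (notruncate)
			return -ENAMETOOLONG;	/* new 'nofilenametruncate' path */
		len = AFFS_MAX_NAMELEN;		/* historic silent truncation */
	}
	*effective = len;
	return 0;
}

int main(void)
{
	const char *name = "a_filename_much_longer_than_the_thirty_character_limit";
	size_t eff;
	int ret;

	ret = check_name_len(strlen(name), false, &eff);
	printf("default mount:      ret=%d, stored length=%zu\n", ret, eff);

	ret = check_name_len(strlen(name), true, &eff);
	printf("nofilenametruncate: ret=%d (-ENAMETOOLONG)\n", ret);
	return 0;
}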
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/affs.txt | 9 ++++++--- fs/affs/affs.h | 20 ++++++++------------ fs/affs/amigaffs.c | 23 +++++++++++++++-------- fs/affs/namei.c | 32 +++++++++++++++++++++++--------- fs/affs/super.c | 6 +++++- 5 files changed, 57 insertions(+), 33 deletions(-) diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt index 81ac488e3758..71b63c2b9841 100644 --- a/Documentation/filesystems/affs.txt +++ b/Documentation/filesystems/affs.txt @@ -49,6 +49,10 @@ mode=mode Sets the mode flags to the given (octal) value, regardless This is useful since most of the plain AmigaOS files will map to 600. +nofilenametruncate + The file system will return an error when filename exceeds + standard maximum filename length (30 characters). + reserved=num Sets the number of reserved blocks at the start of the partition to num. You should never need this option. Default is 2. @@ -181,9 +185,8 @@ tested, though several hundred MB have been read and written using this fs. For a most up-to-date list of bugs please consult fs/affs/Changes. -Filenames are truncated to 30 characters without warning (this -can be changed by setting the compile-time option AFFS_NO_TRUNCATE -in include/linux/amigaffs.h). +By default, filenames are truncated to 30 characters without warning. +'nofilenametruncate' mount option can change that behavior. Case is ignored by the affs in filename matching, but Linux shells do care about the case. Example (with /wb being an affs mounted fs): diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 3952121f2f28..25b23b1e7f22 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -5,14 +5,6 @@ #include #include -/* AmigaOS allows file names with up to 30 characters length. - * Names longer than that will be silently truncated. If you - * want to disallow this, comment out the following #define. - * Creating filesystem objects with longer names will then - * result in an error (ENAMETOOLONG). - */ -/*#define AFFS_NO_TRUNCATE */ - /* Ugly macros make the code more pretty. 
*/ #define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) @@ -28,7 +20,6 @@ #define AFFS_CACHE_SIZE PAGE_SIZE -#define AFFS_MAX_PREALLOC 32 #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) @@ -118,6 +109,7 @@ struct affs_sb_info { #define SF_OFS 0x0200 /* Old filesystem */ #define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) @@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); -extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); -extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); -extern int affs_check_name(const unsigned char *name, int len); +extern void affs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, + const char *fmt, ...); +extern bool affs_nofilenametruncate(const struct dentry *dentry); +extern int affs_check_name(const unsigned char *name, int len, + bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. c */ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d9a43674cb94..533a322c41c0 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) function,ErrorBuffer); } +bool +affs_nofilenametruncate(const struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE; + +} + /* Check if the name is valid for a affs object. */ int -affs_check_name(const unsigned char *name, int len) +affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) -#ifdef AFFS_NO_TRUNCATE - return -ENAMETOOLONG; -#else - len = 30; -#endif - + if (len > 30) { + if (notruncate) + return -ENAMETOOLONG; + else + len = 30; + } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' || (name[i] > 0x7e && name[i] < 0xa0)) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index c36cbb4537a2..6dae1ccd176d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. 
*/ static inline int -__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name, qstr->len); + i = affs_check_name(qstr->name, qstr->len, notruncate); if (i) return i; @@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) static int affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper, + affs_nofilenametruncate(dentry)); + } + static int affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper, + affs_nofilenametruncate(dentry)); + } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper) + const char *str, const struct qstr *name, toupper_t toupper, + bool notruncate) { const u8 *aname = str; const u8 *bname = name->name; @@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len, * must be valid. 'name' must be validated first. */ - if (affs_check_name(name->name, name->len)) + if (affs_check_name(name->name, name->len, notruncate)) return 1; /* @@ -126,13 +132,18 @@ static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_toupper); + + return __affs_compare_dentry(len, str, name, affs_toupper, + affs_nofilenametruncate(parent)); } + static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper, + affs_nofilenametruncate(parent)); + } /* @@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); - retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); + retval = affs_check_name(new_dentry->d_name.name, + new_dentry->d_name.len, + affs_nofilenametruncate(old_dentry)); + if (retval) return retval; diff --git a/fs/affs/super.c b/fs/affs/super.c index 4fad16adbe7b..6d589f28bf9b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -163,7 +163,7 @@ static const struct super_operations affs_sops = { }; enum { - Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; @@ -172,6 +172,7 @@ static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, + {Opt_notruncate, "nofilenametruncate"}, {Opt_prefix, "prefix=%s"}, {Opt_protect, "protect"}, {Opt_reserved, "reserved=%u"}, @@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, case Opt_mufs: *mount_opts |= SF_MUFS; break; + case Opt_notruncate: + *mount_opts |= SF_NO_TRUNCATE; + break; case Opt_prefix: *prefix = match_strdup(&args[0]); if (!*prefix) -- cgit From 758b4440753969420290eeff69eeedc5a8521149 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: 
Mon, 7 Apr 2014 15:39:02 -0700 Subject: fs/bfs/inode.c: add __init to init_inodecache() init_inodecache is only called by __init init_bfs_fs Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 29aa5cf6639b..7041ac35ace8 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&bi->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), -- cgit From d7c0847fe3682a026ee6d147c5b6b8ab457fffc8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:03 -0700 Subject: kernel/panic.c: display reason at end + pr_emerg Currently, booting without initrd specified on 80x25 screen gives a call trace followed by atkbd : Spurious ACK. Original message ("VFS: Unable to mount root fs") is not available. Of course this could happen in other situations... This patch displays panic reason after call trace which could help lot of people even if it's not the very last line on screen. Also, convert all panic.c printk(KERN_EMERG to pr_emerg( [akpm@linux-foundation.org: missed a couple of pr_ conversions] Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 79fd820bb5e8..d02fa9fef46a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing @@ -141,7 +141,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @@ -165,7 +165,7 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) @@ -176,6 +176,7 @@ void panic(const char *fmt, ...) 
disabled_wait(caller); } #endif + pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -276,8 +277,7 @@ unsigned long get_taint(void) void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) - printk(KERN_WARNING - "Disabling lock debugging due to kernel taint\n"); + pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -382,8 +382,7 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } /* -- cgit From b6a83d928c652c092acd1ce79c19d44a1da9a6db Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 7 Apr 2014 15:39:03 -0700 Subject: drivers/misc/sgi-gru/grukdump.c: cleanup gru_dump_context() a little "ret" is zero here so we can remove the "!ret" part of the condition. "uhdr" is alread a __user pointer so we can remove the cast. Signed-off-by: Dan Carpenter Acked-by: Dimitri Sivanich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-gru/grukdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/misc/sgi-gru/grukdump.c b/drivers/misc/sgi-gru/grukdump.c index 2bef3f76032a..a3700a56b8ff 100644 --- a/drivers/misc/sgi-gru/grukdump.c +++ b/drivers/misc/sgi-gru/grukdump.c @@ -178,10 +178,10 @@ static int gru_dump_context(struct gru_state *gru, int ctxnum, hdr.cbrcnt = cbrcnt; hdr.dsrcnt = dsrcnt; hdr.cch_locked = cch_locked; - if (!ret && copy_to_user((void __user *)uhdr, &hdr, sizeof(hdr))) - ret = -EFAULT; + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) + return -EFAULT; - return ret ? ret : bytes; + return bytes; } int gru_dump_chiplet_request(unsigned long arg) -- cgit From ae797bdf7f12a2e0216161eb82a568809f6e5c94 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:04 -0700 Subject: cris: make ETRAX_ARCH_V10 select TTY for use in debugport Fix breakage which will be exposed by the patch "kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT". arch/cris/arch-v10/kernel/debugport.c, compiled in unconditionally with ETRAX_ARCH_V10, requires TTY, so select TTY to avoid a build failure. Signed-off-by: Josh Triplett Cc: Stephen Rothwell Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index ed0fcdf7e990..7cb90a54b598 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -138,6 +138,7 @@ config ETRAX_ARCH_V10 bool default y if ETRAX100LX || ETRAX100LX_V2 default n if !(ETRAX100LX || ETRAX100LX_V2) + select TTY config ETRAX_ARCH_V32 bool -- cgit From c638b1074837c601808fee93f33b9c19224ea4cf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 7 Apr 2014 15:39:06 -0700 Subject: cris: cpuinfo_op should depend on CONFIG_PROC_FS Fix breakage which will be exposed by the patch "kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT". 
Now allnoconfig started disabling CONFIG_PROC_FS: arch/cris/kernel/built-in.o:(.rodata+0xc): undefined reference to `show_cpuinfo' make: *** [vmlinux] Error 1 Signed-off-by: Geert Uytterhoeven Cc: Stephen Rothwell Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index 32c3d248868e..905b70ea9939 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -165,6 +165,7 @@ void __init setup_arch(char **cmdline_p) strcpy(init_utsname()->machine, cris_machine_name); } +#ifdef CONFIG_PROC_FS static void *c_start(struct seq_file *m, loff_t *pos) { return *pos < nr_cpu_ids ? (void *)(int)(*pos + 1) : NULL; @@ -188,6 +189,7 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; +#endif /* CONFIG_PROC_FS */ static int __init topology_init(void) { -- cgit From 6035d9db386bf30151ed16a94c9223bbe52562ee Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:07 -0700 Subject: ia64: select CONFIG_TTY for use of tty_write_message in unaligned Fix breakage which will be exposed by the patch "kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT". arch/ia64/kernel/unaligned.c uses tty_write_message to print an unaligned access exception to the TTY of the current user process. Enable TTY to prevent a build error. Minimal fix, on the basis that few people on ia64 will care deeply about kernel size enough to turn off TTY. Ideally, I'd instead suggest dropping the tty_write_message entirely, and just leaving the printk. Bonus: no need to sprintf first. Signed-off-by: Josh Triplett Cc: Stephen Rothwell Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 0c8e553e0b9f..1325c3bc58e1 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -21,6 +21,7 @@ config IA64 select HAVE_FUNCTION_TRACER select HAVE_DMA_ATTRS select HAVE_KVM + select TTY select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG select HAVE_MEMBLOCK -- cgit From 527518f1a9ecc1eb17615e99e7b1233cf9322a49 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:08 -0700 Subject: ppc: make PPC_BOOK3S_64 select IRQ_WORK Fix breakage which will be exposed by the patch "kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT". arch/powerpc/kernel/mce.c, compiled in for PPC_BOOK3S_64, calls functions only built when IRQ_WORK, so select it. 
Fixes the following build error: arch/powerpc/kernel/built-in.o: In function `.machine_check_queue_event': (.text+0x11260): undefined reference to `.irq_work_queue' Signed-off-by: Josh Triplett Reported-by: Stephen Rothwell Acked-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/Kconfig.cputype | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 434fda39bf8b..d9e2b19b7c8d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -73,6 +73,7 @@ config PPC_BOOK3S_64 select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES select ARCH_SUPPORTS_NUMA_BALANCING + select IRQ_WORK config PPC_BOOK3E_64 bool "Embedded processors" -- cgit From 5d2acfc7b974bbd3858b4dd3f2cdc6362dd8843a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:09 -0700 Subject: kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT "make allnoconfig" exists to ease testing of minimal configurations. Documentation/SubmitChecklist includes a note to test with allnoconfig. This helps catch missing dependencies on common-but-not-required functionality, which might otherwise go unnoticed. However, allnoconfig still leaves many symbols enabled, because they're hidden behind CONFIG_EMBEDDED or CONFIG_EXPERT. For instance, allnoconfig still has CONFIG_PRINTK and CONFIG_BLOCK enabled, so drivers don't typically get build-tested with those disabled. To address this, introduce a new Kconfig option "allnoconfig_y", used on symbols which only exist to hide other symbols. Set it on CONFIG_EMBEDDED (which then selects CONFIG_EXPERT). allnoconfig will then disable all the symbols hidden behind those. Signed-off-by: Josh Triplett Tested-by: Paul E. McKenney Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kbuild/kconfig-language.txt | 4 ++++ init/Kconfig | 1 + scripts/kconfig/confdata.c | 5 ++++- scripts/kconfig/expr.h | 3 +++ scripts/kconfig/lkc.h | 1 + scripts/kconfig/menu.c | 3 +++ scripts/kconfig/zconf.gperf | 1 + scripts/kconfig/zconf.hash.c_shipped | 13 ++++++++----- 8 files changed, 25 insertions(+), 6 deletions(-) diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index c420676c6fe3..350f733bf2c7 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -157,6 +157,10 @@ applicable everywhere (see syntax). to the build environment (if this is desired, it can be done via another symbol). + - "allnoconfig_y" + This declares the symbol as one that should have the value y when + using "allnoconfig". Used for symbols that hide other symbols. 
+ Menu dependencies ----------------- diff --git a/init/Kconfig b/init/Kconfig index 8851c6417880..427ba60d638f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1483,6 +1483,7 @@ config PCI_QUIRKS config EMBEDDED bool "Embedded system" + option allnoconfig_y select EXPERT help This option should be enabled if compiling the kernel for diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 87f723804079..f88d90f20228 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -1178,7 +1178,10 @@ bool conf_set_all_new_symbols(enum conf_def_mode mode) sym->def[S_DEF_USER].tri = mod; break; case def_no: - sym->def[S_DEF_USER].tri = no; + if (sym->flags & SYMBOL_ALLNOCONFIG_Y) + sym->def[S_DEF_USER].tri = yes; + else + sym->def[S_DEF_USER].tri = no; break; case def_random: sym->def[S_DEF_USER].tri = no; diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index ba663e1dc7e3..412ea8a2abb8 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -109,6 +109,9 @@ struct symbol { /* choice values need to be set before calculating this symbol value */ #define SYMBOL_NEED_SET_CHOICE_VALUES 0x100000 +/* Set symbol to y if allnoconfig; used for symbols that hide others */ +#define SYMBOL_ALLNOCONFIG_Y 0x200000 + #define SYMBOL_MAXLENGTH 256 #define SYMBOL_HASHSIZE 9973 diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h index 09f4edfdc911..d5daa7af8b49 100644 --- a/scripts/kconfig/lkc.h +++ b/scripts/kconfig/lkc.h @@ -61,6 +61,7 @@ enum conf_def_mode { #define T_OPT_MODULES 1 #define T_OPT_DEFCONFIG_LIST 2 #define T_OPT_ENV 3 +#define T_OPT_ALLNOCONFIG_Y 4 struct kconf_id { int name; diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index db1512ae30cc..3ac2c9c6e280 100644 --- a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -217,6 +217,9 @@ void menu_add_option(int token, char *arg) case T_OPT_ENV: prop_add_env(arg); break; + case T_OPT_ALLNOCONFIG_Y: + current_entry->sym->flags |= SYMBOL_ALLNOCONFIG_Y; + break; } } diff --git a/scripts/kconfig/zconf.gperf b/scripts/kconfig/zconf.gperf index f14ab41154b6..b6ac02d604f1 100644 --- a/scripts/kconfig/zconf.gperf +++ b/scripts/kconfig/zconf.gperf @@ -44,4 +44,5 @@ on, T_ON, TF_PARAM modules, T_OPT_MODULES, TF_OPTION defconfig_list, T_OPT_DEFCONFIG_LIST,TF_OPTION env, T_OPT_ENV, TF_OPTION +allnoconfig_y, T_OPT_ALLNOCONFIG_Y,TF_OPTION %% diff --git a/scripts/kconfig/zconf.hash.c_shipped b/scripts/kconfig/zconf.hash.c_shipped index 40df0005daa9..c77a8eff1ef2 100644 --- a/scripts/kconfig/zconf.hash.c_shipped +++ b/scripts/kconfig/zconf.hash.c_shipped @@ -55,10 +55,10 @@ kconf_id_hash (register const char *str, register unsigned int len) 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, - 73, 73, 73, 73, 73, 73, 73, 73, 25, 25, + 73, 73, 73, 73, 73, 73, 73, 5, 25, 25, 0, 0, 0, 5, 0, 0, 73, 73, 5, 0, 10, 5, 45, 73, 20, 20, 0, 15, 15, 73, - 20, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 20, 5, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, @@ -106,6 +106,7 @@ struct kconf_id_strings_t char kconf_id_strings_str23[sizeof("mainmenu")]; char kconf_id_strings_str25[sizeof("menuconfig")]; char kconf_id_strings_str27[sizeof("modules")]; + char kconf_id_strings_str28[sizeof("allnoconfig_y")]; char kconf_id_strings_str29[sizeof("menu")]; char kconf_id_strings_str31[sizeof("select")]; char 
kconf_id_strings_str32[sizeof("comment")]; @@ -141,6 +142,7 @@ static const struct kconf_id_strings_t kconf_id_strings_contents = "mainmenu", "menuconfig", "modules", + "allnoconfig_y", "menu", "select", "comment", @@ -170,7 +172,7 @@ kconf_id_lookup (register const char *str, register unsigned int len) { enum { - TOTAL_KEYWORDS = 32, + TOTAL_KEYWORDS = 33, MIN_WORD_LENGTH = 2, MAX_WORD_LENGTH = 14, MIN_HASH_VALUE = 2, @@ -219,7 +221,8 @@ kconf_id_lookup (register const char *str, register unsigned int len) {-1}, #line 44 "scripts/kconfig/zconf.gperf" {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str27, T_OPT_MODULES, TF_OPTION}, - {-1}, +#line 47 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str28, T_OPT_ALLNOCONFIG_Y,TF_OPTION}, #line 16 "scripts/kconfig/zconf.gperf" {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str29, T_MENU, TF_COMMAND}, {-1}, @@ -282,5 +285,5 @@ kconf_id_lookup (register const char *str, register unsigned int len) } return 0; } -#line 47 "scripts/kconfig/zconf.gperf" +#line 48 "scripts/kconfig/zconf.gperf" -- cgit From b607e70ec6a982fb4e69c4949b5ec57341cf0d94 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:10 -0700 Subject: bug: when !CONFIG_BUG, simplify WARN_ON_ONCE and family When !CONFIG_BUG, WARN_ON and family become simple passthroughs of their condition argument; however, WARN_ON_ONCE and family still have conditions and a boolean to detect one-time invocation, even though the warning they'd emit doesn't exist. Make the existing definitions conditional on CONFIG_BUG, and add definitions for !CONFIG_BUG that map to the passthrough versions of WARN and WARN_ON. This saves 4.4k on a minimized configuration (smaller than allnoconfig), and 20.6k with defconfig plus CONFIG_BUG=n. Signed-off-by: Josh Triplett Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 57 +++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7d10f962aa13..7ecd398aebf0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -106,33 +106,6 @@ extern void warn_slowpath_null(const char *file, const int line); unlikely(__ret_warn_on); \ }) -#else /* !CONFIG_BUG */ -#ifndef HAVE_ARCH_BUG -#define BUG() do {} while(0) -#endif - -#ifndef HAVE_ARCH_BUG_ON -#define BUG_ON(condition) do { if (condition) ; } while(0) -#endif - -#ifndef HAVE_ARCH_WARN_ON -#define WARN_ON(condition) ({ \ - int __ret_warn_on = !!(condition); \ - unlikely(__ret_warn_on); \ -}) -#endif - -#ifndef WARN -#define WARN(condition, format...) ({ \ - int __ret_warn_on = !!(condition); \ - unlikely(__ret_warn_on); \ -}) -#endif - -#define WARN_TAINT(condition, taint, format...) WARN_ON(condition) - -#endif - #define WARN_ON_ONCE(condition) ({ \ static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ @@ -163,6 +136,36 @@ extern void warn_slowpath_null(const char *file, const int line); unlikely(__ret_warn_once); \ }) +#else /* !CONFIG_BUG */ +#ifndef HAVE_ARCH_BUG +#define BUG() do {} while(0) +#endif + +#ifndef HAVE_ARCH_BUG_ON +#define BUG_ON(condition) do { if (condition) ; } while(0) +#endif + +#ifndef HAVE_ARCH_WARN_ON +#define WARN_ON(condition) ({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) +#endif + +#ifndef WARN +#define WARN(condition, format...) 
({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) +#endif + +#define WARN_ON_ONCE(condition) WARN_ON(condition) +#define WARN_ONCE(condition, format...) WARN(condition, format) +#define WARN_TAINT(condition, taint, format...) WARN(condition, format) +#define WARN_TAINT_ONCE(condition, taint, format...) WARN(condition, format) + +#endif + /* * WARN_ON_SMP() is for cases that the warning is either * meaningless for !SMP or may even cause failures. -- cgit From a3f7607d09e28520ad6d86ea423051605f7d4c6f Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:11 -0700 Subject: include/asm-generic/bug.h: style fix: s/while(0)/while (0)/ Signed-off-by: Josh Triplett Reported-by: Randy Dunlap Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7ecd398aebf0..2d54d8df7bd1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -52,7 +52,7 @@ struct bug_entry { #endif #ifndef HAVE_ARCH_BUG_ON -#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while(0) +#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif /* @@ -138,11 +138,11 @@ extern void warn_slowpath_null(const char *file, const int line); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while(0) +#define BUG() do {} while (0) #endif #ifndef HAVE_ARCH_BUG_ON -#define BUG_ON(condition) do { if (condition) ; } while(0) +#define BUG_ON(condition) do { if (condition) ; } while (0) #endif #ifndef HAVE_ARCH_WARN_ON -- cgit From 4e50ebde32bed67a9aec8c239bbd89e5d0b8727b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:12 -0700 Subject: bug: when !CONFIG_BUG, make WARN call no_printk to check format and args The stub version of WARN for !CONFIG_BUG completely ignored its format string and subsequent arguments; make it check them instead, using no_printk. Signed-off-by: Josh Triplett Reported-by: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2d54d8df7bd1..a97fa11aaf41 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -155,6 +155,7 @@ extern void warn_slowpath_null(const char *file, const int line); #ifndef WARN #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ + no_printk(format); \ unlikely(__ret_warn_on); \ }) #endif -- cgit From a4b5d580e07875f9be29f62a57c67fbbdbb40ba2 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:13 -0700 Subject: bug: Make BUG() always stop the machine When !CONFIG_BUG and !HAVE_ARCH_BUG, define the generic BUG() as an infinite loop rather than a no-op. This avoids undefined behavior if execution ever actually reaches BUG(), and avoids warnings about code after BUG() (such as on non-void functions calling BUG() and then not returning). 
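A minimal stand-alone sketch of the warning in question (illustration only, assuming a generic C compiler with -Wall rather than kernel headers; BUG_OLD()/BUG_NEW() are hypothetical stand-ins for the old and new !CONFIG_BUG definitions, not kernel code):

#include <stdio.h>

#define BUG_OLD() do { } while (0)	/* old stub: execution falls through */
#define BUG_NEW() do { } while (1)	/* new stub: never returns */

static int pick(int key)
{
	switch (key) {
	case 0:
		return 42;
	default:
		/* With BUG_OLD() here, -Wall warns that control can reach the
		 * end of a non-void function; with BUG_NEW() the loop never
		 * exits, so the fall-through path is provably unreachable. */
		BUG_NEW();
	}
}

int main(void)
{
	printf("%d\n", pick(0));
	return 0;
}

Either form gives the compiler a "never returns" guarantee; the x86 patch later in this series goes further and always traps on an undefined instruction, which can allow better error recovery than looping.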
bloat-o-meter results: add/remove: 0/0 grow/shrink: 43/10 up/down: 235/-98 (137) function old new delta umount_collect 119 138 +19 notify_change 306 324 +18 xstate_enable_boot_cpu 252 269 +17 kunmap 54 70 +16 balloon_page_dequeue 112 126 +14 mm_take_all_locks 223 233 +10 list_lru_walk_node 143 152 +9 vma_adjust 1059 1067 +8 pcpu_setup_first_chunk 1130 1138 +8 mm_drop_all_locks 143 151 +8 ns_capable 55 62 +7 anon_transport_class_unregister 8 15 +7 srcu_init_notifier_head 35 41 +6 shrink_dcache_for_umount 174 180 +6 kunmap_high 99 105 +6 end_page_writeback 43 49 +6 do_exit 1339 1345 +6 __kfifo_dma_out_prepare_r 86 92 +6 __kfifo_dma_in_prepare_r 90 96 +6 fixup_user_fault 120 125 +5 repair_env_string 73 77 +4 read_cache_pages_invalidate_page 56 60 +4 isolate_lru_pages.isra 142 146 +4 do_notify_parent_cldstop 255 259 +4 cpu_init 370 374 +4 utimes_common 270 272 +2 tasklet_hi_action 91 93 +2 tasklet_action 91 93 +2 set_pte_vaddr 46 48 +2 find_get_pages_tag 202 204 +2 early_iounmap 185 187 +2 __native_set_fixmap 36 38 +2 __get_user_pages 822 824 +2 __early_ioremap 299 301 +2 yield_task_stop 1 2 +1 tick_resume 37 38 +1 switched_to_stop 1 2 +1 switched_to_idle 1 2 +1 prio_changed_stop 1 2 +1 prio_changed_idle 1 2 +1 pm_qos_power_read 111 112 +1 arch_cpu_idle_dead 1 2 +1 __insert_vmap_area 140 141 +1 sys_renameat 614 612 -2 mm_fault_error 297 295 -2 SyS_renameat 614 612 -2 sys_linkat 416 413 -3 SyS_linkat 416 413 -3 chmod_common 129 122 -7 proc_cap_handler 240 225 -15 __schedule 849 831 -18 sys_madvise 1077 1054 -23 SyS_madvise 1077 1054 -23 Signed-off-by: Josh Triplett Reported-by: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index a97fa11aaf41..630dd2372238 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -138,7 +138,7 @@ extern void warn_slowpath_null(const char *file, const int line); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while (0) +#define BUG() do {} while (1) #endif #ifndef HAVE_ARCH_BUG_ON -- cgit From b06dd879f5db33c1d7f5ab516ea671627f99c0c9 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:14 -0700 Subject: x86: always define BUG() and HAVE_ARCH_BUG, even with !CONFIG_BUG This ensures that BUG() always has a definition that causes a trap (via an undefined instruction), and that the compiler still recognizes the code following BUG() as unreachable, avoiding warnings that would otherwise appear (such as on non-void functions that don't return a value after BUG()). In addition to saving a few bytes over the generic infinite-loop implementation, this implementation traps rather than looping, which potentially allows for better error-recovery behavior (such as by rebooting). Signed-off-by: Josh Triplett Reported-by: Arnd Bergmann Acked-by: Arnd Bergmann Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bug.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 2f03ff018d36..ba38ebbaced3 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H -#ifdef CONFIG_BUG #define HAVE_ARCH_BUG #ifdef CONFIG_DEBUG_BUGVERBOSE @@ -33,8 +32,6 @@ do { \ } while (0) #endif -#endif /* !CONFIG_BUG */ - #include #endif /* _ASM_X86_BUG_H */ -- cgit From 16caed319604609a5579df132fce362a456170d7 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 7 Apr 2014 15:39:15 -0700 Subject: fault-injection: set bounds on what /proc/self/make-it-fail accepts. /proc/self/make-it-fail is a boolean, but accepts any number, including negative ones. Change variable to unsigned, and cap upper bound at 1. [akpm@linux-foundation.org: don't make make_it_fail unsigned] Signed-off-by: Dave Jones Reviewed-by: Akinobu Mita Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8da60e768b42..6b7087e2e8fb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1236,6 +1236,9 @@ static ssize_t proc_fault_inject_write(struct file * file, make_it_fail = simple_strtol(strstrip(buffer), &end, 0); if (*end) return -EINVAL; + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; -- cgit From 6aa7a29aa8378fa868fabef6a84b0a40d5d9d677 Mon Sep 17 00:00:00 2001 From: "Daniel M. Weeks" Date: Mon, 7 Apr 2014 15:39:16 -0700 Subject: initramfs: debug detected compression method This can greatly aid in narrowing down the real source of initramfs problems such as failures related to the compression of the in-kernel initramfs when an external initramfs is in use as well. Existing errors are ambiguous as to which initramfs is a problem and why. [akpm@linux-foundation.org: use pr_debug()] Signed-off-by: Daniel M. Weeks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 1 + lib/decompress.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/init/initramfs.c b/init/initramfs.c index 93b61396756b..a8497fab1c3d 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -455,6 +455,7 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len) } this_header = 0; decompress = decompress_method(buf, len, &compress_name); + pr_debug("Detected %s compressed data\n", compress_name); if (decompress) { res = decompress(buf, len, NULL, flush_buffer, NULL, &my_inptr, error); diff --git a/lib/decompress.c b/lib/decompress.c index 4d1cd0397aab..86069d74c062 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifndef CONFIG_DECOMPRESS_GZIP # define gunzip NULL @@ -61,6 +62,8 @@ decompress_fn __init decompress_method(const unsigned char *inbuf, int len, if (len < 2) return NULL; /* Need at least this much... 
*/ + pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]); + for (cf = compressed_formats; cf->name; cf++) { if (!memcmp(inbuf, cf->magic, 2)) break; -- cgit From 187841a800f328e93529086ca58145e7a44bff3f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:39:17 -0700 Subject: ipc/compat.c: remove sc_semopm macro This macro appears to have been introduced back in the 2.5 era for semtimedop32 backward compatibility on ia32: https://lkml.org/lkml/2003/4/28/78 Nowadays, this syscall in compat just defaults back to the code found in sem.c, so it is no longer used and can thus be removed: long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, unsigned nsops, const struct compat_timespec __user *timeout) { struct timespec __user *ts64; if (compat_convert_timespec(&ts64, timeout)) return -EFAULT; return sys_semtimedop(semid, tsems, nsops, ts64); } Furthermore, there are no users in compat.c. After this change, kernel builds just fine with both CONFIG_SYSVIPC_COMPAT and CONFIG_SYSVIPC. Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/compat.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/ipc/compat.c b/ipc/compat.c index a4695ada3275..45d035d4cedc 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -113,9 +113,6 @@ struct compat_shm_info { compat_ulong_t swap_attempts, swap_successes; }; -extern int sem_ctls[]; -#define sc_semopm (sem_ctls[2]) - static inline int compat_ipc_parse_version(int *cmd) { #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION -- cgit From 6d08a2567c0b9103c3ff946df17ad4be9a917e2f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:39:18 -0700 Subject: ipc: use device_initcall ... since __initcall is now deprecated. Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/ipc_sysctl.c | 2 +- ipc/mqueue.c | 2 +- ipc/util.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 17028648cfeb..998d31b230f1 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -281,4 +281,4 @@ static int __init ipc_sysctl_init(void) return 0; } -__initcall(ipc_sysctl_init); +device_initcall(ipc_sysctl_init); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c3b31179122c..4fcf39af1776 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1459,4 +1459,4 @@ out_sysctl: return error; } -__initcall(init_mqueue_fs); +device_initcall(init_mqueue_fs); diff --git a/ipc/util.c b/ipc/util.c index e1b4c6db8aa0..2eb0d1eaa312 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -128,7 +128,7 @@ static int __init ipc_init(void) register_ipcns_notifier(&init_ipc_ns); return 0; } -__initcall(ipc_init); +device_initcall(ipc_init); /** * ipc_init_ids - initialise ipc identifiers -- cgit From ce816fa88cca083c47ab9000b2138a83043a78be Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Mon, 7 Apr 2014 15:39:19 -0700 Subject: Kconfig: rename HAS_IOPORT to HAS_IOPORT_MAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the renamed symbol is defined lib/iomap.c implements ioport_map and ioport_unmap and currently (nearly) all platforms define the port accessor functions outb/inb and friend unconditionally. So HAS_IOPORT_MAP is the better name for this. Consequently NO_IOPORT is renamed to NO_IOPORT_MAP. The motivation for this change is to reintroduce a symbol HAS_IOPORT that signals if outb/int et al are available. 
I will address that at least one merge window later though to keep surprises to a minimum and catch new introductions of (HAS|NO)_IOPORT. The changes in this commit were done using: $ git grep -l -E '(NO|HAS)_IOPORT' | xargs perl -p -i -e 's/\b((?:CONFIG_)?(?:NO|HAS)_IOPORT)\b/$1_MAP/' Signed-off-by: Uwe Kleine-König Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/Kconfig | 2 +- arch/arm/Kconfig | 12 ++++++------ arch/arm/mach-picoxcell/Kconfig | 2 +- arch/arm/mach-prima2/Kconfig | 2 +- arch/arm/mach-s3c24xx/Kconfig | 2 +- arch/arm/mach-shmobile/Kconfig | 2 +- arch/arm/mach-vexpress/Kconfig | 2 +- arch/arm/plat-samsung/Kconfig | 4 ++-- arch/arm64/Kconfig | 2 +- arch/cris/Kconfig | 2 +- arch/hexagon/Kconfig | 2 +- arch/m32r/Kconfig | 2 +- arch/m68k/Kconfig | 2 +- arch/metag/Kconfig | 2 +- arch/mips/Kconfig | 4 ++-- arch/openrisc/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 4 ++-- arch/sh/boards/Kconfig | 8 ++++---- arch/sh/include/asm/io.h | 4 ++-- arch/sh/include/asm/io_trapped.h | 2 +- arch/sh/include/asm/machvec.h | 2 +- arch/sh/kernel/Makefile | 2 +- arch/sh/kernel/io_trapped.c | 4 ++-- arch/tile/Kconfig | 2 +- arch/unicore32/Kconfig | 2 +- arch/xtensa/Kconfig | 4 ++-- arch/xtensa/configs/iss_defconfig | 2 +- arch/xtensa/configs/s6105_defconfig | 2 +- drivers/char/tpm/Kconfig | 2 +- drivers/i2c/busses/Kconfig | 2 +- drivers/net/can/sja1000/Kconfig | 2 +- drivers/net/ethernet/3com/Kconfig | 2 +- include/asm-generic/io.h | 4 ++-- include/asm-generic/iomap.h | 2 +- include/linux/io.h | 2 +- lib/Kconfig | 4 ++-- lib/devres.c | 4 ++-- lib/iomap.c | 4 ++-- sound/isa/Kconfig | 2 +- sound/pci/Kconfig | 2 +- 41 files changed, 59 insertions(+), 59 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 75de197a2fef..9596b0ab108d 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -57,7 +57,7 @@ config ARCH_FLATMEM_ENABLE config MMU def_bool y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config GENERIC_CALIBRATE_DELAY diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d7a71e3ef55f..5db05f6a0412 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -126,7 +126,7 @@ config HAVE_TCM config HAVE_PROC_CPU bool -config NO_IOPORT +config NO_IOPORT_MAP bool config EISA @@ -410,7 +410,7 @@ config ARCH_EBSA110 select ISA select NEED_MACH_IO_H select NEED_MACH_MEMORY_H - select NO_IOPORT + select NO_IOPORT_MAP help This is an evaluation board for the StrongARM processor available from Digital. 
It has limited hardware on-board, including an @@ -428,7 +428,7 @@ config ARCH_EFM32 select CPU_V7M select GENERIC_CLOCKEVENTS select NO_DMA - select NO_IOPORT + select NO_IOPORT_MAP select SPARSE_IRQ select USE_OF help @@ -677,7 +677,7 @@ config ARCH_SHMOBILE_LEGACY select HAVE_SMP select MIGHT_HAVE_CACHE_L2X0 select MULTI_IRQ_HANDLER - select NO_IOPORT + select NO_IOPORT_MAP select PINCTRL select PM_GENERIC_DOMAINS if PM select SPARSE_IRQ @@ -699,7 +699,7 @@ config ARCH_RPC select ISA_DMA_API select NEED_MACH_IO_H select NEED_MACH_MEMORY_H - select NO_IOPORT + select NO_IOPORT_MAP select VIRT_TO_BUS help On the Acorn Risc-PC, Linux can support the internal IDE disk and @@ -760,7 +760,7 @@ config ARCH_S3C64XX select HAVE_S3C2410_I2C if I2C select HAVE_S3C2410_WATCHDOG if WATCHDOG select HAVE_TCM - select NO_IOPORT + select NO_IOPORT_MAP select PLAT_SAMSUNG select PM_GENERIC_DOMAINS if PM select S3C_DEV_NAND diff --git a/arch/arm/mach-picoxcell/Kconfig b/arch/arm/mach-picoxcell/Kconfig index eca9eb1c5931..62240f69b4ee 100644 --- a/arch/arm/mach-picoxcell/Kconfig +++ b/arch/arm/mach-picoxcell/Kconfig @@ -4,4 +4,4 @@ config ARCH_PICOXCELL select ARM_VIC select DW_APB_TIMER_OF select HAVE_TCM - select NO_IOPORT + select NO_IOPORT_MAP diff --git a/arch/arm/mach-prima2/Kconfig b/arch/arm/mach-prima2/Kconfig index 3e8189186a5b..e4e505f52ba0 100644 --- a/arch/arm/mach-prima2/Kconfig +++ b/arch/arm/mach-prima2/Kconfig @@ -3,7 +3,7 @@ config ARCH_SIRF select ARCH_HAS_RESET_CONTROLLER select ARCH_REQUIRE_GPIOLIB select GENERIC_IRQ_CHIP - select NO_IOPORT + select NO_IOPORT_MAP select PINCTRL select PINCTRL_SIRF help diff --git a/arch/arm/mach-s3c24xx/Kconfig b/arch/arm/mach-s3c24xx/Kconfig index ba1cc6246778..40cf50b9940c 100644 --- a/arch/arm/mach-s3c24xx/Kconfig +++ b/arch/arm/mach-s3c24xx/Kconfig @@ -12,7 +12,7 @@ if ARCH_S3C24XX config PLAT_S3C24XX def_bool y select ARCH_REQUIRE_GPIOLIB - select NO_IOPORT + select NO_IOPORT_MAP select S3C_DEV_NAND select IRQ_DOMAIN help diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig index a182008e3aeb..0f92ba8e7884 100644 --- a/arch/arm/mach-shmobile/Kconfig +++ b/arch/arm/mach-shmobile/Kconfig @@ -10,7 +10,7 @@ config ARCH_SHMOBILE_MULTI select ARM_GIC select MIGHT_HAVE_PCI select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE - select NO_IOPORT + select NO_IOPORT_MAP select PINCTRL select ARCH_REQUIRE_GPIOLIB diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig index 80b4be36f10a..657d52d0391f 100644 --- a/arch/arm/mach-vexpress/Kconfig +++ b/arch/arm/mach-vexpress/Kconfig @@ -10,7 +10,7 @@ config ARCH_VEXPRESS select HAVE_ARM_TWD if SMP select HAVE_PATA_PLATFORM select ICST - select NO_IOPORT + select NO_IOPORT_MAP select PLAT_VERSATILE select PLAT_VERSATILE_CLCD select POWER_RESET diff --git a/arch/arm/plat-samsung/Kconfig b/arch/arm/plat-samsung/Kconfig index b57e922f1614..243dfcb2ca0e 100644 --- a/arch/arm/plat-samsung/Kconfig +++ b/arch/arm/plat-samsung/Kconfig @@ -9,7 +9,7 @@ config PLAT_SAMSUNG depends on PLAT_S3C24XX || ARCH_S3C64XX || PLAT_S5P || ARCH_EXYNOS default y select GENERIC_IRQ_CHIP - select NO_IOPORT + select NO_IOPORT_MAP help Base platform code for all Samsung SoC based systems @@ -19,7 +19,7 @@ config PLAT_S5P default y select ARCH_REQUIRE_GPIOLIB select ARM_VIC - select NO_IOPORT + select NO_IOPORT_MAP select PLAT_SAMSUNG select S3C_GPIO_TRACK select S5P_GPIO_DRVSTR diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9711a5fd948d..8079a23e2701 100644 --- a/arch/arm64/Kconfig +++ 
b/arch/arm64/Kconfig @@ -66,7 +66,7 @@ config ARCH_PHYS_ADDR_T_64BIT config MMU def_bool y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config STACKTRACE_SUPPORT diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 7cb90a54b598..52731e221851 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -29,7 +29,7 @@ config GENERIC_CALIBRATE_DELAY bool default y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config FORCE_MAX_ZONEORDER diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index fbc5c78c9ac7..0fd6138f6203 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -19,7 +19,7 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select NO_IOPORT + select NO_IOPORT_MAP select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD select STACKTRACE_SUPPORT diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index ca4504424dae..9e44bbd8051e 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -28,7 +28,7 @@ config ZONE_DMA bool default y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config NO_DMA diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index b2e322939256..87b7c7581b1d 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -52,7 +52,7 @@ config TIME_LOW_RES bool default y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config NO_DMA diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index b1d3c9c0eff8..499b7610eaaf 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -52,7 +52,7 @@ config GENERIC_HWEIGHT config GENERIC_CALIBRATE_DELAY def_bool y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y source "init/Kconfig" diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 16d5ab1615b1..5cd695f905a1 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -175,7 +175,7 @@ config MACH_DECSTATION select CPU_R4000_WORKAROUNDS if 64BIT select CPU_R4400_WORKAROUNDS if 64BIT select DMA_NONCOHERENT - select NO_IOPORT + select NO_IOPORT_MAP select IRQ_CPU select SYS_HAS_CPU_R3000 select SYS_HAS_CPU_R4X00 @@ -947,7 +947,7 @@ config SYNC_R4K config MIPS_MACHINE def_bool n -config NO_IOPORT +config NO_IOPORT_MAP def_bool n config GENERIC_ISA_DMA diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 9488209a5253..e71d712afb79 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -41,7 +41,7 @@ config RWSEM_XCHGADD_ALGORITHM config GENERIC_HWEIGHT def_bool y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config TRACE_IRQFLAGS_SUPPORT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 953f17c8d17c..346d21678ffd 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -52,7 +52,7 @@ config KEXEC config AUDIT_ARCH def_bool y -config NO_IOPORT +config NO_IOPORT_MAP def_bool y config PCI_QUIRKS diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 1399383315a3..ba55e939a820 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -3,7 +3,7 @@ config SUPERH select ARCH_MIGHT_HAVE_PC_PARPORT select EXPERT select CLKDEV_LOOKUP - select HAVE_IDE if HAS_IOPORT + select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select ARCH_DISCARD_MEMBLOCK @@ -138,7 +138,7 @@ config ARCH_HAS_ILOG2_U32 config ARCH_HAS_ILOG2_U64 def_bool n -config NO_IOPORT +config NO_IOPORT_MAP def_bool !PCI depends on !SH_CAYMAN && !SH_SH4202_MICRODEV && !SH_SHMIN && \ !SH_HP6XX && !SH_SOLUTION_ENGINE diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig index eb1cf84231a2..e331e5373b8e 100644 --- a/arch/sh/boards/Kconfig +++ b/arch/sh/boards/Kconfig @@ -158,7 +158,7 @@ config 
SH_SDK7786 bool "SDK7786" depends on CPU_SUBTYPE_SH7786 select SYS_SUPPORTS_PCI - select NO_IOPORT if !PCI + select NO_IOPORT_MAP if !PCI select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_SRAM_POOL select REGULATOR_FIXED_VOLTAGE if REGULATOR @@ -204,7 +204,7 @@ config SH_URQUELL depends on CPU_SUBTYPE_SH7786 select ARCH_REQUIRE_GPIOLIB select SYS_SUPPORTS_PCI - select NO_IOPORT if !PCI + select NO_IOPORT_MAP if !PCI config SH_MIGOR bool "Migo-R" @@ -306,7 +306,7 @@ config SH_LBOX_RE2 config SH_X3PROTO bool "SH-X3 Prototype board" depends on CPU_SUBTYPE_SHX3 - select NO_IOPORT if !PCI + select NO_IOPORT_MAP if !PCI select IRQ_DOMAIN config SH_MAGIC_PANEL_R2 @@ -333,7 +333,7 @@ config SH_POLARIS config SH_SH2007 bool "SH-2007 board" - select NO_IOPORT + select NO_IOPORT_MAP select REGULATOR_FIXED_VOLTAGE if REGULATOR depends on CPU_SUBTYPE_SH7780 help diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 629db2ad7916..728c4c571f40 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -122,7 +122,7 @@ __BUILD_MEMORY_STRING(__raw_, l, u32) __BUILD_MEMORY_STRING(__raw_, q, u64) -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP /* * Slowdown I/O port space accesses for antique hardware. @@ -218,7 +218,7 @@ __BUILD_IOPORT_STRING(w, u16) __BUILD_IOPORT_STRING(l, u32) __BUILD_IOPORT_STRING(q, u64) -#else /* !CONFIG_HAS_IOPORT */ +#else /* !CONFIG_HAS_IOPORT_MAP */ #include diff --git a/arch/sh/include/asm/io_trapped.h b/arch/sh/include/asm/io_trapped.h index f1251d4f0ba9..4ab94ef51071 100644 --- a/arch/sh/include/asm/io_trapped.h +++ b/arch/sh/include/asm/io_trapped.h @@ -36,7 +36,7 @@ __ioremap_trapped(unsigned long offset, unsigned long size) #define __ioremap_trapped(offset, size) NULL #endif -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP extern struct list_head trapped_io; static inline void __iomem * diff --git a/arch/sh/include/asm/machvec.h b/arch/sh/include/asm/machvec.h index eb9c20d971dd..d3324e4f372e 100644 --- a/arch/sh/include/asm/machvec.h +++ b/arch/sh/include/asm/machvec.h @@ -21,7 +21,7 @@ struct sh_machine_vector { int (*mv_irq_demux)(int irq); void (*mv_init_irq)(void); -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP void __iomem *(*mv_ioport_map)(unsigned long port, unsigned int size); void (*mv_ioport_unmap)(void __iomem *); #endif diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile index 261c8bfd75ce..2ccf36c824c6 100644 --- a/arch/sh/kernel/Makefile +++ b/arch/sh/kernel/Makefile @@ -22,7 +22,7 @@ obj-y := debugtraps.o dma-nommu.o dumpstack.o \ ifndef CONFIG_GENERIC_IOMAP obj-y += iomap.o -obj-$(CONFIG_HAS_IOPORT) += ioport.o +obj-$(CONFIG_HAS_IOPORT_MAP) += ioport.o endif obj-$(CONFIG_SUPERH32) += sys_sh32.o diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index c0a9761f2f8a..f8ce36286cea 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -22,7 +22,7 @@ #define TRAPPED_PAGES_MAX 16 -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP LIST_HEAD(trapped_io); EXPORT_SYMBOL_GPL(trapped_io); #endif @@ -90,7 +90,7 @@ int register_trapped_io(struct trapped_io *tiop) tiop->magic = IO_TRAPPED_MAGIC; INIT_LIST_HEAD(&tiop->list); spin_lock_irq(&trapped_lock); -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP if (flags & IORESOURCE_IO) list_add(&tiop->list, &trapped_io); #endif diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 31c8c6223995..85258ca43ff5 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -411,7 +411,7 @@ config PCI_DOMAINS config 
NO_IOMEM def_bool !PCI -config NO_IOPORT +config NO_IOPORT_MAP def_bool !PCI config TILE_PCI_IO diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 25c0dba508cc..aafad6fa1667 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -27,7 +27,7 @@ config UNICORE32 config GENERIC_CSUM def_bool y -config NO_IOPORT +config NO_IOPORT_MAP bool config STACKTRACE_SUPPORT diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index c87ae7c6e5f9..02d6d29a63c1 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -41,7 +41,7 @@ config ARCH_HAS_ILOG2_U32 config ARCH_HAS_ILOG2_U64 def_bool n -config NO_IOPORT +config NO_IOPORT_MAP def_bool n config HZ @@ -239,7 +239,7 @@ config XTENSA_PLATFORM_XT2000 config XTENSA_PLATFORM_S6105 bool "S6105" select SERIAL_CONSOLE - select NO_IOPORT + select NO_IOPORT_MAP config XTENSA_PLATFORM_XTFPGA bool "XTFPGA" diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 4f233204faf9..d57d917ff240 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -11,7 +11,7 @@ CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_HWEIGHT=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set -CONFIG_NO_IOPORT=y +CONFIG_NO_IOPORT_MAP=y CONFIG_HZ=100 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" CONFIG_CONSTRUCTORS=y diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index d929f77a0360..583c2b0974ca 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -11,7 +11,7 @@ CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_HWEIGHT=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set -CONFIG_NO_IOPORT=y +CONFIG_NO_IOPORT_MAP=y CONFIG_HZ=100 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 1a65838888cd..c54cac3f8bc8 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -74,7 +74,7 @@ config TCG_NSC config TCG_ATMEL tristate "Atmel TPM Interface" - depends on PPC64 || HAS_IOPORT + depends on PPC64 || HAS_IOPORT_MAP ---help--- If you have a TPM security chip from Atmel say Yes and it will be accessible from within Linux. To compile this driver diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index de17c5593d97..014afab1d551 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -936,7 +936,7 @@ config I2C_ACORN config I2C_ELEKTOR tristate "Elektor ISA card" - depends on ISA && HAS_IOPORT && BROKEN_ON_SMP + depends on ISA && HAS_IOPORT_MAP && BROKEN_ON_SMP select I2C_ALGOPCF help This supports the PCF8584 ISA bus I2C adapter. Say Y if you own diff --git a/drivers/net/can/sja1000/Kconfig b/drivers/net/can/sja1000/Kconfig index 4b18b8765523..1e65cb6c2591 100644 --- a/drivers/net/can/sja1000/Kconfig +++ b/drivers/net/can/sja1000/Kconfig @@ -39,7 +39,7 @@ config CAN_EMS_PCI config CAN_PEAK_PCMCIA tristate "PEAK PCAN-PC Card" depends on PCMCIA - depends on HAS_IOPORT + depends on HAS_IOPORT_MAP ---help--- This driver is for the PCAN-PC Card PCMCIA adapter (1 or 2 channels) from PEAK-System (http://www.peak-system.com). 
To compile this diff --git a/drivers/net/ethernet/3com/Kconfig b/drivers/net/ethernet/3com/Kconfig index 65b735d4a6ad..afaab4b2333f 100644 --- a/drivers/net/ethernet/3com/Kconfig +++ b/drivers/net/ethernet/3com/Kconfig @@ -66,7 +66,7 @@ config PCMCIA_3C589 config VORTEX tristate "3c590/3c900 series (592/595/597) \"Vortex/Boomerang\" support" - depends on (PCI || EISA) && HAS_IOPORT + depends on (PCI || EISA) && HAS_IOPORT_MAP select MII ---help--- This option enables driver support for a large number of 10Mbps and diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index d5afe96adba6..975e1cc75edb 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -327,7 +327,7 @@ static inline void iounmap(void __iomem *addr) } #endif /* CONFIG_MMU */ -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP #ifndef CONFIG_GENERIC_IOMAP static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -341,7 +341,7 @@ static inline void ioport_unmap(void __iomem *p) extern void __iomem *ioport_map(unsigned long port, unsigned int nr); extern void ioport_unmap(void __iomem *p); #endif /* CONFIG_GENERIC_IOMAP */ -#endif /* CONFIG_HAS_IOPORT */ +#endif /* CONFIG_HAS_IOPORT_MAP */ #ifndef xlate_dev_kmem_ptr #define xlate_dev_kmem_ptr(p) p diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h index 6afd7d6a9899..1b41011643a5 100644 --- a/include/asm-generic/iomap.h +++ b/include/asm-generic/iomap.h @@ -56,7 +56,7 @@ extern void iowrite8_rep(void __iomem *port, const void *buf, unsigned long coun extern void iowrite16_rep(void __iomem *port, const void *buf, unsigned long count); extern void iowrite32_rep(void __iomem *port, const void *buf, unsigned long count); -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP /* Create a virtual mapping cookie for an IO port range */ extern void __iomem *ioport_map(unsigned long port, unsigned int nr); extern void ioport_unmap(void __iomem *); diff --git a/include/linux/io.h b/include/linux/io.h index 8a18e75600cc..b76e6e545806 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -41,7 +41,7 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, /* * Managed iomap interface */ -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP void __iomem * devm_ioport_map(struct device *dev, unsigned long port, unsigned int nr); void devm_ioport_unmap(struct device *dev, void __iomem *addr); diff --git a/lib/Kconfig b/lib/Kconfig index 991c98bc4a3f..5d4984c505f8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -342,9 +342,9 @@ config HAS_IOMEM select GENERIC_IO default y -config HAS_IOPORT +config HAS_IOPORT_MAP boolean - depends on HAS_IOMEM && !NO_IOPORT + depends on HAS_IOMEM && !NO_IOPORT_MAP default y config HAS_DMA diff --git a/lib/devres.c b/lib/devres.c index 48cb3c7bd7de..2f16c133fd36 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -170,7 +170,7 @@ void __iomem *devm_request_and_ioremap(struct device *device, } EXPORT_SYMBOL(devm_request_and_ioremap); -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP /* * Generic iomap devres */ @@ -229,7 +229,7 @@ void devm_ioport_unmap(struct device *dev, void __iomem *addr) devm_ioport_map_match, (__force void *)addr)); } EXPORT_SYMBOL(devm_ioport_unmap); -#endif /* CONFIG_HAS_IOPORT */ +#endif /* CONFIG_HAS_IOPORT_MAP */ #ifdef CONFIG_PCI /* diff --git a/lib/iomap.c b/lib/iomap.c index 2c08f36862eb..fc3dcb4b238e 100644 --- a/lib/iomap.c +++ b/lib/iomap.c @@ -224,7 +224,7 @@ EXPORT_SYMBOL(iowrite8_rep); EXPORT_SYMBOL(iowrite16_rep); 
EXPORT_SYMBOL(iowrite32_rep); -#ifdef CONFIG_HAS_IOPORT +#ifdef CONFIG_HAS_IOPORT_MAP /* Create a virtual mapping cookie for an IO port range */ void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -239,7 +239,7 @@ void ioport_unmap(void __iomem *addr) } EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); -#endif /* CONFIG_HAS_IOPORT */ +#endif /* CONFIG_HAS_IOPORT_MAP */ #ifdef CONFIG_PCI /* Hide the details if this is a MMIO or PIO address space and just do what diff --git a/sound/isa/Kconfig b/sound/isa/Kconfig index affa13480659..0216475fc759 100644 --- a/sound/isa/Kconfig +++ b/sound/isa/Kconfig @@ -191,7 +191,7 @@ config SND_ES18XX config SND_SC6000 tristate "Gallant SC-6000/6600/7000 and Audio Excel DSP 16" - depends on HAS_IOPORT + depends on HAS_IOPORT_MAP select SND_WSS_LIB select SND_OPL3_LIB select SND_MPU401_UART diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig index 0b0c0cf13f74..3a3a3a71088b 100644 --- a/sound/pci/Kconfig +++ b/sound/pci/Kconfig @@ -688,7 +688,7 @@ config SND_LOLA config SND_LX6464ES tristate "Digigram LX6464ES" - depends on HAS_IOPORT + depends on HAS_IOPORT_MAP select SND_PCM help Say Y here to include support for Digigram LX6464ES boards. -- cgit From 52f5684c8e1ec7463192aba8e2916df49807511a Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Mon, 7 Apr 2014 15:39:20 -0700 Subject: kernel: use macros from compiler.h instead of __attribute__((...)) To increase compiler portability there is which provides convenience macros for various gcc constructs. Eg: __weak for __attribute__((weak)). I've replaced all instances of gcc attributes with the right macro in the kernel subsystem. Signed-off-by: Gideon Israel Dsouza Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/kallsyms.c | 11 ++++++----- kernel/kexec.c | 5 +++-- kernel/ksysfs.c | 5 +++-- kernel/power/power.h | 3 ++- kernel/power/snapshot.c | 3 ++- kernel/power/suspend.c | 5 +++-- kernel/power/swap.c | 2 +- kernel/sched/clock.c | 3 ++- kernel/sched/core.c | 3 ++- kernel/signal.c | 4 +++- kernel/time/timekeeping.c | 5 +++-- kernel/trace/trace.h | 3 ++- 13 files changed, 34 insertions(+), 21 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index e905e9c6b224..54a8d26f612f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -286,7 +287,7 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3127ad52cdb2..cb0cf37dac3a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -36,8 +37,8 @@ * These will be re-linked against their real values * during the second link stage. 
*/ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __weak; +extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch @@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak)); extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __weak; static inline int is_kernel_inittext(unsigned long addr) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0d261c7db7b..c8380ad203bc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1551,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...) * provide an empty default implementation here -- architecture * code may override this */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +unsigned long __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e660964086e2..2495a9b14ac8 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* rcu_expedited */ @@ -162,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); +extern const void __start_notes __weak; +extern const void __stop_notes __weak; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/power/power.h b/kernel/power/power.h index 1ca753106557..15f37ea08719 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include #include #include +#include struct swsusp_info { struct new_utsname uts; @@ -11,7 +12,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 149e745eaa52..18fb7a2fb14b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 90b3d9366d1a..c3ad9cafe930 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "power.h" @@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state) } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c33ed200410..8c9a4819f798 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -101,7 +101,7 @@ struct swsusp_header { unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b30a2924ef14..3ef6451e972e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -60,13 +60,14 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d1b87b36778..80bd491b718c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -6498,7 +6499,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. 
*/ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 5d4b05a229a6..6ea13c09ae56 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,6 +33,8 @@ #include #include #include +#include + #define CREATE_TRACE_POINTS #include @@ -3618,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) } #endif -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +__weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b40279ecd71..f7df8ea21707 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffc314b7e92b..2e29d7ba5a52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -1279,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ -- cgit From 956632857819747466e27037aa8a57e8165213c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Apr 2014 15:39:21 -0700 Subject: asm/system.h: clean asm/system.h from docs Clean asm/system.h from docs as nothing should refer to that header anymore. 
Signed-off-by: David Howells Cc: Ingo Molnar Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/irqflags-tracing.txt | 7 ------- Documentation/scheduler/sched-arch.txt | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index d0758b241b23..b90959ba37e4 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -671,7 +671,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress); <function>local_irq_save()</function>/<function>local_irq_restore()</function> - <filename class="headerfile">include/asm/system.h</filename> + <filename class="headerfile">include/linux/irqflags.h</filename> diff --git a/Documentation/irqflags-tracing.txt b/Documentation/irqflags-tracing.txt index 67aa71e73035..f6da05670e16 100644 --- a/Documentation/irqflags-tracing.txt +++ b/Documentation/irqflags-tracing.txt @@ -22,13 +22,6 @@ rather straightforward and risk-free manner. Architectures that want to support this need to do a couple of code-organizational changes first: -- move their irq-flags manipulation code from their asm/system.h header - to asm/irqflags.h - -- rename local_irq_disable()/etc to raw_local_irq_disable()/etc. so that - the linux/irqflags.h code can inject callbacks and can construct the - real local_irq_disable()/etc APIs. - - add and enable TRACE_IRQFLAGS_SUPPORT in their arch level Kconfig file and then a couple of functional changes are needed as well to implement diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt index 9290de703450..a2f27bbf2cba 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.txt @@ -8,7 +8,7 @@ Context switch By default, the switch_to arch function is called with the runqueue locked. This is usually not a problem unless switch_to may need to take the runqueue lock. This is usually due to a wake up operation in -the context switch. See arch/ia64/include/asm/system.h for an example. +the context switch. See arch/ia64/include/asm/switch_to.h for an example. To request the scheduler call switch_to with the runqueue unlocked, you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file -- cgit From cf7bc58f6dd4fdbab22e2ec5f27fe59674f425bf Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Apr 2014 15:39:22 -0700 Subject: asm/system.h: um: arch_align_stack() moved to asm/exec.h arch_align_stack() moved to asm/exec.h, so change the comment referring to asm/system.h which no longer exists. Signed-off-by: David Howells Cc: Jeff Dike Cc: Richard Weinberger Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index eecc4142764c..f17bca8ed2ce 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -359,7 +359,7 @@ int singlestepping(void * t) /* * Only x86 and x86_64 have an arch_align_stack(). * All other arches have "#define arch_align_stack(x) (x)" - * in their asm/system.h + * in their asm/exec.h * As this is included in UML from asm-um/system-generic.h, * we can use it to behave as the subarch does. 
*/ -- cgit From a44cb9449182fd7b25bf5f1cc38b7f19e0b96f6d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:23 -0700 Subject: memcg, slab: never try to merge memcg caches When a kmem cache is created (kmem_cache_create_memcg()), we first try to find a compatible cache that already exists and can handle requests from the new cache, i.e. has the same object size, alignment, ctor, etc. If there is such a cache, we do not create any new caches, instead we simply increment the refcount of the cache found and return it. Currently we do this procedure not only when creating root caches, but also for memcg caches. However, there is no point in that, because, as every memcg cache has exactly the same parameters as its parent and cache merging cannot be turned off in runtime (only on boot by passing "slub_nomerge"), the root caches of any two potentially mergeable memcg caches should be merged already, i.e. it must be the same root cache, and therefore we couldn't even get to the memcg cache creation, because it already exists. The only exception is boot caches - they are explicitly forbidden to be merged by setting their refcount to -1. There are currently only two of them - kmem_cache and kmem_cache_node, which are used in slab internals (I do not count kmalloc caches as their refcount is set to 1 immediately after creation). Since they are prevented from merging preliminary I guess we should avoid to merge their children too. So let's remove the useless code responsible for merging memcg caches. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 21 ++++----------------- mm/slab_common.c | 8 +++++--- mm/slub.c | 19 +++++++++---------- 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 8184a7cde272..3045316b7c9d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, struct mem_cgroup; #ifdef CONFIG_SLUB struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif @@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) -{ - return (is_root_cache(cachep) && !memcg) || - (cachep->memcg_params->memcg == memcg); -} - static inline void memcg_bind_pages(struct kmem_cache *s, int order) { if (!is_root_cache(s)) @@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return true; } -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) -{ - return true; -} - static inline void memcg_bind_pages(struct kmem_cache *s, int order) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ec3c619ba04..e77b51eb7347 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -200,9 +200,11 @@ 
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); - if (s) - goto out_unlock; + if (!memcg) { + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_unlock; + } err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); diff --git a/mm/slub.c b/mm/slub.c index 5b05e4fe9a1a..7d81afb27048 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3685,6 +3685,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if (!is_root_cache(s)) + return 1; + if (s->ctor) return 1; @@ -3697,9 +3700,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3722,7 +3724,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. * Courtesy of Adrian Drzewiecki @@ -3733,21 +3735,18 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, if (s->size - size >= sizeof(void *)) continue; - if (!cache_match_memcg(s, memcg)) - continue; - return s; } return NULL; } struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(memcg, size, align, flags, name, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { s->refcount++; /* -- cgit From 5722d094ad2b56fa2c1cb3adaf40071a55bbf242 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:24 -0700 Subject: memcg, slab: cleanup memcg cache creation This patch cleans up the memcg cache creation path as follows: - Move memcg cache name creation to a separate function to be called from kmem_cache_create_memcg(). This allows us to get rid of the mutex protecting the temporary buffer used for the name formatting, because the whole cache creation path is protected by the slab_mutex. - Get rid of memcg_create_kmem_cache(). This function serves as a proxy to kmem_cache_create_memcg(). After separating the cache name creation path, it would be reduced to a function call, so let's inline it. 
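[Editorial aside on the locking argument above: the formatting buffer no longer needs a private mutex because every caller of the new helper already holds slab_mutex, which lockdep can document directly. Below is a minimal sketch of that pattern, not the patch itself; the lock and function names (creation_mutex, format_cache_name) are placeholders, and the real helper, memcg_create_cache_name(), is the one added in the memcontrol.c hunk further down.]

#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/slab.h>

/* Stand-in for slab_mutex in this sketch; every caller must hold it. */
static DEFINE_MUTEX(creation_mutex);

static char *format_cache_name(const char *root_name, int memcg_id,
			       struct cgroup *cgrp)
{
	static char *buf;	/* shared scratch for cgroup_name() */

	/* No private lock for "buf": assert the caller's lock instead. */
	lockdep_assert_held(&creation_mutex);

	if (!buf) {
		buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
		if (!buf)
			return NULL;
	}
	cgroup_name(cgrp, buf, NAME_MAX + 1);
	/* yields e.g. "kmalloc-64(3:foo)" for memcg id 3 in a cgroup "foo" */
	return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_name, memcg_id, buf);
}
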
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++ mm/memcontrol.c | 89 ++++++++++++++++++++-------------------------- mm/slab_common.c | 5 ++- 3 files changed, 52 insertions(+), 51 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96f3fc87ab96..ab7f02884983 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -491,6 +491,9 @@ void __memcg_kmem_commit_charge(struct page *page, void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); + +char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache); int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache); void memcg_free_cache_params(struct kmem_cache *s); @@ -635,6 +638,12 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } +static inline char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) +{ + return NULL; +} + static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e33b1d09eb1f..32c7342df4bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3094,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } +char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) +{ + static char *buf = NULL; + + /* + * We need a mutex here to protect the shared buffer. Since this is + * expected to be called only on cache creation, we can employ the + * slab_mutex for that purpose. + */ + lockdep_assert_held(&slab_mutex); + + if (!buf) { + buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!buf) + return NULL; + } + + cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); + return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), buf); +} + int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3298,46 +3321,6 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *s) -{ - struct kmem_cache *new = NULL; - static char *tmp_path = NULL, *tmp_name = NULL; - static DEFINE_MUTEX(mutex); /* protects tmp_name */ - - BUG_ON(!memcg_can_account_kmem(memcg)); - - mutex_lock(&mutex); - /* - * kmem_cache_create_memcg duplicates the given name and - * cgroup_name for this name requires RCU context. - * This static temporary buffer is used to prevent from - * pointless shortliving allocation. 
- */ - if (!tmp_path || !tmp_name) { - if (!tmp_path) - tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!tmp_name) - tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmp_path || !tmp_name) - goto out; - } - - cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); - snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), tmp_name); - - new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, - (s->flags & ~SLAB_PANIC), s->ctor, s); - if (new) - new->allocflags |= __GFP_KMEMCG; - else - new = s; -out: - mutex_unlock(&mutex); - return new; -} - void kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; @@ -3384,12 +3367,6 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) mutex_unlock(&activate_kmem_mutex); } -struct create_work { - struct mem_cgroup *memcg; - struct kmem_cache *cachep; - struct work_struct work; -}; - static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; @@ -3407,13 +3384,25 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) mutex_unlock(&memcg->slab_caches_mutex); } +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + static void memcg_create_cache_work_func(struct work_struct *w) { - struct create_work *cw; + struct create_work *cw = container_of(w, struct create_work, work); + struct mem_cgroup *memcg = cw->memcg; + struct kmem_cache *cachep = cw->cachep; + struct kmem_cache *new; - cw = container_of(w, struct create_work, work); - memcg_create_kmem_cache(cw->memcg, cw->cachep); - css_put(&cw->memcg->css); + new = kmem_cache_create_memcg(memcg, cachep->name, + cachep->object_size, cachep->align, + cachep->flags & ~SLAB_PANIC, cachep->ctor, cachep); + if (new) + new->allocflags |= __GFP_KMEMCG; + css_put(&memcg->css); kfree(cw); } diff --git a/mm/slab_common.c b/mm/slab_common.c index e77b51eb7347..11857abf7057 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -215,7 +215,10 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, s->align = calculate_alignment(flags, align, size); s->ctor = ctor; - s->name = kstrdup(name, GFP_KERNEL); + if (memcg) + s->name = memcg_create_cache_name(memcg, parent_cache); + else + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) goto out_free_cache; -- cgit From 794b1248be4e7e157f5535c3ee49168aa4643349 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:26 -0700 Subject: memcg, slab: separate memcg vs root cache creation paths Memcg-awareness turned kmem_cache_create() into a dirty interweaving of memcg-only and except-for-memcg calls. To clean this up, let's move the code responsible for memcg cache creation to a separate function. 
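[Editorial aside to make the split concrete: a rough caller-side sketch. The struct, cache name and init function are invented for illustration; only the kmem_cache_create() and kmem_cache_create_memcg() prototypes come from the slab.h hunk below. Root caches keep the familiar interface, while per-memcg children are produced internally, scheduled from memcg_kmem_get_cache() on first allocation and carried out by the work item shown in the memcontrol.c hunk.]

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int a;
	int b;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	/* Root cache: plain kmem_cache_create(), no memcg arguments left. */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;

	/*
	 * Nothing memcg-specific to do here: when a task in a kmem-accounted
	 * cgroup first allocates from foo_cache, the slab code itself calls
	 * kmem_cache_create_memcg(memcg, foo_cache) from a work item.
	 */
	return 0;
}
module_init(foo_init);

MODULE_LICENSE("GPL");
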
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 -- include/linux/slab.h | 6 +- mm/memcontrol.c | 7 +- mm/slab_common.c | 187 ++++++++++++++++++++++++++------------------- 4 files changed, 111 insertions(+), 95 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab7f02884983..02d3072841e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -638,12 +638,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } -static inline char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - return NULL; -} - static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { diff --git a/include/linux/slab.h b/include/linux/slab.h index b5b2df60299e..3dd389aa91c7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -115,9 +115,9 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); -struct kmem_cache * -kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, - unsigned long, void (*)(void *), struct kmem_cache *); +#ifdef CONFIG_MEMCG_KMEM +void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *); +#endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32c7342df4bf..451523c3bd4e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3395,13 +3395,8 @@ static void memcg_create_cache_work_func(struct work_struct *w) struct create_work *cw = container_of(w, struct create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - struct kmem_cache *new; - new = kmem_cache_create_memcg(memcg, cachep->name, - cachep->object_size, cachep->align, - cachep->flags & ~SLAB_PANIC, cachep->ctor, cachep); - if (new) - new->allocflags |= __GFP_KMEMCG; + kmem_cache_create_memcg(memcg, cachep); css_put(&memcg->css); kfree(cw); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 11857abf7057..ccc012f00126 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, - size_t size) +static int kmem_cache_sanity_check(const char *name, size_t size) { struct kmem_cache *s = NULL; @@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, } #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) - /* - * For simplicity, we won't check this in the list of memcg - * caches. We have control over memcg naming, and if there - * aren't duplicates in the global list, there won't be any - * duplicates in the memcg lists as well. 
- */ - if (!memcg && !strcmp(s->name, name)) { + if (!strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, return 0; } #else -static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, - const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, size_t size) { return 0; } @@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static struct kmem_cache * +do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->object_size = object_size; + s->size = size; + s->align = align; + s->ctor = ctor; + + err = memcg_alloc_cache_params(memcg, s, root_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); + memcg_register_cache(s); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + memcg_free_cache_params(s); + kfree(s); + goto out; +} /* * kmem_cache_create - Create a cache. @@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ - struct kmem_cache * -kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct kmem_cache *parent_cache) +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s = NULL; + struct kmem_cache *s; + char *cache_name; int err; get_online_cpus(); mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(memcg, name, size); + err = kmem_cache_sanity_check(name, size); if (err) goto out_unlock; - if (memcg) { - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can - * try to create the same cache, but only one of them may - * succeed. Therefore if we get here and see the cache has - * already been created, we silently return NULL. - */ - if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) - goto out_unlock; - } - /* * Some allocators will constraint the set of valid flags to a subset * of all flags. 
We expect them to define CACHE_CREATE_MASK in this @@ -200,55 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, */ flags &= CACHE_CREATE_MASK; - if (!memcg) { - s = __kmem_cache_alias(name, size, align, flags, ctor); - if (s) - goto out_unlock; - } - - err = -ENOMEM; - s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); - if (!s) + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) goto out_unlock; - s->object_size = s->size = size; - s->align = calculate_alignment(flags, align, size); - s->ctor = ctor; - - if (memcg) - s->name = memcg_create_cache_name(memcg, parent_cache); - else - s->name = kstrdup(name, GFP_KERNEL); - if (!s->name) - goto out_free_cache; - - err = memcg_alloc_cache_params(memcg, s, parent_cache); - if (err) - goto out_free_cache; - - err = __kmem_cache_create(s, flags); - if (err) - goto out_free_cache; + cache_name = kstrdup(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; + goto out_unlock; + } - s->refcount = 1; - list_add(&s->list, &slab_caches); - memcg_register_cache(s); + s = do_kmem_cache_create(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree(cache_name); + } out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); if (err) { - /* - * There is no point in flooding logs with warnings or - * especially crashing the system if we fail to create a cache - * for a memcg. In this case we will be accounting the memcg - * allocation to the root cgroup until we succeed to create its - * own cache, but it isn't that critical. - */ - if (!memcg) - return NULL; - if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); @@ -260,21 +253,55 @@ out_unlock: return NULL; } return s; - -out_free_cache: - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - goto out_unlock; } +EXPORT_SYMBOL(kmem_cache_create); -struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +#ifdef CONFIG_MEMCG_KMEM +/* + * kmem_cache_create_memcg - Create a cache for a memory cgroup. + * @memcg: The memory cgroup the new cache is for. + * @root_cache: The parent of the new cache. + * + * This function attempts to create a kmem cache that will serve allocation + * requests going from @memcg to @root_cache. The new cache inherits properties + * from its parent. + */ +void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); + struct kmem_cache *s; + char *cache_name; + + get_online_cpus(); + mutex_lock(&slab_mutex); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. 
+ */ + if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) + goto out_unlock; + + cache_name = memcg_create_cache_name(memcg, root_cache); + if (!cache_name) + goto out_unlock; + + s = do_kmem_cache_create(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); + if (IS_ERR(s)) { + kfree(cache_name); + goto out_unlock; + } + + s->allocflags |= __GFP_KMEMCG; + +out_unlock: + mutex_unlock(&slab_mutex); + put_online_cpus(); } -EXPORT_SYMBOL(kmem_cache_create); +#endif /* CONFIG_MEMCG_KMEM */ void kmem_cache_destroy(struct kmem_cache *s) { -- cgit From 051dd46050f2a9bdfff8cc067f8987069eae1743 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:27 -0700 Subject: memcg, slab: unregister cache from memcg before starting to destroy it Currently, memcg_unregister_cache(), which deletes the cache being destroyed from the memcg_slab_caches list, is called after __kmem_cache_shutdown() (see kmem_cache_destroy()), which starts to destroy the cache. As a result, one can access a partially destroyed cache while traversing a memcg_slab_caches list, which can have deadly consequences (for instance, cache_show() called for each cache on a memcg_slab_caches list from mem_cgroup_slabinfo_read() will dereference pointers to already freed data). To fix this, let's move memcg_unregister_cache() before the cache destruction process beginning, issuing memcg_register_cache() on failure. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++++++------ mm/slab_common.c | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 451523c3bd4e..c22d8bf42d9a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3140,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, s->memcg_params->root_cache = root_cache; INIT_WORK(&s->memcg_params->destroy, kmem_cache_destroy_work_func); + css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3148,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, void memcg_free_cache_params(struct kmem_cache *s) { + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + css_put(&s->memcg_params->memcg->css); kfree(s->memcg_params); } @@ -3170,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s) memcg = s->memcg_params->memcg; id = memcg_cache_id(memcg); - css_get(&memcg->css); - - /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -3221,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s) * after removing it from the memcg_slab_caches list, otherwise we can * fail to convert memcg_params_to_cache() while traversing the list. 
*/ - VM_BUG_ON(!root->memcg_params->memcg_caches[id]); + VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); root->memcg_params->memcg_caches[id] = NULL; - - css_put(&memcg->css); } /* diff --git a/mm/slab_common.c b/mm/slab_common.c index ccc012f00126..0c2879ff414c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -313,9 +313,9 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + memcg_unregister_cache(s); if (!__kmem_cache_shutdown(s)) { - memcg_unregister_cache(s); mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); @@ -325,6 +325,7 @@ void kmem_cache_destroy(struct kmem_cache *s) kmem_cache_free(kmem_cache, s); } else { list_add(&s->list, &slab_caches); + memcg_register_cache(s); mutex_unlock(&slab_mutex); printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", s->name); -- cgit From b8529907ba35d625fa4b85d3e4dc8021be97c1f3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:28 -0700 Subject: memcg, slab: do not destroy children caches if parent has aliases Currently we destroy children caches at the very beginning of kmem_cache_destroy(). This is wrong, because the root cache will not necessarily be destroyed in the end - if it has aliases (refcount > 0), kmem_cache_destroy() will simply decrement its refcount and return. In this case, at best we will get a bunch of warnings in dmesg, like this one: kmem_cache_destroy kmalloc-32:0: Slab cache still has objects CPU: 1 PID: 7139 Comm: modprobe Tainted: G B W 3.13.0+ #117 Call Trace: dump_stack+0x49/0x5b kmem_cache_destroy+0xdf/0xf0 kmem_cache_destroy_memcg_children+0x97/0xc0 kmem_cache_destroy+0xf/0xf0 xfs_mru_cache_uninit+0x21/0x30 [xfs] exit_xfs_fs+0x2e/0xc44 [xfs] SyS_delete_module+0x198/0x1f0 system_call_fastpath+0x16/0x1b At worst - if kmem_cache_destroy() will race with an allocation from a memcg cache - the kernel will panic. This patch fixes this by moving children caches destruction after the check if the cache has aliases. Plus, it forbids destroying a root cache if it still has children caches, because each children cache keeps a reference to its parent. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +--- mm/memcontrol.c | 13 ++++---- mm/slab_common.c | 75 ++++++++++++++++++++++++++++++---------------- 3 files changed, 57 insertions(+), 37 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 02d3072841e9..b569b8be5c5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -507,7 +507,7 @@ struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); void mem_cgroup_destroy_cache(struct kmem_cache *cachep); -void kmem_cache_destroy_memcg_children(struct kmem_cache *s); +int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. 
@@ -661,10 +661,6 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { return cachep; } - -static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) -{ -} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c22d8bf42d9a..29501f040568 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3321,15 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; - int i; - - if (!s->memcg_params) - return; - if (!s->memcg_params->is_root_cache) - return; + int i, failed = 0; /* * If the cache is being destroyed, we trust that there is no one else @@ -3363,8 +3358,12 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) c->memcg_params->dead = false; cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); + + if (cache_from_memcg_idx(s, i)) + failed++; } mutex_unlock(&activate_kmem_mutex); + return failed; } static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) diff --git a/mm/slab_common.c b/mm/slab_common.c index 0c2879ff414c..f3cfccf76dda 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -301,39 +301,64 @@ out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); } + +static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + int rc; + + if (!s->memcg_params || + !s->memcg_params->is_root_cache) + return 0; + + mutex_unlock(&slab_mutex); + rc = __kmem_cache_destroy_memcg_children(s); + mutex_lock(&slab_mutex); + + return rc; +} +#else +static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_MEMCG_KMEM */ void kmem_cache_destroy(struct kmem_cache *s) { - /* Destroy all the children caches if we aren't a memcg cache */ - kmem_cache_destroy_memcg_children(s); - get_online_cpus(); mutex_lock(&slab_mutex); + s->refcount--; - if (!s->refcount) { - list_del(&s->list); - memcg_unregister_cache(s); - - if (!__kmem_cache_shutdown(s)) { - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - } else { - list_add(&s->list, &slab_caches); - memcg_register_cache(s); - mutex_unlock(&slab_mutex); - printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", - s->name); - dump_stack(); - } - } else { - mutex_unlock(&slab_mutex); + if (s->refcount) + goto out_unlock; + + if (kmem_cache_destroy_memcg_children(s) != 0) + goto out_unlock; + + list_del(&s->list); + memcg_unregister_cache(s); + + if (__kmem_cache_shutdown(s) != 0) { + list_add(&s->list, &slab_caches); + memcg_register_cache(s); + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + goto out_unlock; } + + mutex_unlock(&slab_mutex); + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + memcg_free_cache_params(s); + kfree(s->name); + kmem_cache_free(kmem_cache, s); + goto out_put_cpus; + +out_unlock: + mutex_unlock(&slab_mutex); +out_put_cpus: put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit From 84d0ddd6b0e3187d85e609c2d10c36089cf0be04 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:29 -0700 Subject: slub: adjust memcg caches when creating cache alias Otherwise, kzalloc() called from a memcg won't clear the whole object. 
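[Editorial aside on the failure mode: with __GFP_ZERO, SLUB's allocation path zeroes object_size bytes, and __kmem_cache_alias() can grow a root cache's object_size after its memcg clones already exist. The sketch below illustrates the situation the patch guards against; the names and sizes are invented, with bar_cache standing for a root cache originally created for a smaller (say 24-byte) object and later aliased by this 32-byte user.]

#include <linux/slab.h>

/* Invented example: a 32-byte object whose kmem_cache_create() call was
 * aliased onto an existing root cache created for a smaller object with
 * compatible flags and alignment. */
struct bar {
	char payload[32];
};

static struct kmem_cache *bar_cache;	/* result of the aliased create */

static struct bar *bar_alloc(void)
{
	/*
	 * __kmem_cache_alias() bumped the root cache's object_size to 32,
	 * so this zeroes the whole object for root allocations.  If the
	 * caller sits in a kmem-accounted memcg, the request is served by
	 * a memcg clone instead; without the loop added above, a clone
	 * created before the alias kept object_size == 24, leaving the
	 * last 8 bytes uninitialised.
	 */
	return kmem_cache_zalloc(bar_cache, GFP_KERNEL);
}
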
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 7d81afb27048..33939e72bc37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3748,7 +3748,11 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s = find_mergeable(size, align, flags, name, ctor); if (s) { + int i; + struct kmem_cache *c; + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. @@ -3756,6 +3760,15 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; s = NULL; -- cgit From 9a41707bd3a0811919000daf094e9d50ea65f7da Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:31 -0700 Subject: slub: rework sysfs layout for memcg caches Currently, we try to arrange sysfs entries for memcg caches in the same manner as for global caches. Apart from turning /sys/kernel/slab into a mess when there are a lot of kmem-active memcgs created, it actually does not work properly - we won't create more than one link to a memcg cache in case its parent is merged with another cache. For instance, if A is a root cache merged with another root cache B, we will have the following sysfs setup: X A -> X B -> X where X is some unique id (see create_unique_id()). Now if memcgs M and N start to allocate from cache A (or B, which is the same), we will get: X X:M X:N A -> X B -> X A:M -> X:M A:N -> X:N Since B is an alias for A, we won't get entries B:M and B:N, which is confusing. It is more logical to have entries for memcg caches under the corresponding root cache's sysfs directory. This would allow us to keep sysfs layout clean, and avoid such inconsistencies like one described above. This patch does the trick. It creates a "cgroup" kset in each root cache kobject to keep its children caches there. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 +++ mm/slub.c | 26 +++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f56bfa9e4526..f2f7398848cf 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -87,6 +87,9 @@ struct kmem_cache { #ifdef CONFIG_MEMCG_KMEM struct memcg_cache_params *memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ +#ifdef CONFIG_SYSFS + struct kset *memcg_kset; +#endif #endif #ifdef CONFIG_NUMA diff --git a/mm/slub.c b/mm/slub.c index 33939e72bc37..3508edec19f9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5138,6 +5138,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -5203,7 +5212,7 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; + s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) { kobject_put(&s->kobj); @@ -5216,6 +5225,18 @@ static int sysfs_slab_add(struct kmem_cache *s) kobject_put(&s->kobj); return err; } + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + kobject_del(&s->kobj); + kobject_put(&s->kobj); + return -ENOMEM; + } + } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ @@ -5234,6 +5255,9 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); -- cgit From 54b6a731025f9528d44945a72b1f4e5946bb2d80 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 7 Apr 2014 15:39:32 -0700 Subject: slub: fix leak of 'name' in sysfs_slab_add The failure paths of sysfs_slab_add don't release the allocation of 'name' made by create_unique_id() a few lines above the context of the diff below. Create a common exit path to make it more obvious what needs freeing. 
[vdavydov@parallels.com: free the name only if !unmergeable] Signed-off-by: Dave Jones Signed-off-by: Vladimir Davydov Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3508edec19f9..e7451861f95d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5214,25 +5214,19 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) { - kobject_put(&s->kobj); - return err; - } + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; - } + if (err) + goto out_del_kobj; #ifdef CONFIG_MEMCG_KMEM if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return -ENOMEM; + err = -ENOMEM; + goto out_del_kobj; } } #endif @@ -5241,9 +5235,16 @@ static int sysfs_slab_add(struct kmem_cache *s) if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } static void sysfs_slab_remove(struct kmem_cache *s) -- cgit From b3ca1c10d7b32fdfdfaf5484eda486323f52d9be Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:34 -0700 Subject: percpu: add raw_cpu_ops The kernel has never been audited to ensure that this_cpu operations are consistently used throughout the kernel. The code generated in many places can be improved through the use of this_cpu operations (which uses a segment register for relocation of per cpu offsets instead of performing address calculations). The patch set also addresses various consistency issues in general with the per cpu macros. A. The semantics of __this_cpu_ptr() differs from this_cpu_ptr only because checks are skipped. This is typically shown through a raw_ prefix. So this patch set changes the places where __this_cpu_ptr() is used to raw_cpu_ptr(). B. There has been the long term wish by some that __this_cpu operations would check for preemption. However, there are cases where preemption checks need to be skipped. This patch set adds raw_cpu operations that do not check for preemption and then adds preemption checks to the __this_cpu operations. C. The use of __get_cpu_var is always a reference to a percpu variable that can also be handled via a this_cpu operation. This patch set replaces all uses of __get_cpu_var with this_cpu operations. D. We can then use this_cpu RMW operations in various places replacing sequences of instructions by a single one. E. The use of this_cpu operations throughout will allow other arches than x86 to implement optimized references and RMV operations to work with per cpu local data. F. The use of this_cpu operations opens up the possibility to further optimize code that relies on synchronization through per cpu data. The patch set works in a couple of stages: I. Patch 1 adds the additional raw_cpu operations and raw_cpu_ptr(). Also converts the existing __this_cpu_xx_# primitive in the x86 code to raw_cpu_xx_#. II. Patch 2-4 use the raw_cpu operations in places that would give us false positives once they are enabled. III. 
Patch 5 adds preemption checks to __this_cpu operations to allow checking if preemption is properly disabled when these functions are used. IV. Patches 6-20 are patches that simply replace uses of __get_cpu_var with this_cpu_ptr. They do not depend on any changes to the percpu code. No preemption tests are skipped if they are applied. V. Patches 21-46 are conversion patches that use this_cpu operations in various kernel subsystems/drivers or arch code. VI. Patches 47/48 (not included in this series) remove no longer used functions (__this_cpu_ptr and __get_cpu_var). These should only be applied after all the conversion patches have made it and after we have done additional passes through the kernel to ensure that none of the uses of these functions remain. This patch (of 46): The patches following this one will add preemption checks to __this_cpu ops so we need to have an alternative way to use this_cpu operations without preemption checks. raw_cpu_ops will be the basis for all other ops since these will be the operations that do not implement any checks. Primitive operations are renamed by this patch from __this_cpu_xxx to raw_cpu_xxxx. Also change the uses of the x86 percpu primitives in preempt.h. These depend directly on asm/percpu.h (header #include nesting issue). Signed-off-by: Peter Zijlstra Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Cc: Tejun Heo Cc: "James E.J. Bottomley" Cc: "Paul E. McKenney" Cc: Alex Shi Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bryan Wu Cc: Catalin Marinas Cc: Chris Metcalf Cc: Daniel Lezcano Cc: David Daney Cc: David Miller Cc: David S. Miller Cc: Dimitri Sivanich Cc: Dipankar Sarma Cc: Eric Dumazet Cc: Fenghua Yu Cc: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Hedi Berriche Cc: Heiko Carstens Cc: Helge Deller Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jens Axboe Cc: John Stultz Cc: Martin Schwidefsky Cc: Masami Hiramatsu Cc: Matt Turner Cc: Mike Frysinger Cc: Mike Travis Cc: Neil Brown Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Paul Mundt Cc: Rafael J. Wysocki Cc: Ralf Baechle Cc: Richard Henderson Cc: Robert Richter Cc: Russell King Cc: Russell King Cc: Rusty Russell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Cc: Wim Van Sebroeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/percpu.h | 98 ++++++------ arch/x86/include/asm/preempt.h | 16 +- include/asm-generic/percpu.h | 13 +- include/linux/percpu.h | 331 ++++++++++++++++++++++++----------------- 4 files changed, 260 insertions(+), 198 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 94220d14d5cc..851bcdc5db04 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -52,7 +52,7 @@ * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. 
*/ -#define __this_cpu_ptr(ptr) \ +#define raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ __verify_pcpu_ptr(ptr); \ @@ -362,25 +362,25 @@ do { \ */ #define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) -#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) - -#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) + +#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -401,16 +401,16 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define 
raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) @@ -427,7 +427,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ @@ -436,22 +436,22 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) /* @@ -474,7 +474,7 @@ do { \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #define this_cpu_cmpxchg_double_8 
percpu_cmpxchg16b_double #endif @@ -495,9 +495,9 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; #ifdef CONFIG_X86_64 - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; #else - return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; + return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0; #endif } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index c8b051933b1b..7024c12f7bfe 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -19,12 +19,12 @@ DECLARE_PER_CPU(int, __preempt_count); */ static __always_inline int preempt_count(void) { - return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { - __this_cpu_write_4(__preempt_count, pc); + raw_cpu_write_4(__preempt_count, pc); } /* @@ -53,17 +53,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -72,12 +72,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - __this_cpu_add_4(__preempt_count, val); + raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - __this_cpu_add_4(__preempt_count, -val); + raw_cpu_add_4(__preempt_count, -val); } /* @@ -95,7 +95,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(void) { - return unlikely(!__this_cpu_read_4(__preempt_count)); + return unlikely(!raw_cpu_read_4(__preempt_count)); } #ifdef CONFIG_PREEMPT diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index d17784ea37ff..0703aa75b5e8 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -56,17 +56,17 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define per_cpu(var, cpu) \ (*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu))) -#ifndef __this_cpu_ptr -#define __this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset) +#ifndef raw_cpu_ptr +#define raw_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset) #endif #ifdef CONFIG_DEBUG_PREEMPT #define this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, my_cpu_offset) #else -#define this_cpu_ptr(ptr) __this_cpu_ptr(ptr) +#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr) #endif #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) -#define __raw_get_cpu_var(var) (*__this_cpu_ptr(&(var))) +#define __raw_get_cpu_var(var) (*raw_cpu_ptr(&(var))) #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void setup_per_cpu_areas(void); @@ -83,7 +83,7 @@ extern void setup_per_cpu_areas(void); #define __get_cpu_var(var) (*VERIFY_PERCPU_PTR(&(var))) #define __raw_get_cpu_var(var) (*VERIFY_PERCPU_PTR(&(var))) #define this_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) -#define 
__this_cpu_ptr(ptr) this_cpu_ptr(ptr) +#define raw_cpu_ptr(ptr) this_cpu_ptr(ptr) #endif /* SMP */ @@ -122,4 +122,7 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif +/* Keep until we have removed all uses of __this_cpu_ptr */ +#define __this_cpu_ptr raw_cpu_ptr + #endif /* _ASM_GENERIC_PERCPU_H_ */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e3817d2441b6..4e4d2afcc0c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -243,6 +243,8 @@ do { \ } while (0) /* + * this_cpu operations (C) 2008-2013 Christoph Lameter + * * Optimized manipulation for memory allocated through the per cpu * allocator or for addresses of per cpu variables. * @@ -296,7 +298,7 @@ do { \ do { \ unsigned long flags; \ raw_local_irq_save(flags); \ - *__this_cpu_ptr(&(pcp)) op val; \ + *raw_cpu_ptr(&(pcp)) op val; \ raw_local_irq_restore(flags); \ } while (0) @@ -381,8 +383,8 @@ do { \ typeof(pcp) ret__; \ unsigned long flags; \ raw_local_irq_save(flags); \ - __this_cpu_add(pcp, val); \ - ret__ = __this_cpu_read(pcp); \ + raw_cpu_add(pcp, val); \ + ret__ = raw_cpu_read(pcp); \ raw_local_irq_restore(flags); \ ret__; \ }) @@ -411,8 +413,8 @@ do { \ ({ typeof(pcp) ret__; \ unsigned long flags; \ raw_local_irq_save(flags); \ - ret__ = __this_cpu_read(pcp); \ - __this_cpu_write(pcp, nval); \ + ret__ = raw_cpu_read(pcp); \ + raw_cpu_write(pcp, nval); \ raw_local_irq_restore(flags); \ ret__; \ }) @@ -439,9 +441,9 @@ do { \ typeof(pcp) ret__; \ unsigned long flags; \ raw_local_irq_save(flags); \ - ret__ = __this_cpu_read(pcp); \ + ret__ = raw_cpu_read(pcp); \ if (ret__ == (oval)) \ - __this_cpu_write(pcp, nval); \ + raw_cpu_write(pcp, nval); \ raw_local_irq_restore(flags); \ ret__; \ }) @@ -476,7 +478,7 @@ do { \ int ret__; \ unsigned long flags; \ raw_local_irq_save(flags); \ - ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ + ret__ = raw_cpu_generic_cmpxchg_double(pcp1, pcp2, \ oval1, oval2, nval1, nval2); \ raw_local_irq_restore(flags); \ ret__; \ @@ -504,12 +506,8 @@ do { \ #endif /* - * Generic percpu operations for context that are safe from preemption/interrupts. - * Either we do not care about races or the caller has the - * responsibility of handling preemption/interrupt issues. Arch code can still - * override these instructions since the arch per cpu code may be more - * efficient and may actually get race freeness for free (that is the - * case for x86 for example). + * Generic percpu operations for contexts where we do not want to do + * any checks for preemptiosn. * * If there is no other protection through preempt disable and/or * disabling interupts then one of these RMW operations can show unexpected @@ -517,211 +515,272 @@ do { \ * or an interrupt occurred and the same percpu variable was modified from * the interrupt context. 
*/ -#ifndef __this_cpu_read -# ifndef __this_cpu_read_1 -# define __this_cpu_read_1(pcp) (*__this_cpu_ptr(&(pcp))) +#ifndef raw_cpu_read +# ifndef raw_cpu_read_1 +# define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) # endif -# ifndef __this_cpu_read_2 -# define __this_cpu_read_2(pcp) (*__this_cpu_ptr(&(pcp))) +# ifndef raw_cpu_read_2 +# define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) # endif -# ifndef __this_cpu_read_4 -# define __this_cpu_read_4(pcp) (*__this_cpu_ptr(&(pcp))) +# ifndef raw_cpu_read_4 +# define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) # endif -# ifndef __this_cpu_read_8 -# define __this_cpu_read_8(pcp) (*__this_cpu_ptr(&(pcp))) +# ifndef raw_cpu_read_8 +# define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) # endif -# define __this_cpu_read(pcp) __pcpu_size_call_return(__this_cpu_read_, (pcp)) +# define raw_cpu_read(pcp) __pcpu_size_call_return(raw_cpu_read_, (pcp)) #endif -#define __this_cpu_generic_to_op(pcp, val, op) \ +#define raw_cpu_generic_to_op(pcp, val, op) \ do { \ - *__this_cpu_ptr(&(pcp)) op val; \ + *raw_cpu_ptr(&(pcp)) op val; \ } while (0) -#ifndef __this_cpu_write -# ifndef __this_cpu_write_1 -# define __this_cpu_write_1(pcp, val) __this_cpu_generic_to_op((pcp), (val), =) + +#ifndef raw_cpu_write +# ifndef raw_cpu_write_1 +# define raw_cpu_write_1(pcp, val) raw_cpu_generic_to_op((pcp), (val), =) # endif -# ifndef __this_cpu_write_2 -# define __this_cpu_write_2(pcp, val) __this_cpu_generic_to_op((pcp), (val), =) +# ifndef raw_cpu_write_2 +# define raw_cpu_write_2(pcp, val) raw_cpu_generic_to_op((pcp), (val), =) # endif -# ifndef __this_cpu_write_4 -# define __this_cpu_write_4(pcp, val) __this_cpu_generic_to_op((pcp), (val), =) +# ifndef raw_cpu_write_4 +# define raw_cpu_write_4(pcp, val) raw_cpu_generic_to_op((pcp), (val), =) # endif -# ifndef __this_cpu_write_8 -# define __this_cpu_write_8(pcp, val) __this_cpu_generic_to_op((pcp), (val), =) +# ifndef raw_cpu_write_8 +# define raw_cpu_write_8(pcp, val) raw_cpu_generic_to_op((pcp), (val), =) # endif -# define __this_cpu_write(pcp, val) __pcpu_size_call(__this_cpu_write_, (pcp), (val)) +# define raw_cpu_write(pcp, val) __pcpu_size_call(raw_cpu_write_, (pcp), (val)) #endif -#ifndef __this_cpu_add -# ifndef __this_cpu_add_1 -# define __this_cpu_add_1(pcp, val) __this_cpu_generic_to_op((pcp), (val), +=) +#ifndef raw_cpu_add +# ifndef raw_cpu_add_1 +# define raw_cpu_add_1(pcp, val) raw_cpu_generic_to_op((pcp), (val), +=) # endif -# ifndef __this_cpu_add_2 -# define __this_cpu_add_2(pcp, val) __this_cpu_generic_to_op((pcp), (val), +=) +# ifndef raw_cpu_add_2 +# define raw_cpu_add_2(pcp, val) raw_cpu_generic_to_op((pcp), (val), +=) # endif -# ifndef __this_cpu_add_4 -# define __this_cpu_add_4(pcp, val) __this_cpu_generic_to_op((pcp), (val), +=) +# ifndef raw_cpu_add_4 +# define raw_cpu_add_4(pcp, val) raw_cpu_generic_to_op((pcp), (val), +=) # endif -# ifndef __this_cpu_add_8 -# define __this_cpu_add_8(pcp, val) __this_cpu_generic_to_op((pcp), (val), +=) +# ifndef raw_cpu_add_8 +# define raw_cpu_add_8(pcp, val) raw_cpu_generic_to_op((pcp), (val), +=) # endif -# define __this_cpu_add(pcp, val) __pcpu_size_call(__this_cpu_add_, (pcp), (val)) +# define raw_cpu_add(pcp, val) __pcpu_size_call(raw_cpu_add_, (pcp), (val)) #endif -#ifndef __this_cpu_sub -# define __this_cpu_sub(pcp, val) __this_cpu_add((pcp), -(typeof(pcp))(val)) +#ifndef raw_cpu_sub +# define raw_cpu_sub(pcp, val) raw_cpu_add((pcp), -(val)) #endif -#ifndef __this_cpu_inc -# define __this_cpu_inc(pcp) __this_cpu_add((pcp), 1) +#ifndef raw_cpu_inc +# define 
raw_cpu_inc(pcp) raw_cpu_add((pcp), 1) #endif -#ifndef __this_cpu_dec -# define __this_cpu_dec(pcp) __this_cpu_sub((pcp), 1) +#ifndef raw_cpu_dec +# define raw_cpu_dec(pcp) raw_cpu_sub((pcp), 1) #endif -#ifndef __this_cpu_and -# ifndef __this_cpu_and_1 -# define __this_cpu_and_1(pcp, val) __this_cpu_generic_to_op((pcp), (val), &=) +#ifndef raw_cpu_and +# ifndef raw_cpu_and_1 +# define raw_cpu_and_1(pcp, val) raw_cpu_generic_to_op((pcp), (val), &=) # endif -# ifndef __this_cpu_and_2 -# define __this_cpu_and_2(pcp, val) __this_cpu_generic_to_op((pcp), (val), &=) +# ifndef raw_cpu_and_2 +# define raw_cpu_and_2(pcp, val) raw_cpu_generic_to_op((pcp), (val), &=) # endif -# ifndef __this_cpu_and_4 -# define __this_cpu_and_4(pcp, val) __this_cpu_generic_to_op((pcp), (val), &=) +# ifndef raw_cpu_and_4 +# define raw_cpu_and_4(pcp, val) raw_cpu_generic_to_op((pcp), (val), &=) # endif -# ifndef __this_cpu_and_8 -# define __this_cpu_and_8(pcp, val) __this_cpu_generic_to_op((pcp), (val), &=) +# ifndef raw_cpu_and_8 +# define raw_cpu_and_8(pcp, val) raw_cpu_generic_to_op((pcp), (val), &=) # endif -# define __this_cpu_and(pcp, val) __pcpu_size_call(__this_cpu_and_, (pcp), (val)) +# define raw_cpu_and(pcp, val) __pcpu_size_call(raw_cpu_and_, (pcp), (val)) #endif -#ifndef __this_cpu_or -# ifndef __this_cpu_or_1 -# define __this_cpu_or_1(pcp, val) __this_cpu_generic_to_op((pcp), (val), |=) +#ifndef raw_cpu_or +# ifndef raw_cpu_or_1 +# define raw_cpu_or_1(pcp, val) raw_cpu_generic_to_op((pcp), (val), |=) # endif -# ifndef __this_cpu_or_2 -# define __this_cpu_or_2(pcp, val) __this_cpu_generic_to_op((pcp), (val), |=) +# ifndef raw_cpu_or_2 +# define raw_cpu_or_2(pcp, val) raw_cpu_generic_to_op((pcp), (val), |=) # endif -# ifndef __this_cpu_or_4 -# define __this_cpu_or_4(pcp, val) __this_cpu_generic_to_op((pcp), (val), |=) +# ifndef raw_cpu_or_4 +# define raw_cpu_or_4(pcp, val) raw_cpu_generic_to_op((pcp), (val), |=) # endif -# ifndef __this_cpu_or_8 -# define __this_cpu_or_8(pcp, val) __this_cpu_generic_to_op((pcp), (val), |=) +# ifndef raw_cpu_or_8 +# define raw_cpu_or_8(pcp, val) raw_cpu_generic_to_op((pcp), (val), |=) # endif -# define __this_cpu_or(pcp, val) __pcpu_size_call(__this_cpu_or_, (pcp), (val)) +# define raw_cpu_or(pcp, val) __pcpu_size_call(raw_cpu_or_, (pcp), (val)) #endif -#define __this_cpu_generic_add_return(pcp, val) \ +#define raw_cpu_generic_add_return(pcp, val) \ ({ \ - __this_cpu_add(pcp, val); \ - __this_cpu_read(pcp); \ + raw_cpu_add(pcp, val); \ + raw_cpu_read(pcp); \ }) -#ifndef __this_cpu_add_return -# ifndef __this_cpu_add_return_1 -# define __this_cpu_add_return_1(pcp, val) __this_cpu_generic_add_return(pcp, val) +#ifndef raw_cpu_add_return +# ifndef raw_cpu_add_return_1 +# define raw_cpu_add_return_1(pcp, val) raw_cpu_generic_add_return(pcp, val) # endif -# ifndef __this_cpu_add_return_2 -# define __this_cpu_add_return_2(pcp, val) __this_cpu_generic_add_return(pcp, val) +# ifndef raw_cpu_add_return_2 +# define raw_cpu_add_return_2(pcp, val) raw_cpu_generic_add_return(pcp, val) # endif -# ifndef __this_cpu_add_return_4 -# define __this_cpu_add_return_4(pcp, val) __this_cpu_generic_add_return(pcp, val) +# ifndef raw_cpu_add_return_4 +# define raw_cpu_add_return_4(pcp, val) raw_cpu_generic_add_return(pcp, val) # endif -# ifndef __this_cpu_add_return_8 -# define __this_cpu_add_return_8(pcp, val) __this_cpu_generic_add_return(pcp, val) +# ifndef raw_cpu_add_return_8 +# define raw_cpu_add_return_8(pcp, val) raw_cpu_generic_add_return(pcp, val) # endif -# define 
__this_cpu_add_return(pcp, val) \ - __pcpu_size_call_return2(__this_cpu_add_return_, pcp, val) +# define raw_cpu_add_return(pcp, val) \ + __pcpu_size_call_return2(raw_add_return_, pcp, val) #endif -#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(typeof(pcp))(val)) -#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1) -#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1) +#define raw_cpu_sub_return(pcp, val) raw_cpu_add_return(pcp, -(typeof(pcp))(val)) +#define raw_cpu_inc_return(pcp) raw_cpu_add_return(pcp, 1) +#define raw_cpu_dec_return(pcp) raw_cpu_add_return(pcp, -1) -#define __this_cpu_generic_xchg(pcp, nval) \ +#define raw_cpu_generic_xchg(pcp, nval) \ ({ typeof(pcp) ret__; \ - ret__ = __this_cpu_read(pcp); \ - __this_cpu_write(pcp, nval); \ + ret__ = raw_cpu_read(pcp); \ + raw_cpu_write(pcp, nval); \ ret__; \ }) -#ifndef __this_cpu_xchg -# ifndef __this_cpu_xchg_1 -# define __this_cpu_xchg_1(pcp, nval) __this_cpu_generic_xchg(pcp, nval) +#ifndef raw_cpu_xchg +# ifndef raw_cpu_xchg_1 +# define raw_cpu_xchg_1(pcp, nval) raw_cpu_generic_xchg(pcp, nval) # endif -# ifndef __this_cpu_xchg_2 -# define __this_cpu_xchg_2(pcp, nval) __this_cpu_generic_xchg(pcp, nval) +# ifndef raw_cpu_xchg_2 +# define raw_cpu_xchg_2(pcp, nval) raw_cpu_generic_xchg(pcp, nval) # endif -# ifndef __this_cpu_xchg_4 -# define __this_cpu_xchg_4(pcp, nval) __this_cpu_generic_xchg(pcp, nval) +# ifndef raw_cpu_xchg_4 +# define raw_cpu_xchg_4(pcp, nval) raw_cpu_generic_xchg(pcp, nval) # endif -# ifndef __this_cpu_xchg_8 -# define __this_cpu_xchg_8(pcp, nval) __this_cpu_generic_xchg(pcp, nval) +# ifndef raw_cpu_xchg_8 +# define raw_cpu_xchg_8(pcp, nval) raw_cpu_generic_xchg(pcp, nval) # endif -# define __this_cpu_xchg(pcp, nval) \ - __pcpu_size_call_return2(__this_cpu_xchg_, (pcp), nval) +# define raw_cpu_xchg(pcp, nval) \ + __pcpu_size_call_return2(raw_cpu_xchg_, (pcp), nval) #endif -#define __this_cpu_generic_cmpxchg(pcp, oval, nval) \ +#define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ typeof(pcp) ret__; \ - ret__ = __this_cpu_read(pcp); \ + ret__ = raw_cpu_read(pcp); \ if (ret__ == (oval)) \ - __this_cpu_write(pcp, nval); \ + raw_cpu_write(pcp, nval); \ ret__; \ }) -#ifndef __this_cpu_cmpxchg -# ifndef __this_cpu_cmpxchg_1 -# define __this_cpu_cmpxchg_1(pcp, oval, nval) __this_cpu_generic_cmpxchg(pcp, oval, nval) +#ifndef raw_cpu_cmpxchg +# ifndef raw_cpu_cmpxchg_1 +# define raw_cpu_cmpxchg_1(pcp, oval, nval) raw_cpu_generic_cmpxchg(pcp, oval, nval) # endif -# ifndef __this_cpu_cmpxchg_2 -# define __this_cpu_cmpxchg_2(pcp, oval, nval) __this_cpu_generic_cmpxchg(pcp, oval, nval) +# ifndef raw_cpu_cmpxchg_2 +# define raw_cpu_cmpxchg_2(pcp, oval, nval) raw_cpu_generic_cmpxchg(pcp, oval, nval) # endif -# ifndef __this_cpu_cmpxchg_4 -# define __this_cpu_cmpxchg_4(pcp, oval, nval) __this_cpu_generic_cmpxchg(pcp, oval, nval) +# ifndef raw_cpu_cmpxchg_4 +# define raw_cpu_cmpxchg_4(pcp, oval, nval) raw_cpu_generic_cmpxchg(pcp, oval, nval) # endif -# ifndef __this_cpu_cmpxchg_8 -# define __this_cpu_cmpxchg_8(pcp, oval, nval) __this_cpu_generic_cmpxchg(pcp, oval, nval) +# ifndef raw_cpu_cmpxchg_8 +# define raw_cpu_cmpxchg_8(pcp, oval, nval) raw_cpu_generic_cmpxchg(pcp, oval, nval) # endif -# define __this_cpu_cmpxchg(pcp, oval, nval) \ - __pcpu_size_call_return2(__this_cpu_cmpxchg_, pcp, oval, nval) +# define raw_cpu_cmpxchg(pcp, oval, nval) \ + __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval) #endif -#define __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, 
nval1, nval2) \ +#define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ ({ \ int __ret = 0; \ - if (__this_cpu_read(pcp1) == (oval1) && \ - __this_cpu_read(pcp2) == (oval2)) { \ - __this_cpu_write(pcp1, (nval1)); \ - __this_cpu_write(pcp2, (nval2)); \ + if (raw_cpu_read(pcp1) == (oval1) && \ + raw_cpu_read(pcp2) == (oval2)) { \ + raw_cpu_write(pcp1, (nval1)); \ + raw_cpu_write(pcp2, (nval2)); \ __ret = 1; \ } \ (__ret); \ }) -#ifndef __this_cpu_cmpxchg_double -# ifndef __this_cpu_cmpxchg_double_1 -# define __this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +#ifndef raw_cpu_cmpxchg_double +# ifndef raw_cpu_cmpxchg_double_1 +# define raw_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) # endif -# ifndef __this_cpu_cmpxchg_double_2 -# define __this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# ifndef raw_cpu_cmpxchg_double_2 +# define raw_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) # endif -# ifndef __this_cpu_cmpxchg_double_4 -# define __this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# ifndef raw_cpu_cmpxchg_double_4 +# define raw_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) # endif -# ifndef __this_cpu_cmpxchg_double_8 -# define __this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# ifndef raw_cpu_cmpxchg_double_8 +# define raw_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) # endif +# define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + +/* + * Generic percpu operations for context that are safe from preemption/interrupts. + * Checks will be added here soon. 
+ */ +#ifndef __this_cpu_read +# define __this_cpu_read(pcp) __pcpu_size_call_return(raw_cpu_read_, (pcp)) +#endif + +#ifndef __this_cpu_write +# define __this_cpu_write(pcp, val) __pcpu_size_call(raw_cpu_write_, (pcp), (val)) +#endif + +#ifndef __this_cpu_add +# define __this_cpu_add(pcp, val) __pcpu_size_call(raw_cpu_add_, (pcp), (val)) +#endif + +#ifndef __this_cpu_sub +# define __this_cpu_sub(pcp, val) __this_cpu_add((pcp), -(typeof(pcp))(val)) +#endif + +#ifndef __this_cpu_inc +# define __this_cpu_inc(pcp) __this_cpu_add((pcp), 1) +#endif + +#ifndef __this_cpu_dec +# define __this_cpu_dec(pcp) __this_cpu_sub((pcp), 1) +#endif + +#ifndef __this_cpu_and +# define __this_cpu_and(pcp, val) __pcpu_size_call(raw_cpu_and_, (pcp), (val)) +#endif + +#ifndef __this_cpu_or +# define __this_cpu_or(pcp, val) __pcpu_size_call(raw_cpu_or_, (pcp), (val)) +#endif + +#ifndef __this_cpu_add_return +# define __this_cpu_add_return(pcp, val) \ + __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val) +#endif + +#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(typeof(pcp))(val)) +#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1) +#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1) + +#ifndef __this_cpu_xchg +# define __this_cpu_xchg(pcp, nval) \ + __pcpu_size_call_return2(raw_cpu_xchg_, (pcp), nval) +#endif + +#ifndef __this_cpu_cmpxchg +# define __this_cpu_cmpxchg(pcp, oval, nval) \ + __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval) +#endif + +#ifndef __this_cpu_cmpxchg_double # define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __pcpu_double_call_return_bool(__this_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) + __pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) #endif #endif /* __LINUX_PERCPU_H */ -- cgit From dc322a99d31fff5d3f8acfa061ad033953efdebe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:38 -0700 Subject: mm: use raw_cpu ops for determining current NUMA node With the preempt checking logic for __this_cpu_ops we will get false positives from locations in the code that use numa_node_id. Before the __this_cpu ops where introduced there were no checks for preemption present either. smp_raw_processor_id() was used. See http://www.spinics.net/lists/linux-numa/msg00641.html Therefore we need to use raw_cpu_read here to avoid false postives. Note that this issue has been discussed in prior years. If the process changes nodes after retrieving the current numa node then that is acceptable since most uses of numa_node etc are for optimization and not for correctness. There were suggestions to implement a raw_numa_node_id in order to do preempt checks for numa_node_id as well. But I think we better defer that to another patch since that would mean investigating how numa_node_id() is used throughout the kernel which would increase the scope of this patchset significantly. After all preemption was never checked before when numa_node_id() was used. 
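As an illustration of why the race is tolerable, consider a caller that uses the node id purely as a placement hint; the helper below is hypothetical and only numa_node_id() corresponds to an interface touched by this patch. If the task migrates right after the read, the allocation merely lands on a slightly less optimal node:

	#include <linux/gfp.h>
	#include <linux/topology.h>

	/* Hypothetical caller: a stale node id only costs locality, not correctness. */
	static struct page *alloc_hinted_page(gfp_t gfp)
	{
		int nid = numa_node_id();	/* now raw_cpu_read(numa_node), no preempt check */

		return alloc_pages_node(nid, gfp, 0);
	}
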
Some sample traces: __this_cpu_read operation in preemptible [00000000] code: login/1456 caller is __this_cpu_preempt_check+0x2b/0x2d CPU: 0 PID: 1456 Comm: login Not tainted 3.12.0-rc4-cl-00062-g2fe80d3-dirty #185 Call Trace: dump_stack+0x4e/0x82 check_preemption_disabled+0xc5/0xe0 __this_cpu_preempt_check+0x2b/0x2d get_task_policy+0x1d/0x49 get_vma_policy+0x14/0x76 alloc_pages_vma+0x35/0xff handle_mm_fault+0x290/0x73b __do_page_fault+0x3fe/0x44d do_page_fault+0x9/0xc page_fault+0x22/0x30 generic_file_aio_read+0x38e/0x624 do_sync_read+0x54/0x73 vfs_read+0x9d/0x12a SyS_read+0x47/0x7e cstar_dispatch+0x7/0x23 caller is __this_cpu_preempt_check+0x2b/0x2d CPU: 0 PID: 1456 Comm: login Not tainted 3.12.0-rc4-cl-00062-g2fe80d3-dirty #185 Call Trace: dump_stack+0x4e/0x82 check_preemption_disabled+0xc5/0xe0 __this_cpu_preempt_check+0x2b/0x2d alloc_pages_current+0x8f/0xbc __page_cache_alloc+0xb/0xd __do_page_cache_readahead+0xf4/0x219 ra_submit+0x1c/0x20 ondemand_readahead+0x28c/0x2b4 page_cache_sync_readahead+0x38/0x3a generic_file_aio_read+0x261/0x624 do_sync_read+0x54/0x73 vfs_read+0x9d/0x12a SyS_read+0x47/0x7e cstar_dispatch+0x7/0x23 Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Cc: Alex Shi Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/topology.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index 12ae6ce997d6..7062330a1329 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -188,7 +188,7 @@ DECLARE_PER_CPU(int, numa_node); /* Returns the number of the current Node. */ static inline int numa_node_id(void) { - return __this_cpu_read(numa_node); + return raw_cpu_read(numa_node); } #endif @@ -245,7 +245,7 @@ static inline void set_numa_mem(int node) /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { - return __this_cpu_read(_numa_mem_); + return raw_cpu_read(_numa_mem_); } #endif -- cgit From 08f141d3dbddacb70aba1541bc5f950e466591e9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:39 -0700 Subject: modules: use raw_cpu_write for initialization of per cpu refcount. The initialization of a structure is not subject to synchronization. The use of __this_cpu would trigger a false positive with the additional preemption checks for __this_cpu ops. So simply disable the check through the use of raw_cpu ops. Trace: __this_cpu_write operation in preemptible [00000000] code: modprobe/286 caller is __this_cpu_preempt_check+0x38/0x60 CPU: 3 PID: 286 Comm: modprobe Tainted: GF 3.12.0-rc4+ #187 Call Trace: dump_stack+0x4e/0x82 check_preemption_disabled+0xec/0x110 __this_cpu_preempt_check+0x38/0x60 load_module+0xcfd/0x2650 SyS_init_module+0xa6/0xd0 tracesys+0xe1/0xe6 Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Acked-by: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/module.c b/kernel/module.c index 29f7790eaa14..11869408f79b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. 
*/ - __this_cpu_write(mod->refptr->incs, 1); + raw_cpu_write(mod->refptr->incs, 1); return 0; } -- cgit From 3ed66e910c91eb914b5c1f2d434538fe68bb8a56 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:40 -0700 Subject: net: replace __this_cpu_inc in route.c with raw_cpu_inc The RT_CACHE_STAT_INC macro triggers the new preemption checks for __this_cpu ops. I do not see any other synchronization that would allow the use of a __this_cpu operation here however in commit dbd2915ce87e ("[IPV4]: RT_CACHE_STAT_INC() warning fix") Andrew justifies the use of raw_smp_processor_id() here because "we do not care" about races. In the past we agreed that the price of disabling interrupts here to get consistent counters would be too high. These counters may be inaccurate due to race conditions. The use of __this_cpu op improves the situation already from what commit dbd2915ce87e did since the single instruction emitted on x86 does not allow the race to occur anymore. However, non x86 platforms could still experience a race here. Trace: __this_cpu_add operation in preemptible [00000000] code: avahi-daemon/1193 caller is __this_cpu_preempt_check+0x38/0x60 CPU: 1 PID: 1193 Comm: avahi-daemon Tainted: GF 3.12.0-rc4+ #187 Call Trace: check_preemption_disabled+0xec/0x110 __this_cpu_preempt_check+0x38/0x60 __ip_route_output_key+0x575/0x8c0 ip_route_output_flow+0x27/0x70 udp_sendmsg+0x825/0xa20 inet_sendmsg+0x85/0xc0 sock_sendmsg+0x9c/0xd0 ___sys_sendmsg+0x37c/0x390 __sys_sendmsg+0x49/0x90 SyS_sendmsg+0x12/0x20 tracesys+0xe1/0xe6 Signed-off-by: Christoph Lameter Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: Eric Dumazet Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1be9e990514d..34d094cadb11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -188,7 +188,7 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) -- cgit From 88da03a67674bcd6e9ecf18a0a182cf1303056ba Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:42 -0700 Subject: slub: use raw_cpu_inc for incrementing statistics Statistics are not critical to the operation of the allocation but should also not cause too much overhead. When __this_cpu_inc is altered to check if preemption is disabled this triggers. Use raw_cpu_inc to avoid the checks. Using this_cpu_ops may cause interrupt disable/enable sequences on various arches which may significantly impact allocator performance. 
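To make the cost difference concrete, here is a rough sketch of the three increment flavours as they behave on a generic (non-x86) architecture; the counter name is made up for illustration:

	DEFINE_PER_CPU(unsigned long, demo_stat);	/* hypothetical counter */

	this_cpu_inc(demo_stat);	/* irq-safe: generic fallback wraps the RMW in irq disable/enable */
	__this_cpu_inc(demo_stat);	/* caller must hold off preemption; will now warn otherwise */
	raw_cpu_inc(demo_stat);		/* no checks, no irq fiddling: a rare lost increment is accepted */
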
[akpm@linux-foundation.org: add comment] Signed-off-by: Christoph Lameter Cc: Fengguang Wu Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index e7451861f95d..f620bbf4054a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. + */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } -- cgit From 293b6a4c875c3b49853bff7de99954f49f59aa75 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:43 -0700 Subject: vmstat: use raw_cpu_ops to avoid false positives on preemption checks vm counters are allowed to be racy. Use raw_cpu_ops to avoid the local_irq_disable overhead and to avoid preemption checks which will be added to the __this_cpu operations. [akpm@linux-foundation.org: Add comment. Again.] Signed-off-by: Christoph Lameter Reported-by: Sergey Senozhatsky Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ea4476157e00..45c9cd1daf7a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -27,9 +27,13 @@ struct vm_event_state { DECLARE_PER_CPU(struct vm_event_state, vm_event_states); +/* + * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the + * local_irq_disable overhead. + */ static inline void __count_vm_event(enum vm_event_item item) { - __this_cpu_inc(vm_event_states.event[item]); + raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) @@ -39,7 +43,7 @@ static inline void count_vm_event(enum vm_event_item item) static inline void __count_vm_events(enum vm_event_item item, long delta) { - __this_cpu_add(vm_event_states.event[item], delta); + raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) -- cgit From 188a81409ff7de1c5aae947a96356ddd8ff4aaa3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:44 -0700 Subject: percpu: add preemption checks to __this_cpu ops We define a check function in order to avoid trouble with the include files. Then the higher level __this_cpu macros are modified to invoke the preemption check. 
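As a usage-level sketch (the per-cpu variable here is hypothetical), the new check fires only when a __this_cpu operation runs in preemptible context with CONFIG_DEBUG_PREEMPT enabled:

	DEFINE_PER_CPU(int, demo_count);	/* hypothetical */

	preempt_disable();
	__this_cpu_add(demo_count, 1);	/* preemption is off: silent, as before */
	preempt_enable();

	__this_cpu_add(demo_count, 1);	/* preemptible: prints "BUG: using __this_cpu_add() in preemptible" */
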
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Cc: Tejun Heo Tested-by: Grygorii Strashko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 39 +++++++++++++++++++++++++++++---------- lib/smp_processor_id.c | 18 ++++++++++++++---- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4e4d2afcc0c7..e7a0b95ed527 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -173,6 +173,12 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); extern void __bad_size_call_parameter(void); +#ifdef CONFIG_DEBUG_PREEMPT +extern void __this_cpu_preempt_check(const char *op); +#else +static inline void __this_cpu_preempt_check(const char *op) { } +#endif + #define __pcpu_size_call_return(stem, variable) \ ({ typeof(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ @@ -725,18 +731,24 @@ do { \ /* * Generic percpu operations for context that are safe from preemption/interrupts. - * Checks will be added here soon. */ #ifndef __this_cpu_read -# define __this_cpu_read(pcp) __pcpu_size_call_return(raw_cpu_read_, (pcp)) +# define __this_cpu_read(pcp) \ + (__this_cpu_preempt_check("read"),__pcpu_size_call_return(raw_cpu_read_, (pcp))) #endif #ifndef __this_cpu_write -# define __this_cpu_write(pcp, val) __pcpu_size_call(raw_cpu_write_, (pcp), (val)) +# define __this_cpu_write(pcp, val) \ +do { __this_cpu_preempt_check("write"); \ + __pcpu_size_call(raw_cpu_write_, (pcp), (val)); \ +} while (0) #endif #ifndef __this_cpu_add -# define __this_cpu_add(pcp, val) __pcpu_size_call(raw_cpu_add_, (pcp), (val)) +# define __this_cpu_add(pcp, val) \ +do { __this_cpu_preempt_check("add"); \ + __pcpu_size_call(raw_cpu_add_, (pcp), (val)); \ +} while (0) #endif #ifndef __this_cpu_sub @@ -752,16 +764,23 @@ do { \ #endif #ifndef __this_cpu_and -# define __this_cpu_and(pcp, val) __pcpu_size_call(raw_cpu_and_, (pcp), (val)) +# define __this_cpu_and(pcp, val) \ +do { __this_cpu_preempt_check("and"); \ + __pcpu_size_call(raw_cpu_and_, (pcp), (val)); \ +} while (0) + #endif #ifndef __this_cpu_or -# define __this_cpu_or(pcp, val) __pcpu_size_call(raw_cpu_or_, (pcp), (val)) +# define __this_cpu_or(pcp, val) \ +do { __this_cpu_preempt_check("or"); \ + __pcpu_size_call(raw_cpu_or_, (pcp), (val)); \ +} while (0) #endif #ifndef __this_cpu_add_return # define __this_cpu_add_return(pcp, val) \ - __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val) + (__this_cpu_preempt_check("add_return"),__pcpu_size_call_return2(raw_cpu_add_return_, pcp, val)) #endif #define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(typeof(pcp))(val)) @@ -770,17 +789,17 @@ do { \ #ifndef __this_cpu_xchg # define __this_cpu_xchg(pcp, nval) \ - __pcpu_size_call_return2(raw_cpu_xchg_, (pcp), nval) + (__this_cpu_preempt_check("xchg"),__pcpu_size_call_return2(raw_cpu_xchg_, (pcp), nval)) #endif #ifndef __this_cpu_cmpxchg # define __this_cpu_cmpxchg(pcp, oval, nval) \ - __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval) + (__this_cpu_preempt_check("cmpxchg"),__pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)) #endif #ifndef __this_cpu_cmpxchg_double # define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) + (__this_cpu_preempt_check("cmpxchg_double"),__pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), 
(nval1), (nval2))) #endif #endif /* __LINUX_PERCPU_H */ diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 04abe53f12a1..1afec32de6f2 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -7,7 +7,8 @@ #include #include -notrace unsigned int debug_smp_processor_id(void) +notrace static unsigned int check_preemption_disabled(const char *what1, + const char *what2) { int this_cpu = raw_smp_processor_id(); @@ -38,9 +39,9 @@ notrace unsigned int debug_smp_processor_id(void) if (!printk_ratelimit()) goto out_enable; - printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] " - "code: %s/%d\n", - preempt_count() - 1, current->comm, current->pid); + printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n", + what1, what2, preempt_count() - 1, current->comm, current->pid); + print_symbol("caller is %s\n", (long)__builtin_return_address(0)); dump_stack(); @@ -50,5 +51,14 @@ out: return this_cpu; } +notrace unsigned int debug_smp_processor_id(void) +{ + return check_preemption_disabled("smp_processor_id", ""); +} EXPORT_SYMBOL(debug_smp_processor_id); +notrace void __this_cpu_preempt_check(const char *op) +{ + check_preemption_disabled("__this_cpu_", op); +} +EXPORT_SYMBOL(__this_cpu_preempt_check); -- cgit From 64b47e8fdb40a0d46e8cf458dd3e24f8afa073f6 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:45 -0700 Subject: lglock: map to spinlock when !CONFIG_SMP When the system has only one CPU, lglock is effectively a spinlock; map it directly to spinlock to eliminate the indirection and duplicate code. In addition to removing overhead, this drops 1.6k of code with a defconfig modified to have !CONFIG_SMP, and 1.1k with a minimal config. Signed-off-by: Josh Triplett Cc: Rusty Russell Cc: Michal Marek Cc: Thomas Gleixner Cc: David Howells Cc: "H. 
Peter Anvin" Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lglock.h | 16 ++++++++++++++++ kernel/locking/Makefile | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 96549abe8842..0081f000e34b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -25,6 +25,8 @@ #include #include +#ifdef CONFIG_SMP + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define LOCKDEP_INIT_MAP lockdep_init_map #else @@ -57,4 +59,18 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu); void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); +#else +/* When !CONFIG_SMP, map lglock to spinlock */ +#define lglock spinlock +#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name) +#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name) +#define lg_lock_init(lg, name) spin_lock_init(lg) +#define lg_local_lock spin_lock +#define lg_local_unlock spin_unlock +#define lg_local_lock_cpu(lg, cpu) spin_lock(lg) +#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg) +#define lg_global_lock spin_lock +#define lg_global_unlock spin_unlock +#endif + #endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 306a76b51e0f..b8bdcd4785b7 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -- cgit From 6b550f6f2004017f1b4633d2c9e39b610bfe84f0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 7 Apr 2014 15:39:46 -0700 Subject: x86/mm: sparse warning fix for early_memremap This patch series takes the common bits from the x86 early ioremap implementation and creates a generic implementation which may be used by other architectures. The early ioremap interfaces are intended for situations where boot code needs to make temporary virtual mappings before the normal ioremap interfaces are available. Typically, this means before paging_init() has run. This patch (of 6): There's a lot of sparse warnings for code like below: void *a = early_memremap(phys_addr, size); early_memremap intend to map kernel memory with ioremap facility, the return pointer should be a kernel ram pointer instead of iomem one. For making the function clearer and supressing sparse warnings this patch do below two things: 1. cast to (__force void *) for the return value of early_memremap 2. add early_memunmap function and pass (__force void __iomem *) to iounmap From Boris: "Ingo told me yesterday, it makes sense too. I'd guess we can try it. FWIW, all callers of early_memremap use the memory they get remapped as normal memory so we should be safe" Signed-off-by: Dave Young Signed-off-by: Mark Salter Acked-by: H. 
Peter Anvin Cc: Borislav Petkov Cc: Catalin Marinas Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/io.h | 3 ++- arch/x86/mm/ioremap.c | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 91d9c69a629e..7cec9ef3a73a 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -325,9 +325,10 @@ extern void early_ioremap_init(void); extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(resource_size_t phys_addr, unsigned long size); -extern void __iomem *early_memremap(resource_size_t phys_addr, +extern void *early_memremap(resource_size_t phys_addr, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void early_memunmap(void *addr, unsigned long size); extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 799580cabc78..bbb450412810 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -562,10 +562,9 @@ early_ioremap(resource_size_t phys_addr, unsigned long size) } /* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) +void __init *early_memremap(resource_size_t phys_addr, unsigned long size) { - return __early_ioremap(phys_addr, size, PAGE_KERNEL); + return (__force void *)__early_ioremap(phys_addr, size, PAGE_KERNEL); } void __init early_iounmap(void __iomem *addr, unsigned long size) @@ -620,3 +619,8 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } prev_map[slot] = NULL; } + +void __init early_memunmap(void *addr, unsigned long size) +{ + early_iounmap((__force void __iomem *)addr, size); +} -- cgit From 9e5c33d7aeeef62e5fa7e74f94432685bd03026b Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:48 -0700 Subject: mm: create generic early_ioremap() support This patch creates a generic implementation of early_ioremap() support based on the existing x86 implementation. early_ioremp() is useful for early boot code which needs to temporarily map I/O or memory regions before normal mapping functions such as ioremap() are available. Some architectures have optional MMU. In the no-MMU case, the remap functions simply return the passed in physical address and the unmap functions do nothing. Signed-off-by: Mark Salter Acked-by: Catalin Marinas Acked-by: H. Peter Anvin Cc: Borislav Petkov Cc: Dave Young Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/early_ioremap.h | 42 +++++++ mm/Kconfig | 3 + mm/Makefile | 1 + mm/early_ioremap.c | 245 ++++++++++++++++++++++++++++++++++++ 4 files changed, 291 insertions(+) create mode 100644 include/asm-generic/early_ioremap.h create mode 100644 mm/early_ioremap.c diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h new file mode 100644 index 000000000000..a5de55c04fb2 --- /dev/null +++ b/include/asm-generic/early_ioremap.h @@ -0,0 +1,42 @@ +#ifndef _ASM_EARLY_IOREMAP_H_ +#define _ASM_EARLY_IOREMAP_H_ + +#include + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. 
+ */ +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void *early_memremap(resource_size_t phys_addr, + unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void early_memunmap(void *addr, unsigned long size); + +/* + * Weak function called by early_ioremap_reset(). It does nothing, but + * architectures may provide their own version to do any needed cleanups. + */ +extern void early_ioremap_shutdown(void); + +#if defined(CONFIG_GENERIC_EARLY_IOREMAP) && defined(CONFIG_MMU) +/* Arch-specific initialization */ +extern void early_ioremap_init(void); + +/* Generic initialization called by architecture code */ +extern void early_ioremap_setup(void); + +/* + * Called as last step in paging_init() so library can act + * accordingly for subsequent map/unmap requests. + */ +extern void early_ioremap_reset(void); + +#else +static inline void early_ioremap_init(void) { } +static inline void early_ioremap_setup(void) { } +static inline void early_ioremap_reset(void) { } +#endif + +#endif /* _ASM_EARLY_IOREMAP_H_ */ diff --git a/mm/Kconfig b/mm/Kconfig index 37fbe1ef5239..ebe5880c29d6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -578,3 +578,6 @@ config PGTABLE_MAPPING You can check speed with zsmalloc benchmark: https://github.com/spartacus06/zsmapbench + +config GENERIC_EARLY_IOREMAP + bool diff --git a/mm/Makefile b/mm/Makefile index 23a6f7e23019..9e5aaf92197d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c new file mode 100644 index 000000000000..e10ccd299d66 --- /dev/null +++ b/mm/early_ioremap.c @@ -0,0 +1,245 @@ +/* + * Provide common bits of early_ioremap() support for architectures needing + * temporary mappings during boot before ioremap() is available. + * + * This is mostly a direct copy of the x86 early_ioremap implementation. + * + * (C) Copyright 1995 1996, 2014 Linus Torvalds + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_MMU +static int early_ioremap_debug __initdata; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static int after_paging_init __initdata; + +void __init __weak early_ioremap_shutdown(void) +{ +} + +void __init early_ioremap_reset(void) +{ + early_ioremap_shutdown(); + after_paging_init = 1; +} + +/* + * Generally, ioremap() is available after paging_init() has been called. + * Architectures wanting to allow early_ioremap after paging_init() can + * define __late_set_fixmap and __late_clear_fixmap to do the right thing. 
+ */ +#ifndef __late_set_fixmap +static inline void __init __late_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + BUG(); +} +#endif + +#ifndef __late_clear_fixmap +static inline void __init __late_clear_fixmap(enum fixed_addresses idx) +{ + BUG(); +} +#endif + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_setup(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (WARN_ON(prev_map[i])) + break; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +} + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (WARN(count, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n" + "please boot with early_ioremap_debug and report the dmesg.\n", + count)) + return 1; + return 0; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n", + __func__, (u64)phys_addr, size)) + return NULL; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (WARN_ON(!size || last_addr < phys_addr)) + return NULL; + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (WARN_ON(nrpages > NR_FIX_BTMAPS)) + return NULL; + + /* + * Ok, go for it.. 
+ */ + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_set_fixmap(idx, phys_addr, prot); + else + __early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", + __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", + addr, size)) + return; + + if (WARN(prev_size[slot] != size, + "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot])) + return; + + WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", + addr, size, slot); + + virt_addr = (unsigned long)addr; + if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) + return; + + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO); +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, + FIXMAP_PAGE_NORMAL); +} +#else /* CONFIG_MMU */ + +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void __iomem *)phys_addr; +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ +} + +#endif /* CONFIG_MMU */ + + +void __init early_memunmap(void *addr, unsigned long size) +{ + early_iounmap((__force void __iomem *)addr, size); +} -- cgit From 5b7c73e00968c7fdf908c3ced31e1cc83c01ba14 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:49 -0700 Subject: x86: use generic early_ioremap Move x86 over to the generic early ioremap implementation. Signed-off-by: Mark Salter Acked-by: H. 
Peter Anvin Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Young Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/fixmap.h | 6 ++ arch/x86/include/asm/io.h | 15 +-- arch/x86/mm/ioremap.c | 228 +----------------------------------------- arch/x86/mm/pgtable_32.c | 2 +- 6 files changed, 13 insertions(+), 240 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f73071742975..5b8ec0f53b57 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -43,6 +43,7 @@ config X86 select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES + select GENERIC_EARLY_IOREMAP select HAVE_OPTPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4acddc43ee0c..3ca9762e1649 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,5 +5,6 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h +generic-y += early_ioremap.h generic-y += cputime.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8dcd35c4c787..43f482a0db37 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -163,5 +163,11 @@ static inline void __set_fixmap(enum fixed_addresses idx, #include +#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) +#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0)) + +void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7cec9ef3a73a..b8237d8a1e0c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -39,6 +39,7 @@ #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -316,20 +317,6 @@ extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); -/* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. 
- */ -extern void early_ioremap_init(void); -extern void early_ioremap_reset(void); -extern void __iomem *early_ioremap(resource_size_t phys_addr, - unsigned long size); -extern void *early_memremap(resource_size_t phys_addr, - unsigned long size); -extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void early_memunmap(void *addr, unsigned long size); -extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); #ifdef CONFIG_XEN diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bbb450412810..597ac155c91c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -328,17 +328,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ - early_ioremap_debug = 1; - - return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init; static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -362,18 +351,11 @@ bool __init is_early_ioremap_ptep(pte_t *ptep) return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; - void __init early_ioremap_init(void) { pmd_t *pmd; - int i; - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_init()\n"); - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); @@ -402,13 +384,8 @@ void __init early_ioremap_init(void) } } -void __init early_ioremap_reset(void) -{ - after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ -425,202 +402,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t prot) -{ - if (after_paging_init) - __set_fixmap(idx, phys, prot); - else - __early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ - if (after_paging_init) - clear_fixmap(idx); - else - __early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i]) { - WARN_ON(1); - break; - } - } - - early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ - int count = 0; - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - if (prev_map[i]) - count++; - - if (!count) - return 0; - WARN(1, KERN_WARNING - "Debug warning: early ioremap leak of %d areas detected.\n", - count); - printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - - return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ - unsigned long offset; - resource_size_t last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - int 
i, slot; - - WARN_ON(system_state != SYSTEM_BOOTING); - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (!prev_map[i]) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n", - __func__, (u64)phys_addr, size); - WARN_ON(1); - return NULL; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ", - __func__, (u64)phys_addr, size, slot); - dump_stack(); - } - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) { - WARN_ON(1); - return NULL; - } - - prev_size[slot] = size; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) { - WARN_ON(1); - return NULL; - } - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_set_fixmap(idx, phys_addr, prot); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - - prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); - return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init *early_memremap(resource_size_t phys_addr, unsigned long size) -{ - return (__force void *)__early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i] == addr) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", - addr, size); - WARN_ON(1); - return; - } - - if (prev_size[slot] != size) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot]); - WARN_ON(1); - return; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, slot); - dump_stack(); - } - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { - WARN_ON(1); - return; - } - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_clear_fixmap(idx); - --idx; - --nrpages; - } - prev_map[slot] = NULL; -} - -void __init early_memunmap(void *addr, unsigned long size) -{ - early_iounmap((__force void __iomem *)addr, size); -} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index a69bcb8c7621..4dd8cf652579 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg) address = memparse(arg, &arg); reserve_top_address(address); - fixup_early_ioremap(); + early_ioremap_init(); return 0; } early_param("reservetop", parse_reservetop); -- cgit From 0bf757c73d6612d3d279de3f61b35062aa9c8b1d Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:51 -0700 Subject: arm64: initialize pgprot info earlier in boot Presently, paging_init() calls init_mem_pgprot() to initialize 
pgprot values used by macros such as PAGE_KERNEL, PAGE_KERNEL_EXEC, etc. The new fixmap and early_ioremap support also needs to use these macros before paging_init() is called. This patch moves the init_mem_pgprot() call out of paging_init() and into setup_arch() so that pgprot_default gets initialized in time for fixmap and early_ioremap. Signed-off-by: Mark Salter Acked-by: Catalin Marinas Cc: Will Deacon Cc: Borislav Petkov Cc: Dave Young Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/kernel/setup.c | 2 ++ arch/arm64/mm/mmu.c | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 2494fc01896a..f600d400c07d 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -27,5 +27,6 @@ typedef struct { extern void paging_init(void); extern void setup_mm_for_reboot(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); +extern void init_mem_pgprot(void); #endif diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 67da30741a1b..20830d1afbb6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -360,6 +360,8 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; + init_mem_pgprot(); + parse_early_param(); arm64_memblock_init(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f8dc7e8fce6f..ba259a0e385e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -125,7 +125,7 @@ early_param("cachepolicy", early_cachepolicy); /* * Adjust the PMD section entries according to the CPU in use. */ -static void __init init_mem_pgprot(void) +void __init init_mem_pgprot(void) { pteval_t default_pgprot; int i; @@ -357,7 +357,6 @@ void __init paging_init(void) { void *zero_page; - init_mem_pgprot(); map_mem(); /* -- cgit From bf4b558eba920a38f91beb5ee62a8ce2628c92f7 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:52 -0700 Subject: arm64: add early_ioremap support Add support for early IO or memory mappings which are needed before the normal ioremap() is usable. This also adds fixmap support for permanent fixed mappings such as that used by the earlyprintk device register region. Signed-off-by: Mark Salter Acked-by: Catalin Marinas Cc: Borislav Petkov Cc: Dave Young Cc: H. 
Peter Anvin Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/arm64/memory.txt | 4 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/Kbuild | 1 + arch/arm64/include/asm/fixmap.h | 67 +++++++++++++++++++++++++++++++ arch/arm64/include/asm/io.h | 1 + arch/arm64/include/asm/memory.h | 2 +- arch/arm64/kernel/early_printk.c | 8 +++- arch/arm64/kernel/head.S | 9 ++--- arch/arm64/kernel/setup.c | 2 + arch/arm64/mm/ioremap.c | 85 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/mmu.c | 41 ------------------- 11 files changed, 169 insertions(+), 52 deletions(-) create mode 100644 arch/arm64/include/asm/fixmap.h diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt index 85e24c4f215c..d50fa618371b 100644 --- a/Documentation/arm64/memory.txt +++ b/Documentation/arm64/memory.txt @@ -39,7 +39,7 @@ ffffffbffa000000 ffffffbffaffffff 16MB PCI I/O space ffffffbffb000000 ffffffbffbbfffff 12MB [guard] -ffffffbffbc00000 ffffffbffbdfffff 2MB earlyprintk device +ffffffbffbc00000 ffffffbffbdfffff 2MB fixed mappings ffffffbffbe00000 ffffffbffbffffff 2MB [guard] @@ -66,7 +66,7 @@ fffffdfffa000000 fffffdfffaffffff 16MB PCI I/O space fffffdfffb000000 fffffdfffbbfffff 12MB [guard] -fffffdfffbc00000 fffffdfffbdfffff 2MB earlyprintk device +fffffdfffbc00000 fffffdfffbdfffff 2MB fixed mappings fffffdfffbe00000 fffffdfffbffffff 2MB [guard] diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8079a23e2701..e6e4d3749a6e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CPU_AUTOPROBE + select GENERIC_EARLY_IOREMAP select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 4bca4923fc0b..83f71b3004a8 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -10,6 +10,7 @@ generic-y += delay.h generic-y += div64.h generic-y += dma.h generic-y += emergency-restart.h +generic-y += early_ioremap.h generic-y += errno.h generic-y += ftrace.h generic-y += hash.h diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h new file mode 100644 index 000000000000..5f7bfe6df723 --- /dev/null +++ b/arch/arm64/include/asm/fixmap.h @@ -0,0 +1,67 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * Copyright (C) 2013 Mark Salter + * + * Adapted from arch/x86_64 version. + * + */ + +#ifndef _ASM_ARM64_FIXMAP_H +#define _ASM_ARM64_FIXMAP_H + +#ifndef __ASSEMBLY__ +#include +#include + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * + * These 'compile-time allocated' memory buffers are + * page-sized. Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + */ +enum fixed_addresses { + FIX_EARLYCON_MEM_BASE, + __end_of_permanent_fixed_addresses, + + /* + * Temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. 
+ */ +#ifdef CONFIG_ARM64_64K_PAGES +#define NR_FIX_BTMAPS 4 +#else +#define NR_FIX_BTMAPS 64 +#endif +#define FIX_BTMAPS_SLOTS 7 +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + __end_of_fixed_addresses +}; + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +#define FIXMAP_PAGE_IO __pgprot(PROT_DEVICE_nGnRE) + +extern void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + +#define __set_fixmap __early_set_fixmap + +#include + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_ARM64_FIXMAP_H */ diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 7846a6bb0833..a1bef78f0303 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -27,6 +27,7 @@ #include #include #include +#include #include diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 9dc5dc39fded..e94f9458aa6f 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -49,7 +49,7 @@ #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) #define MODULES_END (PAGE_OFFSET) #define MODULES_VADDR (MODULES_END - SZ_64M) -#define EARLYCON_IOBASE (MODULES_VADDR - SZ_4M) +#define FIXADDR_TOP (MODULES_VADDR - SZ_2M - PAGE_SIZE) #define TASK_SIZE_64 (UL(1) << VA_BITS) #ifdef CONFIG_COMPAT diff --git a/arch/arm64/kernel/early_printk.c b/arch/arm64/kernel/early_printk.c index fbb6e1843659..ffbbdde7aba1 100644 --- a/arch/arm64/kernel/early_printk.c +++ b/arch/arm64/kernel/early_printk.c @@ -26,6 +26,8 @@ #include #include +#include + static void __iomem *early_base; static void (*printch)(char ch); @@ -141,8 +143,10 @@ static int __init setup_early_printk(char *buf) } /* no options parsing yet */ - if (paddr) - early_base = early_io_map(paddr, EARLYCON_IOBASE); + if (paddr) { + set_fixmap_io(FIX_EARLYCON_MEM_BASE, paddr); + early_base = (void __iomem *)fix_to_virt(FIX_EARLYCON_MEM_BASE); + } printch = match->printch; early_console = &early_console_dev; diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 61035d6814cb..1fe5d8d2bdfd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -404,7 +404,7 @@ ENDPROC(__calc_phys_offset) * - identity mapping to enable the MMU (low address, TTBR0) * - first few MB of the kernel linear mapping to jump to once the MMU has * been enabled, including the FDT blob (TTBR1) - * - UART mapping if CONFIG_EARLY_PRINTK is enabled (TTBR1) + * - pgd entry for fixed mappings (TTBR1) */ __create_page_tables: pgtbl x25, x26, x24 // idmap_pg_dir and swapper_pg_dir addresses @@ -461,15 +461,12 @@ __create_page_tables: sub x6, x6, #1 // inclusive range create_block_map x0, x7, x3, x5, x6 1: -#ifdef CONFIG_EARLY_PRINTK /* - * Create the pgd entry for the UART mapping. The full mapping is done - * later based earlyprintk kernel parameter. + * Create the pgd entry for the fixed mappings. 
*/ - ldr x5, =EARLYCON_IOBASE // UART virtual address + ldr x5, =FIXADDR_TOP // Fixed mapping virtual address add x0, x26, #2 * PAGE_SIZE // section table address create_pgd_entry x26, x0, x5, x6, x7 -#endif ret ENDPROC(__create_page_tables) .ltorg diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 20830d1afbb6..720853f70b6b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -361,6 +362,7 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; init_mem_pgprot(); + early_ioremap_init(); parse_early_param(); diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 2bb1d586664c..7ec328392ae0 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -25,6 +25,10 @@ #include #include +#include +#include +#include + static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size, pgprot_t prot, void *caller) { @@ -98,3 +102,84 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); + +#ifndef CONFIG_ARM64_64K_PAGES +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; +#endif + +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(addr); + BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); + + pud = pud_offset(pgd, addr); + BUG_ON(pud_none(*pud) || pud_bad(*pud)); + + return pmd_offset(pud, addr); +} + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + pmd_t *pmd = early_ioremap_pmd(addr); + + BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd)); + + return pte_offset_kernel(pmd, addr); +} + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); +#ifndef CONFIG_ARM64_64K_PAGES + /* need to populate pmd for 4k pagesize only */ + pmd_populate_kernel(&init_mm, pmd, bm_pte); +#endif + /* + * The boot-ioremap range spans multiple pmds, for which + * we are not prepared: + */ + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); + + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + pr_warn("pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + pr_warn("FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } + + early_ioremap_setup(); +} + +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + + pte = early_ioremap_pte(addr); + + if (pgprot_val(flags)) + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); + else { + pte_clear(&init_mm, addr, pte); + flush_tlb_kernel_range(addr, addr+PAGE_SIZE); + } +} diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ba259a0e385e..6b7e89569a3a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -260,47 +260,6 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, } while (pgd++, addr = next, addr != end); } -#ifdef CONFIG_EARLY_PRINTK -/* - * Create an early I/O mapping using the pgd/pmd entries already populated - * in head.S as this function is called too early to allocated any memory. 
The - * mapping size is 2MB with 4KB pages or 64KB or 64KB pages. - */ -void __iomem * __init early_io_map(phys_addr_t phys, unsigned long virt) -{ - unsigned long size, mask; - bool page64k = IS_ENABLED(CONFIG_ARM64_64K_PAGES); - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* - * No early pte entries with !ARM64_64K_PAGES configuration, so using - * sections (pmd). - */ - size = page64k ? PAGE_SIZE : SECTION_SIZE; - mask = ~(size - 1); - - pgd = pgd_offset_k(virt); - pud = pud_offset(pgd, virt); - if (pud_none(*pud)) - return NULL; - pmd = pmd_offset(pud, virt); - - if (page64k) { - if (pmd_none(*pmd)) - return NULL; - pte = pte_offset_kernel(pmd, virt); - set_pte(pte, __pte((phys & mask) | PROT_DEVICE_nGnRE)); - } else { - set_pmd(pmd, __pmd((phys & mask) | PROT_SECT_DEVICE_nGnRE)); - } - - return (void __iomem *)((virt & mask) + (phys & ~mask)); -} -#endif - static void __init map_mem(void) { struct memblock_region *reg; -- cgit From 56aeeba8c1a21211aebe724a623acafe760912a2 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:53 -0700 Subject: doc/kernel-parameters.txt: add early_ioremap_debug Add description of early_ioremap_debug kernel parameter. Signed-off-by: Mark Salter Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Young Cc: H. Peter Anvin Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bc3478581f67..b6c67d592be5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -884,6 +884,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Enable debug messages at boot time. See Documentation/dynamic-debug-howto.txt for details. + early_ioremap_debug [KNL] + Enable debug messages in early_ioremap support. This + is useful for tracking down temporary early mappings + which are not unmapped. + earlycon= [KNL] Output early console device and options. uart[8250],io,[,options] uart[8250],mmio,[,options] -- cgit From 76ee4735781713eed1b9754837715a0b3b231dcd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:55 -0700 Subject: fs/ufs/super.c: add __init to init_inodecache() init_inodecache is only called by __init init_ufs_fs. Signed-off-by: Fabian Frederick Cc: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b8c6791f046f..3aad63394abc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1454,7 +1454,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), -- cgit From 6e0bd34c33b193849b5ada01dd0b02d3bef85bf3 Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Mon, 7 Apr 2014 15:39:56 -0700 Subject: fs/ufs: remove unused ufs_super_block_first pointer Remove occurrences of unused pointers to struct ufs_super_block_first that were acquired via ubh_get_usb_first(). Detected by Coverity: CID 139929 - CID 139936, CID 139940.
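The three ufs pointer-removal commits in this series all drop the same dead-store pattern: a local pointer is assigned from an accessor helper but never read afterwards, so both the declaration and the assignment can go. A condensed sketch of the shape Coverity flags, assembled for illustration rather than quoted from any single ufs function:

	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
	struct ufs_super_block_first *usb1;

	usb1 = ubh_get_usb_first(uspi);	/* value stored here ... */
	/* ... but never read again; the rest of the function only uses 'uspi'. */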
Signed-off-by: Christian Engelmayer Cc: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/balloc.c | 12 ------------ fs/ufs/ialloc.c | 4 ---- fs/ufs/super.c | 2 -- 3 files changed, 18 deletions(-) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7ea492ae660..0ab1de4b39a5 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, bit, end_bit, bbase, blkmap, i; @@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned overflow, cgno, bit, end_bit, i; @@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; @@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first (uspi); count = newcount - oldcount; cgno = ufs_dtog(uspi, fragment); @@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; @@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); oldcg = cgno; /* @@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; @@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal == 0) { @@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe }; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_super_block_first *usb1; struct ufs_cylinder_group *ucg; unsigned start, length, loc; unsigned pos, want, blockmap, mask, end; @@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, (unsigned long long)goal, count); - usb1 = ubh_get_usb_first (uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal) diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index d0426d74817b..98f7211599ff 
100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; int is_directory; @@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ino = inode->i_ino; @@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct super_block * sb; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; @@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) ufsi = UFS_I(inode); sbi = UFS_SB(sb); uspi = sbi->s_uspi; - usb1 = ubh_get_usb_first(uspi); mutex_lock(&sbi->s_lock); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 3aad63394abc..0232f526a5cf 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1390,14 +1390,12 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; - struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_ufs(sb); - usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); -- cgit From 48968a112cdf992d52c949a26c0020943c2268b1 Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Mon, 7 Apr 2014 15:39:57 -0700 Subject: fs/ufs: remove unused ufs_super_block_second pointer Pointer 'usb2' to struct ufs_super_block_second acquired via ubh_get_usb_second() is never used in function ufs_statfs(). Thus remove it. Detected by Coverity: CID 139940. Signed-off-by: Christian Engelmayer Cc: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 0232f526a5cf..53122c959cc1 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1390,13 +1390,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; - struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_ufs(sb); - usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { -- cgit From fe4487d18fe88d7616628d2512ef0799e1bb697e Mon Sep 17 00:00:00 2001 From: Christian Engelmayer Date: Mon, 7 Apr 2014 15:39:57 -0700 Subject: fs/ufs: remove unused ufs_super_block_third pointer Pointer 'usb3' to struct ufs_super_block_third acquired via ubh_get_usb_third() is never used in function ufs_read_cylinder_structures(). Thus remove it. Detected by Coverity: CID 139939. 
Signed-off-by: Christian Engelmayer Cc: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 53122c959cc1..c1183f9f69dc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb) struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; - struct ufs_super_block_third *usb3; UFSD("ENTER\n"); - usb3 = ubh_get_usb_third(uspi); /* * Read cs structures from (usually) first data block * on the device. -- cgit From fdc5813fbbd484a54c88477f91a78934cda8bb32 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Mon, 7 Apr 2014 15:39:58 -0700 Subject: MAINTAINERS: update Intel C600 SAS driver maintainers Signed-off-by: Lukasz Dorau Signed-off-by: Dave Jiang Signed-off-by: Maciej Patelczyk Cc: Artur Paszkiewicz Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index bfff89690c3a..d0b8afe26123 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4542,8 +4542,7 @@ K: \b(ABS|SYN)_MT_ INTEL C600 SERIES SAS CONTROLLER DRIVER M: Intel SCU Linux support -M: Lukasz Dorau -M: Maciej Patelczyk +M: Artur Paszkiewicz M: Dave Jiang L: linux-scsi@vger.kernel.org T: git git://git.code.sf.net/p/intel-sas/isci -- cgit
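As a closing illustration of the arm64 changes earlier in this series, here is a minimal, hypothetical sketch of how early boot code can use the interfaces added by the early_ioremap/fixmap patch: a permanent fixmap slot (as the earlyprintk change does) plus a temporary boot-time mapping from the generic early_ioremap support. The helper name, the probe step and the use of SZ_4K are invented for the example; set_fixmap_io(), fix_to_virt(), early_ioremap() and early_iounmap() are the interfaces the series relies on.

	#include <linux/errno.h>
	#include <linux/init.h>
	#include <linux/io.h>
	#include <linux/sizes.h>
	#include <asm/fixmap.h>

	static void __iomem *early_uart_base;

	/* Hypothetical helper, called from early setup code before ioremap() works. */
	static int __init map_early_uart(phys_addr_t uart_phys)
	{
		void __iomem *regs;

		/* Permanent fixed mapping, e.g. for an early console. */
		set_fixmap_io(FIX_EARLYCON_MEM_BASE, uart_phys);
		early_uart_base = (void __iomem *)fix_to_virt(FIX_EARLYCON_MEM_BASE);

		/* Temporary boot-time mapping, torn down once probing is done. */
		regs = early_ioremap(uart_phys, SZ_4K);
		if (!regs)
			return -ENOMEM;
		/* ... read hardware ID registers here ... */
		early_iounmap(regs, SZ_4K);

		return 0;
	}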