Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              |  13
-rw-r--r--  mm/Makefile             |   1
-rw-r--r--  mm/backing-dev.c        |  20
-rw-r--r--  mm/compaction.c         | 106
-rw-r--r--  mm/filemap.c            |  33
-rw-r--r--  mm/huge_memory.c        |  30
-rw-r--r--  mm/hugetlb_cgroup.c     |  35
-rw-r--r--  mm/internal.h           |   1
-rw-r--r--  mm/kasan/Makefile       |   1
-rw-r--r--  mm/kasan/kasan.c        | 133
-rw-r--r--  mm/kasan/kasan.h        |  21
-rw-r--r--  mm/kasan/quarantine.c   | 291
-rw-r--r--  mm/kasan/report.c       |   1
-rw-r--r--  mm/memblock.c           |  30
-rw-r--r--  mm/memcontrol.c         |   3
-rw-r--r--  mm/memory-failure.c     |  72
-rw-r--r--  mm/memory.c             |  36
-rw-r--r--  mm/mempool.c            |   2
-rw-r--r--  mm/migrate.c            |   1
-rw-r--r--  mm/mmap.c               |  12
-rw-r--r--  mm/oom_kill.c           |  34
-rw-r--r--  mm/page-writeback.c     |   4
-rw-r--r--  mm/page_alloc.c         | 386
-rw-r--r--  mm/page_poison.c        |   8
-rw-r--r--  mm/slab.c               |  14
-rw-r--r--  mm/slab.h               |   2
-rw-r--r--  mm/slab_common.c        |   2
-rw-r--r--  mm/slub.c               |   5
-rw-r--r--  mm/swap.c               |   5
-rw-r--r--  mm/vmalloc.c            |  39
-rw-r--r--  mm/vmscan.c             | 137
-rw-r--r--  mm/vmstat.c             |  51
-rw-r--r--  mm/z3fold.c             | 792
-rw-r--r--  mm/zsmalloc.c           | 175
-rw-r--r--  mm/zswap.c              |  12
35 files changed, 1914 insertions, 594 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b0432b71137d..2664c118b5d2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -404,6 +404,7 @@ config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
+ select RADIX_TREE_MULTIORDER
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -567,7 +568,7 @@ config ZPOOL
zsmalloc.
config ZBUD
- tristate "Low density storage for compressed pages"
+ tristate "Low (Up to 2x) density storage for compressed pages"
default n
help
A special purpose allocator for storing compressed pages.
@@ -576,6 +577,16 @@ config ZBUD
deterministic reclaim properties that make it preferable to a higher
density approach when reclaim will be used.
+config Z3FOLD
+ tristate "Up to 3x density storage for compressed pages"
+ depends on ZPOOL
+ default n
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to three compressed pages per physical
+ page. It is a ZBUD derivative so the simplicity and determinism are
+ still there.
+
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
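
To make the "up to 3x density" claim in the Z3FOLD help text above concrete, here is a minimal, self-contained C sketch of the packing idea: three compressed objects share one page when their chunk-rounded sizes plus a per-page header fit. DEMO_PAGE_SIZE, DEMO_CHUNK_SIZE and DEMO_HEADER_SIZE are illustrative assumptions, not z3fold's actual layout constants.

/*
 * Illustrative sketch of z3fold-style packing: up to three compressed
 * objects ("first", "middle", "last") plus a small per-page header share
 * one physical page. All constants below are demo assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE   4096u
#define DEMO_CHUNK_SIZE  64u     /* assumed allocation granularity      */
#define DEMO_HEADER_SIZE 64u     /* assumed per-page bookkeeping header */

static unsigned round_up_chunks(unsigned bytes)
{
	return (bytes + DEMO_CHUNK_SIZE - 1) / DEMO_CHUNK_SIZE * DEMO_CHUNK_SIZE;
}

/* Would three compressed objects of these sizes share one page? */
static bool fits_in_one_page(unsigned first, unsigned middle, unsigned last)
{
	unsigned used = DEMO_HEADER_SIZE + round_up_chunks(first) +
			round_up_chunks(middle) + round_up_chunks(last);

	return used <= DEMO_PAGE_SIZE;
}

int main(void)
{
	/* Three pages that each compressed to roughly a third of a page. */
	printf("%d\n", fits_in_one_page(1300, 1280, 1344));   /* 1 (fits)    */
	/* Poorly compressible data means fewer objects per page.          */
	printf("%d\n", fits_in_one_page(2048, 2048, 2048));   /* 0 (doesn't) */
	return 0;
}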
diff --git a/mm/Makefile b/mm/Makefile
index deb467edca2d..78c6f7dedb83 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_Z3FOLD) += z3fold.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0c6317b7db38..ed173b8ae8f2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -957,9 +957,8 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absence of zone congestion, a short sleep or a cond_resched is
- * performed to yield the processor and to allow other subsystems to make
- * a forward progress.
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
@@ -979,20 +978,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
*/
if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
-
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that a particular
- * WQ is congested if the worker thread is looping without
- * ever sleeping. Therefore we have to do a short sleep
- * here rather than calling cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
+ cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
if (ret < 0)
diff --git a/mm/compaction.c b/mm/compaction.c
index eda3c2244f30..1427366ad673 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static int __compact_finished(struct zone *zone, struct compact_control *cc,
+static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
@@ -1252,7 +1252,10 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
- return COMPACT_COMPLETE;
+ if (cc->whole_zone)
+ return COMPACT_COMPLETE;
+ else
+ return COMPACT_PARTIAL_SKIPPED;
}
if (is_via_compact_memory(cc->order))
@@ -1292,8 +1295,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_NO_SUITABLE_PAGE;
}
-static int compact_finished(struct zone *zone, struct compact_control *cc,
- const int migratetype)
+static enum compact_result compact_finished(struct zone *zone,
+ struct compact_control *cc,
+ const int migratetype)
{
int ret;
@@ -1312,9 +1316,10 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
-static unsigned long __compaction_suitable(struct zone *zone, int order,
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx)
+ int classzone_idx,
+ unsigned long wmark_target)
{
int fragindex;
unsigned long watermark;
@@ -1337,7 +1342,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
* allocated and for a short time, the footprint is higher
*/
watermark += (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
+ if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ alloc_flags, wmark_target))
return COMPACT_SKIPPED;
/*
@@ -1358,13 +1364,14 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
return COMPACT_CONTINUE;
}
-unsigned long compaction_suitable(struct zone *zone, int order,
+enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
int classzone_idx)
{
- unsigned long ret;
+ enum compact_result ret;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+ ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ zone_page_state(zone, NR_FREE_PAGES));
trace_mm_compaction_suitable(zone, order, ret);
if (ret == COMPACT_NOT_SUITABLE_ZONE)
ret = COMPACT_SKIPPED;
@@ -1372,9 +1379,42 @@ unsigned long compaction_suitable(struct zone *zone, int order,
return ret;
}
-static int compact_zone(struct zone *zone, struct compact_control *cc)
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+ int alloc_flags)
{
- int ret;
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure at least one zone would pass __compaction_suitable if we continue
+ * retrying the reclaim.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ unsigned long available;
+ enum compact_result compact_result;
+
+ /*
+ * Do not consider all the reclaimable memory because we do not
+ * want to thrash just for a single high-order allocation which
+ * is not even guaranteed to appear even if __compaction_suitable
+ * is happy about the watermark check.
+ */
+ available = zone_reclaimable_pages(zone) / order;
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+ compact_result = __compaction_suitable(zone, order, alloc_flags,
+ ac_classzone_idx(ac), available);
+ if (compact_result != COMPACT_SKIPPED &&
+ compact_result != COMPACT_NOT_SUITABLE_ZONE)
+ return true;
+ }
+
+ return false;
+}
+
+static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+{
+ enum compact_result ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
@@ -1382,15 +1422,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
- switch (ret) {
- case COMPACT_PARTIAL:
- case COMPACT_SKIPPED:
- /* Compaction is likely to fail */
+ /* Compaction is likely to fail */
+ if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
return ret;
- case COMPACT_CONTINUE:
- /* Fall through to compaction */
- ;
- }
+
+ /* huh, compaction_suitable is returning something unexpected */
+ VM_BUG_ON(ret != COMPACT_CONTINUE);
/*
* Clear pageblock skip if there were failures recently and compaction
@@ -1415,6 +1452,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
+
+ if (cc->migrate_pfn == start_pfn)
+ cc->whole_zone = true;
+
cc->last_migrated_pfn = 0;
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
@@ -1530,11 +1571,11 @@ out:
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone, int order,
+static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
unsigned int alloc_flags, int classzone_idx)
{
- unsigned long ret;
+ enum compact_result ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
@@ -1572,7 +1613,7 @@ int sysctl_extfrag_threshold = 500;
*
* This is the main entry point for direct page compaction.
*/
-unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended)
{
@@ -1580,7 +1621,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
- int rc = COMPACT_DEFERRED;
+ enum compact_result rc = COMPACT_SKIPPED;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
@@ -1594,11 +1635,13 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
- int status;
+ enum compact_result status;
int zone_contended;
- if (compaction_deferred(zone, order))
+ if (compaction_deferred(zone, order)) {
+ rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
+ }
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
@@ -1634,7 +1677,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
goto break_loop;
}
- if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
+ if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
+ status == COMPACT_PARTIAL_SKIPPED)) {
/*
* We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up
@@ -1669,7 +1713,7 @@ break_loop:
* If at least one zone wasn't deferred or skipped, we report if all
* zones that were tried were lock contended.
*/
- if (rc > COMPACT_SKIPPED && all_zones_contended)
+ if (rc > COMPACT_INACTIVE && all_zones_contended)
*contended = COMPACT_CONTENDED_LOCK;
return rc;
@@ -1818,7 +1862,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
struct zone *zone;
enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
- for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
@@ -1853,7 +1897,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.classzone_idx);
count_vm_event(KCOMPACTD_WAKE);
- for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
int status;
zone = &pgdat->node_zones[zoneid];
@@ -1881,7 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.classzone_idx, 0)) {
success = true;
compaction_defer_reset(zone, cc.order, false);
- } else if (status == COMPACT_COMPLETE) {
+ } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
/*
* We use sync migration mode here, so we defer like
* sync direct compaction does.
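
The try_to_compact_pages() hunks above initialize rc to COMPACT_SKIPPED and fold per-zone results in with max_t(), so the caller ends up reporting the most meaningful outcome across the zonelist. A small hedged sketch of that aggregation pattern follows; the demo_result enum is an illustrative ordering, not the kernel's compact_result values.

/*
 * Demo of aggregating per-zone outcomes by taking the maximum over an
 * ordered enum, as the max_t() change above does. Demo ordering only.
 */
#include <stdio.h>

enum demo_result { DEMO_SKIPPED, DEMO_DEFERRED, DEMO_COMPLETE, DEMO_PARTIAL };

static enum demo_result max_result(enum demo_result a, enum demo_result b)
{
	return a > b ? a : b;
}

int main(void)
{
	enum demo_result per_zone[] = { DEMO_DEFERRED, DEMO_SKIPPED, DEMO_COMPLETE };
	enum demo_result rc = DEMO_SKIPPED;            /* like the new init */

	for (unsigned int i = 0; i < sizeof(per_zone) / sizeof(per_zone[0]); i++)
		rc = max_result(rc, per_zone[i]);

	printf("aggregated rc = %d\n", rc);            /* 2 == DEMO_COMPLETE */
	return 0;
}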
diff --git a/mm/filemap.c b/mm/filemap.c
index 01690338e3d2..9665b1d4f318 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
struct radix_tree_node *node;
- unsigned long index;
- unsigned int offset;
- unsigned int tag;
- void **slot;
VM_BUG_ON(!PageLocked(page));
- __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+ node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index,
+ shadow);
if (shadow) {
mapping->nrexceptional++;
@@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
}
mapping->nrpages--;
- if (!node) {
- /* Clear direct pointer tags in root node */
- mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
- radix_tree_replace_slot(slot, shadow);
+ if (!node)
return;
- }
-
- /* Clear tree tags for the removed page */
- index = page->index;
- offset = index & RADIX_TREE_MAP_MASK;
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
- if (test_bit(offset, node->tags[tag]))
- radix_tree_tag_clear(&mapping->page_tree, index, tag);
- }
- /* Delete page, swap shadow entry */
- radix_tree_replace_slot(slot, shadow);
workingset_node_pages_dec(node);
if (shadow)
workingset_node_shadows_inc(node);
@@ -713,8 +696,12 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
* The page might have been evicted from cache only
* recently, in which case it should be activated like
* any other repeatedly accessed page.
+ * The exception is pages getting rewritten; evicting other
+ * data from the working set, only to cache data that will
+ * get overwritten with something else, is a waste of memory.
*/
- if (shadow && workingset_refault(shadow)) {
+ if (!(gfp_mask & __GFP_WRITE) &&
+ shadow && workingset_refault(shadow)) {
SetPageActive(page);
workingset_activation(page);
} else
@@ -2187,7 +2174,7 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
- do_set_pte(vma, addr, page, pte, false, false);
+ do_set_pte(vma, addr, page, pte, false, false, true);
unlock_page(page);
goto next;
unlock:
@@ -2574,7 +2561,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
struct page *page;
- int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
+ int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
if (flags & AOP_FLAG_NOFS)
fgp_flags |= FGP_NOFS;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 66675eed67be..41ef7547e822 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,6 +89,7 @@ static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static unsigned long khugepaged_sleep_expire;
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
@@ -467,6 +468,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
return -EINVAL;
khugepaged_scan_sleep_millisecs = msecs;
+ khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
@@ -494,6 +496,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
return -EINVAL;
khugepaged_alloc_sleep_millisecs = msecs;
+ khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
@@ -764,10 +767,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
- pmd_t entry;
- entry = mk_pmd(page, prot);
- entry = pmd_mkhuge(entry);
- return entry;
+ return pmd_mkhuge(mk_pmd(page, prot));
}
static inline struct list_head *page_deferred_list(struct page *page)
@@ -2794,15 +2794,25 @@ static void khugepaged_do_scan(void)
put_page(hpage);
}
+static bool khugepaged_should_wakeup(void)
+{
+ return kthread_should_stop() ||
+ time_after_eq(jiffies, khugepaged_sleep_expire);
+}
+
static void khugepaged_wait_work(void)
{
if (khugepaged_has_work()) {
- if (!khugepaged_scan_sleep_millisecs)
+ const unsigned long scan_sleep_jiffies =
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
+
+ if (!scan_sleep_jiffies)
return;
+ khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
wait_event_freezable_timeout(khugepaged_wait,
- kthread_should_stop(),
- msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+ khugepaged_should_wakeup(),
+ scan_sleep_jiffies);
return;
}
@@ -3026,8 +3036,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
return;
/*
- * Caller holds the mmap_sem write mode, so a huge pmd cannot
- * materialize from under us.
+ * Caller holds the mmap_sem write mode or the anon_vma lock,
+ * so a huge pmd cannot materialize from under us (khugepaged
+ * holds both the mmap_sem write mode and the anon_vma lock
+ * write mode).
*/
__split_huge_pmd(vma, pmd, address, freeze);
}
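
The khugepaged changes above add a sleep deadline (khugepaged_sleep_expire) that the sysfs stores reset to 0 so a newly written interval takes effect immediately. The sketch below models the wakeup predicate with the wrap-safe comparison idiom that time_after_eq() uses; the tick values are made up for the demo and are not kernel jiffies.

/*
 * Sketch of the "deadline plus reset" wakeup pattern used by
 * khugepaged_should_wakeup(). Demo values only.
 */
#include <stdbool.h>
#include <stdio.h>

static bool demo_time_after_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;     /* wrap-safe "a is at or past b" */
}

static unsigned long sleep_expire;     /* 0 means "wake up right away" */

static bool should_wakeup(unsigned long now)
{
	return demo_time_after_eq(now, sleep_expire);
}

int main(void)
{
	unsigned long now = 1000;

	sleep_expire = now + 250;                  /* armed deadline         */
	printf("%d\n", should_wakeup(now));        /* 0: keep sleeping       */
	printf("%d\n", should_wakeup(now + 300));  /* 1: deadline passed     */

	sleep_expire = 0;                          /* a sysfs store resets it */
	printf("%d\n", should_wakeup(now));        /* 1: wake immediately    */
	return 0;
}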
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d8fb10de0f14..eec1150125b9 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -67,26 +67,42 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
return false;
}
+static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
+ struct hugetlb_cgroup *parent_h_cgroup)
+{
+ int idx;
+
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
+ struct page_counter *counter = &h_cgroup->hugepage[idx];
+ struct page_counter *parent = NULL;
+ unsigned long limit;
+ int ret;
+
+ if (parent_h_cgroup)
+ parent = &parent_h_cgroup->hugepage[idx];
+ page_counter_init(counter, parent);
+
+ limit = round_down(PAGE_COUNTER_MAX,
+ 1 << huge_page_order(&hstates[idx]));
+ ret = page_counter_limit(counter, limit);
+ VM_BUG_ON(ret);
+ }
+}
+
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup;
- int idx;
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
- if (parent_h_cgroup) {
- for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
- page_counter_init(&h_cgroup->hugepage[idx],
- &parent_h_cgroup->hugepage[idx]);
- } else {
+ if (!parent_h_cgroup)
root_h_cgroup = h_cgroup;
- for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
- page_counter_init(&h_cgroup->hugepage[idx], NULL);
- }
+
+ hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css;
}
@@ -285,6 +301,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret;
idx = MEMFILE_IDX(of_cft(of)->private);
+ nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
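
The hugetlb_cgroup hunks above round both the initial counter limit and user-supplied limits down to a whole number of huge pages, so a limit can never cover a fraction of a huge page. A small arithmetic sketch, assuming 2 MiB huge pages built from 512 base pages (order 9); the numbers are examples, not values read from hstates[].

/*
 * Demo of rounding a page-counter limit down to a huge-page multiple.
 */
#include <stdio.h>

#define DEMO_HUGE_ORDER 9u                       /* assumed: 2 MiB huge pages */

static unsigned long round_down_to(unsigned long x, unsigned long align)
{
	/* Works for any alignment; the kernel's round_down() wants a power of two. */
	return x - (x % align);
}

int main(void)
{
	unsigned long huge_in_base_pages = 1ul << DEMO_HUGE_ORDER;   /* 512 */
	unsigned long requested = 1000;          /* limit in base pages */
	unsigned long limit = round_down_to(requested, huge_in_base_pages);

	printf("%lu base pages -> %lu (= %lu huge pages)\n",
	       requested, limit, limit / huge_in_base_pages);   /* 1000 -> 512 (= 1) */
	return 0;
}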
diff --git a/mm/internal.h b/mm/internal.h
index 3ac544f1963f..f6f3353b0868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -174,6 +174,7 @@ struct compact_control {
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool direct_compaction; /* False from kcompactd or /proc/... */
+ bool whole_zone; /* Whole zone has been scanned */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 131daadf40e4..1548749a3d45 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg
CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
obj-y := kasan.o report.o kasan_init.o
+obj-$(CONFIG_SLAB) += quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 38f1dd79acdb..18b6a2b8d183 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -273,32 +273,48 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-
-static __always_inline void check_memory_region(unsigned long addr,
- size_t size, bool write)
+static __always_inline void check_memory_region_inline(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
{
if (unlikely(size == 0))
return;
if (unlikely((void *)addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- kasan_report(addr, size, write, _RET_IP_);
+ kasan_report(addr, size, write, ret_ip);
return;
}
if (likely(!memory_is_poisoned(addr, size)))
return;
- kasan_report(addr, size, write, _RET_IP_);
+ kasan_report(addr, size, write, ret_ip);
+}
+
+static void check_memory_region(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
+{
+ check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_check_read(const void *p, unsigned int size)
+{
+ check_memory_region((unsigned long)p, size, false, _RET_IP_);
}
+EXPORT_SYMBOL(kasan_check_read);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
+void kasan_check_write(const void *p, unsigned int size)
+{
+ check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- __asan_storeN((unsigned long)addr, len);
+ check_memory_region((unsigned long)addr, len, true, _RET_IP_);
return __memset(addr, c, len);
}
@@ -306,8 +322,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- __asan_loadN((unsigned long)src, len);
- __asan_storeN((unsigned long)dest, len);
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
return __memmove(dest, src, len);
}
@@ -315,8 +331,8 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- __asan_loadN((unsigned long)src, len);
- __asan_storeN((unsigned long)dest, len);
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
return __memcpy(dest, src, len);
}
@@ -388,6 +404,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
}
#endif
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
+void kasan_cache_destroy(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
@@ -482,7 +508,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
kasan_kmalloc(cache, object, cache->object_size, flags);
}
-void kasan_slab_free(struct kmem_cache *cache, void *object)
+void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
{
unsigned long size = cache->object_size;
unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
@@ -491,18 +517,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return;
+ kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object)
+{
#ifdef CONFIG_SLAB
- if (cache->flags & SLAB_KASAN) {
- struct kasan_free_meta *free_info =
- get_free_info(cache, object);
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+ return false;
+
+ if (likely(cache->flags & SLAB_KASAN)) {
struct kasan_alloc_meta *alloc_info =
get_alloc_info(cache, object);
- alloc_info->state = KASAN_STATE_FREE;
- set_track(&free_info->track, GFP_NOWAIT);
+ struct kasan_free_meta *free_info =
+ get_free_info(cache, object);
+
+ switch (alloc_info->state) {
+ case KASAN_STATE_ALLOC:
+ alloc_info->state = KASAN_STATE_QUARANTINE;
+ quarantine_put(free_info, cache);
+ set_track(&free_info->track, GFP_NOWAIT);
+ kasan_poison_slab_free(cache, object);
+ return true;
+ case KASAN_STATE_QUARANTINE:
+ case KASAN_STATE_FREE:
+ pr_err("Double free");
+ dump_stack();
+ break;
+ default:
+ break;
+ }
}
+ return false;
+#else
+ kasan_poison_slab_free(cache, object);
+ return false;
#endif
-
- kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
}
void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -511,6 +562,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
unsigned long redzone_start;
unsigned long redzone_end;
+ if (flags & __GFP_RECLAIM)
+ quarantine_reduce();
+
if (unlikely(object == NULL))
return;
@@ -541,6 +595,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
unsigned long redzone_start;
unsigned long redzone_end;
+ if (flags & __GFP_RECLAIM)
+ quarantine_reduce();
+
if (unlikely(ptr == NULL))
return;
@@ -649,22 +706,22 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size)
}
EXPORT_SYMBOL(__asan_unregister_globals);
-#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
- { \
- check_memory_region(addr, size, false); \
- } \
- EXPORT_SYMBOL(__asan_load##size); \
- __alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
- EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
- { \
- check_memory_region(addr, size, true); \
- } \
- EXPORT_SYMBOL(__asan_store##size); \
- __alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, false, _RET_IP_);\
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
@@ -675,7 +732,7 @@ DEFINE_ASAN_LOAD_STORE(16);
void __asan_loadN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, false);
+ check_memory_region(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
@@ -685,7 +742,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort);
void __asan_storeN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, true);
+ check_memory_region(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 30a2f0ba0e09..7f7ac51d7faf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -62,6 +62,7 @@ struct kasan_global {
enum kasan_state {
KASAN_STATE_INIT,
KASAN_STATE_ALLOC,
+ KASAN_STATE_QUARANTINE,
KASAN_STATE_FREE
};
@@ -79,9 +80,14 @@ struct kasan_alloc_meta {
u32 reserved;
};
+struct qlist_node {
+ struct qlist_node *next;
+};
struct kasan_free_meta {
- /* Allocator freelist pointer, unused by KASAN. */
- void **freelist;
+ /* This field is used while the object is in the quarantine.
+ * Otherwise it might be used for the allocator freelist.
+ */
+ struct qlist_node quarantine_link;
struct kasan_track track;
};
@@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void)
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
+#ifdef CONFIG_SLAB
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+void quarantine_reduce(void);
+void quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline void quarantine_put(struct kasan_free_meta *info,
+ struct kmem_cache *cache) { }
+static inline void quarantine_reduce(void) { }
+static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+#endif
+
#endif
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
new file mode 100644
index 000000000000..4973505a9bdd
--- /dev/null
+++ b/mm/kasan/quarantine.c
@@ -0,0 +1,291 @@
+/*
+ * KASAN quarantine.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/shrinker.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "../slab.h"
+#include "kasan.h"
+
+/* Data structure and operations for quarantine queues. */
+
+/*
+ * Each queue is a singly-linked list, which also stores the total size of
+ * objects inside of it.
+ */
+struct qlist_head {
+ struct qlist_node *head;
+ struct qlist_node *tail;
+ size_t bytes;
+};
+
+#define QLIST_INIT { NULL, NULL, 0 }
+
+static bool qlist_empty(struct qlist_head *q)
+{
+ return !q->head;
+}
+
+static void qlist_init(struct qlist_head *q)
+{
+ q->head = q->tail = NULL;
+ q->bytes = 0;
+}
+
+static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
+ size_t size)
+{
+ if (unlikely(qlist_empty(q)))
+ q->head = qlink;
+ else
+ q->tail->next = qlink;
+ q->tail = qlink;
+ qlink->next = NULL;
+ q->bytes += size;
+}
+
+static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
+{
+ if (unlikely(qlist_empty(from)))
+ return;
+
+ if (qlist_empty(to)) {
+ *to = *from;
+ qlist_init(from);
+ return;
+ }
+
+ to->tail->next = from->head;
+ to->tail = from->tail;
+ to->bytes += from->bytes;
+
+ qlist_init(from);
+}
+
+static void qlist_move(struct qlist_head *from, struct qlist_node *last,
+ struct qlist_head *to, size_t size)
+{
+ if (unlikely(last == from->tail)) {
+ qlist_move_all(from, to);
+ return;
+ }
+ if (qlist_empty(to))
+ to->head = from->head;
+ else
+ to->tail->next = from->head;
+ to->tail = last;
+ from->head = last->next;
+ last->next = NULL;
+ from->bytes -= size;
+ to->bytes += size;
+}
+
+
+/*
+ * The object quarantine consists of per-cpu queues and a global queue,
+ * guarded by quarantine_lock.
+ */
+static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
+
+static struct qlist_head global_quarantine;
+static DEFINE_SPINLOCK(quarantine_lock);
+
+/* Maximum size of the global queue. */
+static unsigned long quarantine_size;
+
+/*
+ * The fraction of physical memory the quarantine is allowed to occupy.
+ * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep
+ * the ratio low to avoid OOM.
+ */
+#define QUARANTINE_FRACTION 32
+
+#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+
+static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
+{
+ return virt_to_head_page(qlink)->slab_cache;
+}
+
+static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+ struct kasan_free_meta *free_info =
+ container_of(qlink, struct kasan_free_meta,
+ quarantine_link);
+
+ return ((void *)free_info) - cache->kasan_info.free_meta_offset;
+}
+
+static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+ void *object = qlink_to_object(qlink, cache);
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ alloc_info->state = KASAN_STATE_FREE;
+ ___cache_free(cache, object, _THIS_IP_);
+ local_irq_restore(flags);
+}
+
+static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+{
+ struct qlist_node *qlink;
+
+ if (unlikely(qlist_empty(q)))
+ return;
+
+ qlink = q->head;
+ while (qlink) {
+ struct kmem_cache *obj_cache =
+ cache ? cache : qlink_to_cache(qlink);
+ struct qlist_node *next = qlink->next;
+
+ qlink_free(qlink, obj_cache);
+ qlink = next;
+ }
+ qlist_init(q);
+}
+
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+{
+ unsigned long flags;
+ struct qlist_head *q;
+ struct qlist_head temp = QLIST_INIT;
+
+ local_irq_save(flags);
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ qlist_put(q, &info->quarantine_link, cache->size);
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+ qlist_move_all(q, &temp);
+
+ local_irq_restore(flags);
+
+ if (unlikely(!qlist_empty(&temp))) {
+ spin_lock_irqsave(&quarantine_lock, flags);
+ qlist_move_all(&temp, &global_quarantine);
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+ }
+}
+
+void quarantine_reduce(void)
+{
+ size_t new_quarantine_size;
+ unsigned long flags;
+ struct qlist_head to_free = QLIST_INIT;
+ size_t size_to_free = 0;
+ struct qlist_node *last;
+
+ if (likely(READ_ONCE(global_quarantine.bytes) <=
+ READ_ONCE(quarantine_size)))
+ return;
+
+ spin_lock_irqsave(&quarantine_lock, flags);
+
+ /*
+ * Update quarantine size in case of hotplug. Allocate a fraction of
+ * the installed memory to quarantine minus per-cpu queue limits.
+ */
+ new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ QUARANTINE_FRACTION;
+ new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus();
+ WRITE_ONCE(quarantine_size, new_quarantine_size);
+
+ last = global_quarantine.head;
+ while (last) {
+ struct kmem_cache *cache = qlink_to_cache(last);
+
+ size_to_free += cache->size;
+ if (!last->next || size_to_free >
+ global_quarantine.bytes - QUARANTINE_LOW_SIZE)
+ break;
+ last = last->next;
+ }
+ qlist_move(&global_quarantine, last, &to_free, size_to_free);
+
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+
+ qlist_free_all(&to_free, NULL);
+}
+
+static void qlist_move_cache(struct qlist_head *from,
+ struct qlist_head *to,
+ struct kmem_cache *cache)
+{
+ struct qlist_node *prev = NULL, *curr;
+
+ if (unlikely(qlist_empty(from)))
+ return;
+
+ curr = from->head;
+ while (curr) {
+ struct qlist_node *qlink = curr;
+ struct kmem_cache *obj_cache = qlink_to_cache(qlink);
+
+ if (obj_cache == cache) {
+ if (unlikely(from->head == qlink)) {
+ from->head = curr->next;
+ prev = curr;
+ } else
+ prev->next = curr->next;
+ if (unlikely(from->tail == qlink))
+ from->tail = curr->next;
+ from->bytes -= cache->size;
+ qlist_put(to, qlink, cache->size);
+ } else {
+ prev = curr;
+ }
+ curr = curr->next;
+ }
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
+ struct kmem_cache *cache = arg;
+ struct qlist_head to_free = QLIST_INIT;
+ struct qlist_head *q;
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ qlist_move_cache(q, &to_free, cache);
+ qlist_free_all(&to_free, cache);
+}
+
+void quarantine_remove_cache(struct kmem_cache *cache)
+{
+ unsigned long flags;
+ struct qlist_head to_free = QLIST_INIT;
+
+ on_each_cpu(per_cpu_remove_cache, cache, 1);
+
+ spin_lock_irqsave(&quarantine_lock, flags);
+ qlist_move_cache(&global_quarantine, &to_free, cache);
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+
+ qlist_free_all(&to_free, cache);
+}
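
For a feel of the quarantine sizing computed in quarantine_reduce() above (a 1/32 fraction of RAM, minus the 1 MiB per-cpu queues, trimmed back down to 3/4 of the budget when it overflows), here is a back-of-the-envelope sketch with assumed machine parameters (8 GiB of RAM, 4 CPUs), not values read from the system.

/*
 * Arithmetic sketch of the KASAN quarantine budget. Inputs are assumptions.
 */
#include <stdio.h>

#define MIB (1024ull * 1024)
#define DEMO_QUARANTINE_FRACTION  32ull
#define DEMO_PERCPU_SIZE          (1ull << 20)       /* 1 MiB per cpu */

int main(void)
{
	unsigned long long totalram = 8192 * MIB;     /* assumed 8 GiB  */
	unsigned int cpus = 4;                        /* assumed 4 cpus */

	unsigned long long size = totalram / DEMO_QUARANTINE_FRACTION
				  - (unsigned long long)cpus * DEMO_PERCPU_SIZE;
	unsigned long long low  = size * 3 / 4;       /* like QUARANTINE_LOW_SIZE */

	printf("global quarantine: %llu MiB, reduced down to %llu MiB\n",
	       size / MIB, low / MIB);                /* 252 MiB, 189 MiB */
	return 0;
}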
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 60869a5a0124..b3c122ddd454 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page,
print_track(&alloc_info->track);
break;
case KASAN_STATE_FREE:
+ case KASAN_STATE_QUARANTINE:
pr_err("Object freed, allocated with size %u bytes\n",
alloc_info->alloc_size);
free_info = get_free_info(cache, object);
diff --git a/mm/memblock.c b/mm/memblock.c
index b570dddb4cb9..ac1248933b31 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -606,22 +606,14 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
-static int __init_memblock memblock_add_region(phys_addr_t base,
- phys_addr_t size,
- int nid,
- unsigned long flags)
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
- flags, (void *)_RET_IP_);
-
- return memblock_add_range(&memblock.memory, base, size, nid, flags);
-}
+ 0UL, (void *)_RET_IP_);
-int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
-{
- return memblock_add_region(base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
/**
@@ -732,22 +724,14 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-static int __init_memblock memblock_reserve_region(phys_addr_t base,
- phys_addr_t size,
- int nid,
- unsigned long flags)
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
- flags, (void *)_RET_IP_);
-
- return memblock_add_range(&memblock.reserved, base, size, nid, flags);
-}
+ 0UL, (void *)_RET_IP_);
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
-{
- return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
/**
@@ -840,7 +824,7 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
{
struct memblock_type *type = &memblock.reserved;
- if (*idx >= 0 && *idx < type->cnt) {
+ if (*idx < type->cnt) {
struct memblock_region *r = &type->regions[*idx];
phys_addr_t base = r->base;
phys_addr_t size = r->size;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d71d387868e6..b3f16ab4b431 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2652,8 +2652,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
}
/*
- * Reclaims as many pages from the given memcg as possible and moves
- * the rest to the parent.
+ * Reclaims as many pages from the given memcg as possible.
*
* Caller is responsible for holding css reference for memcg.
*/
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ca5acee53b7a..2fcca6b0e005 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,8 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
struct siginfo si;
int ret;
- pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_addr = (void *)addr;
@@ -208,7 +208,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
}
if (ret < 0)
- pr_info("MCE: Error sending signal to %s:%d: %d\n",
+ pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
return ret;
}
@@ -289,7 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- pr_err("MCE: Out of memory while machine check handling\n");
+ pr_err("Memory failure: Out of memory while machine check handling\n");
return;
}
}
@@ -303,7 +303,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_info("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -334,7 +334,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr_valid == 0) {
- pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
force_sig(SIGKILL, tk->tsk);
}
@@ -347,7 +347,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
*/
else if (kill_proc(tk->tsk, tk->addr, trapno,
pfn, page, flags) < 0)
- pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
@@ -559,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
*/
static int me_unknown(struct page *p, unsigned long pfn)
{
- pr_err("MCE %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
return MF_FAILED;
}
@@ -604,11 +604,12 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (mapping->a_ops->error_remove_page) {
err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- pr_info("MCE %#lx: Failed to punch page: %d\n",
+ pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_info("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("Memory failure: %#lx: failed to release buffers\n",
+ pfn);
} else {
ret = MF_RECOVERED;
}
@@ -620,7 +621,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- pr_info("MCE %#lx: Failed to invalidate\n", pfn);
+ pr_info("Memory failure: %#lx: Failed to invalidate\n",
+ pfn);
}
return ret;
}
@@ -833,7 +835,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- pr_err("MCE %#lx: recovery action for %s: %s\n",
+ pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -849,7 +851,7 @@ static int page_action(struct page_state *ps, struct page *p,
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
- pr_err("MCE %#lx: %s still referenced by %d users\n",
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
@@ -882,7 +884,7 @@ int get_hwpoison_page(struct page *page)
* tries to touch the "partially handled" page.
*/
if (!PageAnon(head)) {
- pr_err("MCE: %#lx: non anonymous thp\n",
+ pr_err("Memory failure: %#lx: non anonymous thp\n",
page_to_pfn(page));
return 0;
}
@@ -892,7 +894,8 @@ int get_hwpoison_page(struct page *page)
if (head == compound_head(page))
return 1;
- pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page));
+ pr_info("Memory failure: %#lx cannot catch tail\n",
+ page_to_pfn(page));
put_page(head);
}
@@ -931,12 +934,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
return SWAP_SUCCESS;
if (PageKsm(p)) {
- pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
+ pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
return SWAP_FAIL;
}
if (PageSwapCache(p)) {
- pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
+ pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -954,7 +958,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -972,7 +976,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
- pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
/*
@@ -1040,14 +1044,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
if (!pfn_valid(pfn)) {
- pr_err("MCE %#lx: memory outside kernel control\n", pfn);
+ pr_err("Memory failure: %#lx: memory outside kernel control\n",
+ pfn);
return -ENXIO;
}
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
- pr_err("MCE %#lx: already hardware poisoned\n", pfn);
+ pr_err("Memory failure: %#lx: already hardware poisoned\n",
+ pfn);
return 0;
}
@@ -1112,9 +1118,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
unlock_page(hpage);
if (!PageAnon(hpage))
- pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+ pr_err("Memory failure: %#lx: non anonymous thp\n",
+ pfn);
else
- pr_err("MCE: %#lx: thp split failed\n", pfn);
+ pr_err("Memory failure: %#lx: thp split failed\n",
+ pfn);
if (TestClearPageHWPoison(p))
num_poisoned_pages_sub(nr_pages);
put_hwpoison_page(p);
@@ -1178,7 +1186,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
- pr_err("MCE %#lx: just unpoisoned\n", pfn);
+ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
put_hwpoison_page(hpage);
@@ -1395,25 +1403,25 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+ unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (page_count(page) > 1) {
- unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+ unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (page_mapped(page)) {
- unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+ unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (page_mapping(page)) {
- unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
return 0;
}
@@ -1424,7 +1432,7 @@ int unpoison_memory(unsigned long pfn)
* In such case, we yield to memory_failure() and make unpoison fail.
*/
if (!PageHuge(page) && PageTransHuge(page)) {
- unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+ unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
pfn, &unpoison_rs);
return 0;
}
@@ -1439,13 +1447,13 @@ int unpoison_memory(unsigned long pfn)
* to the end.
*/
if (PageHuge(page)) {
- unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+ unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
- unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+ unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
@@ -1458,7 +1466,7 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
pfn, &unpoison_rs);
num_poisoned_pages_sub(nr_pages);
freeit = 1;
diff --git a/mm/memory.c b/mm/memory.c
index 07493e34ab7e..a1b93d9e4449 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1744,6 +1744,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
+ unsigned long remap_pfn = pfn;
int err;
/*
@@ -1770,7 +1771,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+ err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
if (err)
return -EINVAL;
@@ -1789,7 +1790,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
} while (pgd++, addr = next, addr != end);
if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size));
+ untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
return err;
}
@@ -2875,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
* vm_ops->map_pages.
*/
void do_set_pte(struct vm_area_struct *vma, unsigned long address,
- struct page *page, pte_t *pte, bool write, bool anon)
+ struct page *page, pte_t *pte, bool write, bool anon, bool old)
{
pte_t entry;
@@ -2883,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = mk_pte(page, vma->vm_page_prot);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (old)
+ entry = pte_mkold(entry);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address, false);
@@ -2896,8 +2899,16 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
update_mmu_cache(vma, address, pte);
}
+/*
+ * If the architecture emulates the "accessed" or "young" bit without HW
+ * support, there is not much gain from fault_around.
+ */
static unsigned long fault_around_bytes __read_mostly =
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+ PAGE_SIZE;
+#else
rounddown_pow_of_two(65536);
+#endif
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
@@ -3020,9 +3031,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
goto unlock_out;
+ do_fault_around(vma, address, pte, pgoff, flags);
+ /* Check if the fault is handled by faultaround */
+ if (!pte_same(*pte, orig_pte)) {
+ /*
+ * Faultaround produces old ptes, but the pte we've
+ * handled the fault for should be young.
+ */
+ pte_t entry = pte_mkyoung(*pte);
+ if (ptep_set_access_flags(vma, address, pte, entry, 0))
+ update_mmu_cache(vma, address, pte);
+ goto unlock_out;
+ }
pte_unmap_unlock(pte, ptl);
}
@@ -3037,7 +3059,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(fault_page);
return ret;
}
- do_set_pte(vma, address, fault_page, pte, false, false);
+ do_set_pte(vma, address, fault_page, pte, false, false, false);
unlock_page(fault_page);
unlock_out:
pte_unmap_unlock(pte, ptl);
@@ -3089,7 +3111,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
goto uncharge_out;
}
- do_set_pte(vma, address, new_page, pte, true, true);
+ do_set_pte(vma, address, new_page, pte, true, true, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
@@ -3146,7 +3168,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(fault_page);
return ret;
}
- do_set_pte(vma, address, fault_page, pte, true, false);
+ do_set_pte(vma, address, fault_page, pte, true, false, false);
pte_unmap_unlock(pte, ptl);
if (set_page_dirty(fault_page))
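
The new fault_around default above is rounddown_pow_of_two(65536) bytes, unless the architecture lacks hardware accessed-bit updates, in which case it falls back to a single page. The sketch below just works out what that default means in ptes mapped per fault, assuming 4 KiB base pages; the page size is an example value.

/*
 * Demo of the fault-around window size implied by the new default.
 */
#include <stdio.h>

static unsigned long demo_rounddown_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long page_size = 4096;               /* assumed base page size */
	unsigned long fault_around_bytes = demo_rounddown_pow_of_two(65536);

	printf("fault-around maps up to %lu ptes per fault\n",
	       fault_around_bytes / page_size);       /* 16 */
	return 0;
}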
diff --git a/mm/mempool.c b/mm/mempool.c
index 9b7a14a791cc..9e075f829d0d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element)
static void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab)
- kasan_slab_free(pool->pool_data, element);
+ kasan_poison_slab_free(pool->pool_data, element);
if (pool->alloc == mempool_kmalloc)
kasan_kfree(element);
if (pool->alloc == mempool_alloc_pages)
diff --git a/mm/migrate.c b/mm/migrate.c
index 53ab6398e7a2..9baf41c877ff 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1171,6 +1171,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
switch(rc) {
case -ENOMEM:
+ nr_failed++;
goto out;
case -EAGAIN:
retry++;
diff --git a/mm/mmap.c b/mm/mmap.c
index fba246b8f1a5..b9274a0c82c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -66,7 +66,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif
-static bool ignore_rlimit_data = true;
+static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm,
@@ -2886,13 +2886,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
- if (ignore_rlimit_data)
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
+ /* Workaround for Valgrind */
+ if (rlimit(RLIMIT_DATA) == 0 &&
+ mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
+ return true;
+ if (!ignore_rlimit_data) {
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
- else
return false;
+ }
}
return true;
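
The may_expand_vm() change above enforces RLIMIT_DATA again, with a carve-out for a zero soft limit (the Valgrind case noted in the comment) as long as the mapping still fits under the hard limit. A small userspace sketch that merely prints the two limits involved; the enforcement itself stays inside the kernel.

/*
 * Print the RLIMIT_DATA soft and hard limits that the kernel-side check
 * above consults.
 */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_DATA, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}

	/* A zero soft limit with a larger hard limit is the tolerated case. */
	printf("RLIMIT_DATA soft=%llu hard=%llu\n",
	       (unsigned long long)rl.rlim_cur,
	       (unsigned long long)rl.rlim_max);
	return 0;
}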
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 415f7eb913fa..5bb2f7698ad7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
+ /*
+ * Do not even consider tasks which are explicitly marked oom
+ * unkillable or have already been oom reaped.
+ */
adj = (long)p->signal->oom_score_adj;
- if (adj == OOM_SCORE_ADJ_MIN) {
+ if (adj == OOM_SCORE_ADJ_MIN ||
+ test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
task_unlock(p);
return 0;
}
@@ -278,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
* This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves.
*/
- if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
- }
- if (!task->mm)
- return OOM_SCAN_CONTINUE;
+ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
+ return OOM_SCAN_ABORT;
/*
* If task is allocating a lot of memory and has been marked to be
@@ -302,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
static struct task_struct *select_bad_process(struct oom_control *oc,
unsigned int *ppoints, unsigned long totalpages)
{
- struct task_struct *g, *p;
+ struct task_struct *p;
struct task_struct *chosen = NULL;
unsigned long chosen_points = 0;
rcu_read_lock();
- for_each_process_thread(g, p) {
+ for_each_process(p) {
unsigned int points;
switch (oom_scan_process_thread(oc, p, totalpages)) {
@@ -326,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
- /* Prefer thread group leaders for display purposes */
- if (points == chosen_points && thread_group_leader(chosen))
- continue;
chosen = p;
chosen_points = points;
@@ -441,7 +439,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-
static bool __oom_reap_task(struct task_struct *tsk)
{
struct mmu_gather tlb;
@@ -513,9 +510,14 @@ static bool __oom_reap_task(struct task_struct *tsk)
* This task can be safely ignored because we cannot do much more
* to release its memory.
*/
- tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+ set_bit(MMF_OOM_REAPED, &mm->flags);
out:
- mmput(mm);
+ /*
+ * Drop our reference but make sure the mmput slow path is called from a
+ * different context because we shouldn't risk getting stuck there and
+ * putting the oom_reaper out of the way.
+ */
+ mmput_async(mm);
return ret;
}
@@ -664,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk)
/* OOM killer might race with memcg OOM */
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
return;
+ atomic_inc(&tsk->signal->oom_victims);
/*
* Make sure that the task is woken up from uninterruptible sleep
* if it is frozen because OOM killer wouldn't be able to free
@@ -681,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk)
{
if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
return;
+ atomic_dec(&tsk->signal->oom_victims);
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
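
A minimal sketch of what the new signal_struct counter gives the scan path (illustrative only; the helper name is hypothetical, the patch itself simply open-codes the atomic_read() in oom_scan_process_thread()):

	/* true while any thread of the group is still an unfinished OOM victim */
	static inline bool group_has_oom_victim(struct signal_struct *sig)
	{
		return atomic_read(&sig->oom_victims) > 0;
	}

mark_oom_victim() increments the counter and exit_oom_victim() decrements it, so the scan no longer has to test TIF_MEMDIE on each individual thread.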
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3b88795ab46e..b9956fdee8f5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -411,8 +411,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
bg_thresh = thresh / 2;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- bg_thresh += bg_thresh / 4;
- thresh += thresh / 4;
+ bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
+ thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
dtc->thresh = thresh;
dtc->bg_thresh = bg_thresh;
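
To see how much headroom the hunk above grants PF_LESS_THROTTLE and realtime tasks, take some made-up numbers: a thresh of 4000 pages and a global_wb_domain.dirty_limit of 6400 pages give

	thresh    = 4000 + 4000 / 4 + 6400 / 32 = 5200 pages
	bg_thresh = 2000 + 2000 / 4 + 6400 / 32 = 2700 pages

so the boost is anchored to the global dirty limit and no longer shrinks to almost nothing when the per-task thresh itself is small.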
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c469c1dfb8b..f8f3bfc435ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -522,12 +522,6 @@ static void bad_page(struct page *page, const char *reason,
static unsigned long nr_shown;
static unsigned long nr_unshown;
- /* Don't complain about poisoned pages */
- if (PageHWPoison(page)) {
- page_mapcount_reset(page); /* remove PageBuddy */
- return;
- }
-
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
@@ -613,14 +607,7 @@ static int __init early_debug_pagealloc(char *buf)
{
if (!buf)
return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- _debug_pagealloc_enabled = true;
-
- if (strcmp(buf, "off") == 0)
- _debug_pagealloc_enabled = false;
-
- return 0;
+ return kstrtobool(buf, &_debug_pagealloc_enabled);
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1000,7 +987,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
- kasan_free_pages(page, order);
/*
* Check tail pages before head page information is cleared to
@@ -1042,6 +1028,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
+ kasan_free_pages(page, order);
return true;
}
@@ -1212,7 +1199,7 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -1661,6 +1648,9 @@ static void check_new_page_bad(struct page *page)
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
bad_flags = __PG_HWPOISON;
+ /* Don't complain about hwpoisoned pages */
+ page_mapcount_reset(page); /* remove PageBuddy */
+ return;
}
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
@@ -2750,10 +2740,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* one free page of a suitable size. Checking now avoids taking the zone lock
* to check in the allocation paths if no pages are free.
*/
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx,
- unsigned int alloc_flags,
- long free_pages)
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int classzone_idx, unsigned int alloc_flags,
+ long free_pages)
{
long min = mark;
int o;
@@ -3180,34 +3169,33 @@ out:
return page;
}
+
+/*
+ * Maximum number of compaction retries with progress before the OOM
+ * killer is considered the only way to move forward.
+ */
+#define MAX_COMPACT_RETRIES 16
+
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended_compaction,
- bool *deferred_compaction)
+ enum migrate_mode mode, enum compact_result *compact_result)
{
- unsigned long compact_result;
struct page *page;
+ int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
- compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, contended_compaction);
+ *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ mode, &contended_compaction);
current->flags &= ~PF_MEMALLOC;
- switch (compact_result) {
- case COMPACT_DEFERRED:
- *deferred_compaction = true;
- /* fall-through */
- case COMPACT_SKIPPED:
+ if (*compact_result <= COMPACT_INACTIVE)
return NULL;
- default:
- break;
- }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3233,19 +3221,112 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTFAIL);
+ /*
+ * In all zones where compaction was attempted (and not
+ * deferred or skipped), lock contention has been detected.
+ * For THP allocation we do not want to disrupt the others
+ * so we fall back to base pages instead.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ *compact_result = COMPACT_CONTENDED;
+
+ /*
+ * If compaction was aborted due to need_resched(), we do not
+ * want to further increase allocation latency, unless it is
+ * khugepaged trying to collapse.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_SCHED
+ && !(current->flags & PF_KTHREAD))
+ *compact_result = COMPACT_CONTENDED;
+
cond_resched();
return NULL;
}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result, enum migrate_mode *migrate_mode,
+ int compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+
+ if (!order)
+ return false;
+
+ /*
+ * compaction considers all the zones as desperately out of memory
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by weak migration mode.
+ */
+ if (compaction_failed(compact_result)) {
+ if (*migrate_mode == MIGRATE_ASYNC) {
+ *migrate_mode = MIGRATE_SYNC_LIGHT;
+ return true;
+ }
+ return false;
+ }
+
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+ * due to lock contention before we declare that we should give up.
+ * But do not retry if the given zonelist is not suitable for
+ * compaction.
+ */
+ if (compaction_withdrawn(compact_result))
+ return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+ /*
+ * !costly requests are much more important than __GFP_REPEAT
+ * costly ones because they are de facto nofail and invoke the OOM
+ * killer to move on while costly ones can fail and users are ready
+ * to cope with that. 1/4 retries is rather arbitrary but we
+ * would need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (compaction_retries <= max_retries)
+ return true;
+
+ return false;
+}
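
Concretely (assuming PAGE_ALLOC_COSTLY_ORDER == 3, its usual value): a non-costly order-2 request may retry compaction up to the full MAX_COMPACT_RETRIES == 16 times, while a costly __GFP_REPEAT order-9 THP request is only given 16 / 4 == 4 retries before the allocator stops retrying on compaction's account.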
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended_compaction,
- bool *deferred_compaction)
+ enum migrate_mode mode, enum compact_result *compact_result)
{
+ *compact_result = COMPACT_SKIPPED;
return NULL;
}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum migrate_mode *migrate_mode,
+ int compaction_retries)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * There are setups with compaction disabled which would prefer to loop
+ * inside the allocator rather than hit the oom killer prematurely.
+ * Let's give them good hope and keep retrying while the order-0
+ * watermarks are OK.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
+ ac_classzone_idx(ac), alloc_flags))
+ return true;
+ }
+ return false;
+}
#endif /* CONFIG_COMPACTION */
/* Perform direct synchronous page reclaim */
@@ -3377,6 +3458,101 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
}
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is considered the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round) and no_progress_loops (number of reclaim rounds without
+ * any progress in a row) is considered as well as the reclaimable pages on the
+ * applicable zone list (with a backoff mechanism which is a function of
+ * no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+ struct alloc_context *ac, int alloc_flags,
+ bool did_some_progress, int no_progress_loops)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure we converge to OOM if we cannot make any progress
+ * several times in the row.
+ */
+ if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ return false;
+
+ /*
+ * Keep reclaiming pages while there is a chance this will lead somewhere.
+ * If none of the target zones can satisfy our allocation request even
+ * if all reclaimable pages are considered then we are screwed and have
+ * to go OOM.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ unsigned long available;
+ unsigned long reclaimable;
+
+ available = reclaimable = zone_reclaimable_pages(zone);
+ available -= DIV_ROUND_UP(no_progress_loops * available,
+ MAX_RECLAIM_RETRIES);
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+ /*
+ * Would the allocation succeed if we reclaimed the whole
+ * available?
+ */
+ if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+ ac_classzone_idx(ac), alloc_flags, available)) {
+ /*
+ * If we didn't make any progress and have a lot of
+ * dirty + writeback pages then we should wait for
+ * an IO to complete to slow down the reclaim and
+ * prevent a premature OOM
+ */
+ if (!did_some_progress) {
+ unsigned long writeback;
+ unsigned long dirty;
+
+ writeback = zone_page_state_snapshot(zone,
+ NR_WRITEBACK);
+ dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+ if (2*(writeback + dirty) > reclaimable) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ return true;
+ }
+ }
+
+ /*
+ * Memory allocation/reclaim might be called from a WQ
+ * context and the current implementation of the WQ
+ * concurrency control doesn't recognize that
+ * a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to
+ * do a short sleep here rather than calling
+ * cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
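
The backoff above is linear in the number of fruitless reclaim rounds. A worked example with made-up zone numbers, 64000 reclaimable pages, 1000 free pages and no_progress_loops == 4 (MAX_RECLAIM_RETRIES == 16):

	available = 64000 - DIV_ROUND_UP(4 * 64000, 16) + 1000
	          = 64000 - 16000 + 1000 = 49000

so only about 3/4 of the reclaimable pages are still credited towards the watermark check; by the 16th fruitless round none of them are, and unless the free pages alone clear the min watermark the allocation falls through to the OOM path.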
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -3384,11 +3560,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
unsigned int alloc_flags;
- unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
- bool deferred_compaction = false;
- int contended_compaction = COMPACT_CONTENDED_NONE;
+ enum compact_result compact_result;
+ int compaction_retries = 0;
+ int no_progress_loops = 0;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3475,8 +3651,7 @@ retry:
*/
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
migration_mode,
- &contended_compaction,
- &deferred_compaction);
+ &compact_result);
if (page)
goto got_pg;
@@ -3489,35 +3664,19 @@ retry:
* to heavily disrupt the system, so we fail the allocation
* instead of entering direct reclaim.
*/
- if (deferred_compaction)
- goto nopage;
-
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
+ * Compaction is contended so back off rather than cause
+ * excessive stalls.
*/
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
+ if (compact_result == COMPACT_CONTENDED)
goto nopage;
}
- /*
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse.
- */
- if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_SYNC_LIGHT;
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3529,14 +3688,38 @@ retry:
if (gfp_mask & __GFP_NORETRY)
goto noretry;
- /* Keep reclaiming pages as long as there is reasonable progress */
- pages_reclaimed += did_some_progress;
- if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
- ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
- /* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
+ /*
+ * Do not retry costly high order allocations unless they are
+ * __GFP_REPEAT
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ goto noretry;
+
+ /*
+ * Costly allocations might have made progress but this doesn't mean
+ * their order will become available due to high fragmentation so
+ * always increment the no progress counter for them
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ no_progress_loops = 0;
+ else
+ no_progress_loops++;
+
+ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+ did_some_progress > 0, no_progress_loops))
+ goto retry;
+
+ /*
+ * It doesn't make any sense to retry compaction if the order-0
+ * reclaim is not able to make any progress because the current
+ * implementation of compaction depends on a sufficient amount
+ * of free memory (see __compaction_suitable)
+ */
+ if (did_some_progress > 0 &&
+ should_compact_retry(ac, order, alloc_flags,
+ compact_result, &migration_mode,
+ compaction_retries))
goto retry;
- }
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -3544,19 +3727,28 @@ retry:
goto got_pg;
/* Retry as long as the OOM killer is making progress */
- if (did_some_progress)
+ if (did_some_progress) {
+ no_progress_loops = 0;
goto retry;
+ }
noretry:
/*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
+ * High-order allocations do not necessarily loop after direct reclaim
+ * and reclaim/compaction depends on compaction being called after
+ * reclaim so call directly if necessary.
+ * It can become very expensive to allocate transparent hugepages at
+ * fault, so use asynchronous memory compaction for THP unless it is
+ * khugepaged trying to collapse. All other requests should tolerate
+ * at least light sync migration.
*/
+ if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
+ migration_mode = MIGRATE_ASYNC;
+ else
+ migration_mode = MIGRATE_SYNC_LIGHT;
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
+ &compact_result);
if (page)
goto got_pg;
nopage:
@@ -6671,49 +6863,6 @@ void setup_per_zone_wmarks(void)
}
/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total target max
- * memory ratio inactive anon
- * -------------------------------------
- * 10MB 1 5MB
- * 100MB 1 50MB
- * 1GB 3 250MB
- * 10GB 10 0.9GB
- * 100GB 31 3GB
- * 1TB 101 10GB
- * 10TB 320 32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
- unsigned int gb, ratio;
-
- /* Zone size in gigabytes */
- gb = zone->managed_pages >> (30 - PAGE_SHIFT);
- if (gb)
- ratio = int_sqrt(10 * gb);
- else
- ratio = 1;
-
- zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
- struct zone *zone;
-
- for_each_zone(zone)
- calculate_zone_inactive_ratio(zone);
-}
-
-/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
@@ -6758,7 +6907,6 @@ int __meminit init_per_zone_wmark_min(void)
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
- setup_per_zone_inactive_ratio();
return 0;
}
core_initcall(init_per_zone_wmark_min)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 479e7ea2bea6..1eae5fad2446 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -13,13 +13,7 @@ static int early_page_poison_param(char *buf)
{
if (!buf)
return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- want_page_poisoning = true;
- else if (strcmp(buf, "off") == 0)
- want_page_poisoning = false;
-
- return 0;
+ return strtobool(buf, &want_page_poisoning);
}
early_param("page_poison", early_page_poison_param);
diff --git a/mm/slab.c b/mm/slab.c
index c11bf5007952..cc8bbc1e6bc9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3547,9 +3547,17 @@ free_done:
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
- struct array_cache *ac = cpu_cache_get(cachep);
+ /* Put the object into the quarantine, don't touch it for now. */
+ if (kasan_slab_free(cachep, objp))
+ return;
+
+ ___cache_free(cachep, objp, caller);
+}
- kasan_slab_free(cachep, objp);
+void ___cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
+{
+ struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
@@ -4493,7 +4501,7 @@ size_t ksize(const void *objp)
/* We assume that ksize callers could use the whole allocated area,
* so we need to unpoison this area.
*/
- kasan_krealloc(objp, size, GFP_NOWAIT);
+ kasan_unpoison_shadow(objp, size);
return size;
}
diff --git a/mm/slab.h b/mm/slab.h
index 5969769fbee6..dedb1a920fb8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3239bfd758e6..a65dad7fdcd1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
get_online_cpus();
get_online_mems();
+ kasan_cache_destroy(s);
mutex_lock(&slab_mutex);
s->refcount--;
@@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
+ kasan_cache_shrink(cachep);
ret = __kmem_cache_shrink(cachep, false);
put_online_mems();
put_online_cpus();
diff --git a/mm/slub.c b/mm/slub.c
index cf1faa4d3992..825ff4505336 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3635,8 +3635,9 @@ size_t ksize(const void *object)
{
size_t size = __ksize(object);
/* We assume that ksize callers could use whole allocated area,
- so we need unpoison this area. */
- kasan_krealloc(object, size, GFP_NOWAIT);
+ * so we need to unpoison this area.
+ */
+ kasan_unpoison_shadow(object, size);
return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/swap.c b/mm/swap.c
index 03aacbcb013f..95916142fc46 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+#endif
/*
* This path almost never happens for VM activity - pages are normally
@@ -274,8 +277,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
}
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
-
static void activate_page_drain(int cpu)
{
struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae7d20b447ff..6e3291882739 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
-#define VM_LAZY_FREE 0x01
-#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
/* The vmap cache globals are protected by vmap_area_lock */
@@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
int sync, int force_flush)
{
static DEFINE_SPINLOCK(purge_lock);
- LIST_HEAD(valist);
+ struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
int nr = 0;
@@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (sync)
purge_fragmented_blocks_allcpus();
- rcu_read_lock();
- list_for_each_entry_rcu(va, &vmap_area_list, list) {
- if (va->flags & VM_LAZY_FREE) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- list_add_tail(&va->purge_list, &valist);
- va->flags |= VM_LAZY_FREEING;
- va->flags &= ~VM_LAZY_FREE;
- }
+ valist = llist_del_all(&vmap_purge_list);
+ llist_for_each_entry(va, valist, purge_list) {
+ if (va->va_start < *start)
+ *start = va->va_start;
+ if (va->va_end > *end)
+ *end = va->va_end;
+ nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
}
- rcu_read_unlock();
if (nr)
atomic_sub(nr, &vmap_lazy_nr);
@@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
if (nr) {
spin_lock(&vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &valist, purge_list)
+ llist_for_each_entry_safe(va, n_va, valist, purge_list)
__free_vmap_area(va);
spin_unlock(&vmap_area_lock);
}
@@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- va->flags |= VM_LAZY_FREE;
- atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
- if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+ int nr_lazy;
+
+ nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
+
+ /* After this point, we may free va at any time */
+ llist_add(&va->purge_list, &vmap_purge_list);
+
+ if (unlikely(nr_lazy > lazy_max_pages()))
try_purge_vmap_area_lazy();
}
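
The rework above replaces the VM_LAZY_FREE flag walk of vmap_area_list with a lock-free llist. A minimal sketch of that pattern (illustrative; the helper names are made up, and the real code additionally accounts vmap_lazy_nr and holds vmap_area_lock around __free_vmap_area()):

	static LLIST_HEAD(pending_purge);

	/* producer side: free_vmap_area_noflush(), callable from any context */
	static void defer_vmap_free(struct vmap_area *va)
	{
		llist_add(&va->purge_list, &pending_purge);
	}

	/* consumer side: __purge_vmap_area_lazy(), detaches the whole batch */
	static void purge_deferred(void)
	{
		struct llist_node *batch = llist_del_all(&pending_purge);
		struct vmap_area *va, *n_va;

		llist_for_each_entry_safe(va, n_va, batch, purge_list)
			__free_vmap_area(va);
	}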
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcfdfc1a0942..c4a2f4512fca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc)
}
#endif
-static unsigned long zone_reclaimable_pages(struct zone *zone)
+unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
@@ -1862,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan,
free_hot_cold_page_list(&l_hold, true);
}
-#ifdef CONFIG_SWAP
-static bool inactive_anon_is_low_global(struct zone *zone)
-{
- unsigned long active, inactive;
-
- active = zone_page_state(zone, NR_ACTIVE_ANON);
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-
- return inactive * zone->inactive_ratio < active;
-}
-
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @lruvec: LRU vector to check
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
*
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
- */
-static bool inactive_anon_is_low(struct lruvec *lruvec)
-{
- /*
- * If we don't have swap space, anonymous page deactivation
- * is pointless.
- */
- if (!total_swap_pages)
- return false;
-
- if (!mem_cgroup_disabled())
- return mem_cgroup_inactive_anon_is_low(lruvec);
-
- return inactive_anon_is_low_global(lruvec_zone(lruvec));
-}
-#else
-static inline bool inactive_anon_is_low(struct lruvec *lruvec)
-{
- return false;
-}
-#endif
-
-/**
- * inactive_file_is_low - check if file pages need to be deactivated
- * @lruvec: LRU vector to check
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
*
- * When the system is doing streaming IO, memory pressure here
- * ensures that active file pages get deactivated, until more
- * than half of the file pages are on the inactive list.
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
*
- * Once we get to that situation, protect the system's working
- * set from being evicted by disabling active file page aging.
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. An inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
*
- * This uses a different ratio than the anonymous pages, because
- * the page cache uses a use-once replacement algorithm.
+ * total target max
+ * memory ratio inactive
+ * -------------------------------------
+ * 10MB 1 5MB
+ * 100MB 1 50MB
+ * 1GB 3 250MB
+ * 10GB 10 0.9GB
+ * 100GB 31 3GB
+ * 1TB 101 10GB
+ * 10TB 320 32GB
*/
-static bool inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
{
+ unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
+ unsigned long gb;
- inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!file && !total_swap_pages)
+ return false;
- return active > inactive;
-}
+ inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
+ active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
-{
- if (is_file_lru(lru))
- return inactive_file_is_low(lruvec);
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
else
- return inactive_anon_is_low(lruvec);
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
}
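
A quick check of the table against the formula now open-coded above: for a 1TB LRU, gb == 1024 and inactive_ratio == int_sqrt(10 * 1024) == int_sqrt(10240) == 101 (since 101 * 101 == 10201 <= 10240 < 102 * 102). The list is considered low once inactive * 101 < active, i.e. roughly 1/102 of the pages (about 10GB) stay inactive, matching the "1TB 101 10GB" row.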
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, lru))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru)))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2059,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec) &&
+ if (!inactive_list_is_low(lruvec, true) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2301,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(lruvec))
+ if (inactive_list_is_low(lruvec, false))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2479,7 +2459,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
* Returns true if compaction should go ahead for a high-order request, or
* the high-order allocation would succeed without compaction.
*/
-static inline bool compaction_ready(struct zone *zone, int order)
+static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
{
unsigned long balance_gap, watermark;
bool watermark_ok;
@@ -2493,7 +2473,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
/*
* If compaction is deferred, reclaim up to a point where
@@ -2506,7 +2486,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
* If compaction is not ready to start and allocation is not likely
* to succeed without it, then keep reclaiming.
*/
- if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
+ if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
return false;
return watermark_ok;
@@ -2527,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order)
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
- *
- * Returns true if a zone was reclaimable.
*/
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
@@ -2538,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
- bool reclaimable = false;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2586,7 +2563,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
zonelist_zone_idx(z) <= requested_highidx &&
- compaction_ready(zone, sc->order)) {
+ compaction_ready(zone, sc->order, requested_highidx)) {
sc->compaction_ready = true;
continue;
}
@@ -2603,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
- if (nr_soft_reclaimed)
- reclaimable = true;
/* need some check for avoid more shrink_zone() */
}
- if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
- reclaimable = true;
-
- if (global_reclaim(sc) &&
- !reclaimable && zone_reclaimable(zone))
- reclaimable = true;
+ shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
}
/*
@@ -2621,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
-
- return reclaimable;
}
/*
@@ -2647,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
- bool zones_reclaimable;
retry:
delayacct_freepages_start();
@@ -2658,7 +2625,7 @@ retry:
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- zones_reclaimable = shrink_zones(zonelist, sc);
+ shrink_zones(zonelist, sc);
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2705,10 +2672,6 @@ retry:
goto retry;
}
- /* Any of the zones still reclaimable? Don't OOM. */
- if (zones_reclaimable)
- return 1;
-
return 0;
}
@@ -2962,7 +2925,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_anon_is_low(lruvec))
+ if (inactive_list_is_low(lruvec, false))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5b72a8ad2813..77e42ef388c2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1352,7 +1352,6 @@ static const struct file_operations proc_vmstat_file_operations = {
static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
-static cpumask_var_t cpu_stat_off;
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
@@ -1421,24 +1420,10 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
- * If we were marked on cpu_stat_off clear the flag
- * so that vmstat_shepherd doesn't schedule us again.
*/
- if (!cpumask_test_and_clear_cpu(smp_processor_id(),
- cpu_stat_off)) {
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
- }
- } else {
- /*
- * We did not update any counters so the app may be in
- * a mode where it does not cause counter updates.
- * We may be uselessly running vmstat_update.
- * Defer the checking for differentials to the
- * shepherd thread on a different processor.
- */
- cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
}
}
@@ -1470,16 +1455,17 @@ static bool need_update(int cpu)
return false;
}
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
void quiet_vmstat(void)
{
if (system_state != SYSTEM_RUNNING)
return;
- /*
- * If we are already in hands of the shepherd then there
- * is nothing for us to do here.
- */
- if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
return;
if (!need_update(smp_processor_id()))
@@ -1494,7 +1480,6 @@ void quiet_vmstat(void)
refresh_cpu_vm_stats(false);
}
-
/*
* Shepherd worker thread that checks the
* differentials of processors that have their worker
@@ -1511,20 +1496,11 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off) {
+ for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
- if (need_update(cpu)) {
- if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
- queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
- } else {
- /*
- * Cancel the work if quiet_vmstat has put this
- * cpu on cpu_stat_off because the work item might
- * be still scheduled
- */
- cancel_delayed_work(dw);
- }
+ if (!delayed_work_pending(dw) && need_update(cpu))
+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
}
put_online_cpus();
@@ -1540,10 +1516,6 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
- BUG();
- cpumask_copy(cpu_stat_off, cpu_online_mask);
-
vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
@@ -1578,16 +1550,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
node_set_state(cpu_to_node(cpu), N_CPU);
- cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- cpumask_clear_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
diff --git a/mm/z3fold.c b/mm/z3fold.c
new file mode 100644
index 000000000000..34917d55d311
--- /dev/null
+++ b/mm/z3fold.c
@@ -0,0 +1,792 @@
+/*
+ * z3fold.c
+ *
+ * Author: Vitaly Wool <vitaly.wool@konsulko.com>
+ * Copyright (C) 2016, Sony Mobile Communications Inc.
+ *
+ * This implementation is based on zbud written by Seth Jennings.
+ *
+ * z3fold is a special purpose allocator for storing compressed pages. It
+ * can store up to three compressed pages per page, which improves the
+ * compression ratio of zbud while retaining its main concepts (e.g. always
+ * storing an integral number of objects per page) and simplicity.
+ * It still has simple and deterministic reclaim properties that make it
+ * preferable to a higher density approach (with no requirement on an integral
+ * number of objects per page) when reclaim is used.
+ *
+ * As in zbud, pages are divided into "chunks". The size of the chunks is
+ * fixed at compile time and is determined by NCHUNKS_ORDER below.
+ *
+ * z3fold doesn't export any API and is meant to be used via zpool API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zpool.h>
+
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in an allocated page is occupied by the z3fold header, NCHUNKS works out
+ * to 63, the maximum number of free chunks in a z3fold page; there will
+ * also be 63 freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1)
+
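
Assuming the common 4K page (PAGE_SHIFT == 12), these constants work out to:

	CHUNK_SHIFT       = 12 - 6 = 6
	CHUNK_SIZE        = 64 bytes
	ZHDR_SIZE_ALIGNED = 64 bytes (one chunk reserved for the header)
	NCHUNKS           = (4096 - 64) >> 6 = 63
	BUDDY_MASK        = 0x3f

i.e. a z3fold page offers 63 chunks of 64 bytes to its up to three buddies.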
+struct z3fold_pool;
+struct z3fold_ops {
+ int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+/**
+ * struct z3fold_pool - stores metadata for each z3fold pool
+ * @lock: protects all pool fields and first|last_chunk fields of any
+ * z3fold page in the pool
+ * @unbuddied: array of lists tracking z3fold pages that contain at most
+ * two buddies; the list a z3fold page is added to depends on
+ * the size of its free region.
+ * @buddied: list tracking the z3fold pages that contain 3 buddies;
+ * these z3fold pages are full
+ * @lru: list tracking the z3fold pages in LRU order by most recently
+ * added buddy.
+ * @pages_nr: number of z3fold pages in the pool.
+ * @ops: pointer to a structure of user defined operations specified at
+ * pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular z3fold pool.
+ */
+struct z3fold_pool {
+ spinlock_t lock;
+ struct list_head unbuddied[NCHUNKS];
+ struct list_head buddied;
+ struct list_head lru;
+ u64 pages_nr;
+ const struct z3fold_ops *ops;
+ struct zpool *zpool;
+ const struct zpool_ops *zpool_ops;
+};
+
+enum buddy {
+ HEADLESS = 0,
+ FIRST,
+ MIDDLE,
+ LAST,
+ BUDDIES_MAX
+};
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the pool
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @start_middle: index of the first chunk occupied by the middle buddy
+ * @first_num: the starting number (for the first handle)
+ */
+struct z3fold_header {
+ struct list_head buddy;
+ unsigned short first_chunks;
+ unsigned short middle_chunks;
+ unsigned short last_chunks;
+ unsigned short start_middle;
+ unsigned short first_num:NCHUNKS_ORDER;
+};
+
+/*
+ * Internal z3fold page flags
+ */
+enum z3fold_page_flags {
+ UNDER_RECLAIM = 0,
+ PAGE_HEADLESS,
+ MIDDLE_CHUNK_MAPPED,
+};
+
+/*****************
+ * Helpers
+*****************/
+
+/* Converts an allocation size in bytes to size in z3fold chunks */
+static int size_to_chunks(size_t size)
+{
+ return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes the z3fold header of a newly allocated z3fold page */
+static struct z3fold_header *init_z3fold_page(struct page *page)
+{
+ struct z3fold_header *zhdr = page_address(page);
+
+ INIT_LIST_HEAD(&page->lru);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ clear_bit(PAGE_HEADLESS, &page->private);
+ clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+
+ zhdr->first_chunks = 0;
+ zhdr->middle_chunks = 0;
+ zhdr->last_chunks = 0;
+ zhdr->first_num = 0;
+ zhdr->start_middle = 0;
+ INIT_LIST_HEAD(&zhdr->buddy);
+ return zhdr;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_z3fold_page(struct z3fold_header *zhdr)
+{
+ __free_page(virt_to_page(zhdr));
+}
+
+/*
+ * Encodes the handle of a particular buddy within a z3fold page
+ * Pool lock should be held as this function accesses first_num
+ */
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+ unsigned long handle;
+
+ handle = (unsigned long)zhdr;
+ if (bud != HEADLESS)
+ handle += (bud + zhdr->first_num) & BUDDY_MASK;
+ return handle;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+{
+ return (struct z3fold_header *)(handle & PAGE_MASK);
+}
+
+/* Returns buddy number */
+static enum buddy handle_to_buddy(unsigned long handle)
+{
+ struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
+ return (handle - zhdr->first_num) & BUDDY_MASK;
+}
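
A worked example of the handle encoding (the address is made up): with a header at 0xffff880012345000 and first_num == 5, encoding the LAST buddy (bud == 3) gives

	handle = 0xffff880012345000 + ((3 + 5) & 0x3f) = 0xffff880012345008

handle_to_z3fold_header() masks with PAGE_MASK to recover the header, and handle_to_buddy() computes (0x08 - 5) & 0x3f == 3 == LAST. Folding first_num into the low bits is what keeps existing handles valid across z3fold_compact_page(), which increments first_num when it turns the middle buddy into the first one.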
+
+/*
+ * Returns the number of free chunks in a z3fold page.
+ * NB: can't be used with HEADLESS pages.
+ */
+static int num_free_chunks(struct z3fold_header *zhdr)
+{
+ int nfree;
+ /*
+ * If there is a middle object, pick up the bigger free space
+ * either before or after it. Otherwise just subtract the number
+ * of chunks occupied by the first and the last objects.
+ */
+ if (zhdr->middle_chunks != 0) {
+ int nfree_before = zhdr->first_chunks ?
+ 0 : zhdr->start_middle - 1;
+ int nfree_after = zhdr->last_chunks ?
+ 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks;
+ nfree = max(nfree_before, nfree_after);
+ } else
+ nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
+ return nfree;
+}
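
For example (made-up sizes): first_chunks == 10, a 15-chunk middle buddy at start_middle == 20 and no last buddy yield nfree_before == 0 and nfree_after == 63 - 20 - 15 == 28, so num_free_chunks() reports 28. The 9 free chunks squeezed between the first and the middle buddy are deliberately not counted: a new buddy could not be placed there without compaction.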
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @gfp: gfp flags when allocating the z3fold pool structure
+ * @ops: user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
+ const struct z3fold_ops *ops)
+{
+ struct z3fold_pool *pool;
+ int i;
+
+ pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+ if (!pool)
+ return NULL;
+ spin_lock_init(&pool->lock);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&pool->unbuddied[i]);
+ INIT_LIST_HEAD(&pool->buddied);
+ INIT_LIST_HEAD(&pool->lru);
+ pool->pages_nr = 0;
+ pool->ops = ops;
+ return pool;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool: the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+ kfree(pool);
+}
+
+/* Has to be called with lock held */
+static int z3fold_compact_page(struct z3fold_header *zhdr)
+{
+ struct page *page = virt_to_page(zhdr);
+ void *beg = zhdr;
+
+
+ if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
+ zhdr->middle_chunks != 0 &&
+ zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ memmove(beg + ZHDR_SIZE_ALIGNED,
+ beg + (zhdr->start_middle << CHUNK_SHIFT),
+ zhdr->middle_chunks << CHUNK_SHIFT);
+ zhdr->first_chunks = zhdr->middle_chunks;
+ zhdr->middle_chunks = 0;
+ zhdr->start_middle = 0;
+ zhdr->first_num++;
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * z3fold_alloc() - allocates a region of a given size
+ * @pool: z3fold pool from which to allocate
+ * @size: size in bytes of the desired allocation
+ * @gfp: gfp flags used if the pool needs to grow
+ * @handle: handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request. A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as z3fold pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new page.
+ */
+static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ int chunks = 0, i, freechunks;
+ struct z3fold_header *zhdr = NULL;
+ enum buddy bud;
+ struct page *page;
+
+ if (!size || (gfp & __GFP_HIGHMEM))
+ return -EINVAL;
+
+ if (size > PAGE_SIZE)
+ return -ENOSPC;
+
+ if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+ bud = HEADLESS;
+ else {
+ chunks = size_to_chunks(size);
+ spin_lock(&pool->lock);
+
+ /* First, try to find an unbuddied z3fold page. */
+ zhdr = NULL;
+ for_each_unbuddied_list(i, chunks) {
+ if (!list_empty(&pool->unbuddied[i])) {
+ zhdr = list_first_entry(&pool->unbuddied[i],
+ struct z3fold_header, buddy);
+ page = virt_to_page(zhdr);
+ if (zhdr->first_chunks == 0) {
+ if (zhdr->middle_chunks != 0 &&
+ chunks >= zhdr->start_middle)
+ bud = LAST;
+ else
+ bud = FIRST;
+ } else if (zhdr->last_chunks == 0)
+ bud = LAST;
+ else if (zhdr->middle_chunks == 0)
+ bud = MIDDLE;
+ else {
+ pr_err("No free chunks in unbuddied\n");
+ WARN_ON(1);
+ continue;
+ }
+ list_del(&zhdr->buddy);
+ goto found;
+ }
+ }
+ bud = FIRST;
+ spin_unlock(&pool->lock);
+ }
+
+ /* Couldn't find unbuddied z3fold page, create new one */
+ page = alloc_page(gfp);
+ if (!page)
+ return -ENOMEM;
+ spin_lock(&pool->lock);
+ pool->pages_nr++;
+ zhdr = init_z3fold_page(page);
+
+ if (bud == HEADLESS) {
+ set_bit(PAGE_HEADLESS, &page->private);
+ goto headless;
+ }
+
+found:
+ if (bud == FIRST)
+ zhdr->first_chunks = chunks;
+ else if (bud == LAST)
+ zhdr->last_chunks = chunks;
+ else {
+ zhdr->middle_chunks = chunks;
+ zhdr->start_middle = zhdr->first_chunks + 1;
+ }
+
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ /* Add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ } else {
+ /* Add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ }
+
+headless:
+ /* Add/move z3fold page to beginning of LRU */
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+
+ list_add(&page->lru, &pool->lru);
+
+ *handle = encode_handle(zhdr, bud);
+ spin_unlock(&pool->lock);
+
+ return 0;
+}
+
+/**
+ * z3fold_free() - frees the allocation associated with the given handle
+ * @pool: pool in which the allocation resided
+ * @handle: handle associated with the allocation returned by z3fold_alloc()
+ *
+ * In the case that the z3fold page in which the allocation resides is under
+ * reclaim, as indicated by the UNDER_RECLAIM flag being set, this function
+ * only sets the first|middle|last_chunks to 0. The page is actually freed
+ * once both buddies are evicted (see z3fold_reclaim_page() below).
+ */
+static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ int freechunks;
+ struct page *page;
+ enum buddy bud;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_z3fold_header(handle);
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ /* HEADLESS page stored */
+ bud = HEADLESS;
+ } else {
+ bud = (handle - zhdr->first_num) & BUDDY_MASK;
+
+ switch (bud) {
+ case FIRST:
+ zhdr->first_chunks = 0;
+ break;
+ case MIDDLE:
+ zhdr->middle_chunks = 0;
+ zhdr->start_middle = 0;
+ break;
+ case LAST:
+ zhdr->last_chunks = 0;
+ break;
+ default:
+ pr_err("%s: unknown bud %d\n", __func__, bud);
+ WARN_ON(1);
+ spin_unlock(&pool->lock);
+ return;
+ }
+ }
+
+ if (test_bit(UNDER_RECLAIM, &page->private)) {
+ /* z3fold page is under reclaim, reclaim will free */
+ spin_unlock(&pool->lock);
+ return;
+ }
+
+ if (bud != HEADLESS) {
+ /* Remove from existing buddy list */
+ list_del(&zhdr->buddy);
+ }
+
+ if (bud == HEADLESS ||
+ (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 &&
+ zhdr->last_chunks == 0)) {
+ /* z3fold page is empty, free */
+ list_del(&page->lru);
+ clear_bit(PAGE_HEADLESS, &page->private);
+ free_z3fold_page(zhdr);
+ pool->pages_nr--;
+ } else {
+ z3fold_compact_page(zhdr);
+ /* Add to the unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ }
+
+ spin_unlock(&pool->lock);
+}
+
+/**
+ * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool: pool from which a page will attempt to be evicted
+ * @retries: number of pages on the LRU list for which eviction will
+ * be attempted before failing
+ *
+ * z3fold reclaim is different from normal system reclaim in that it is done
+ * from the bottom, up. This is because only the bottom layer, z3fold, has
+ * information on how the allocations are organized within each z3fold page.
+ * This has the potential to create interesting locking situations between
+ * z3fold and the user, however.
+ *
+ * To avoid these, this is how z3fold_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
+ * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
+ * call the user-defined eviction handler with the pool and handle as
+ * arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
+ * appropriate list and try the next z3fold page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the UNDER_RECLAIM flag being set on the underlying page.
+ *
+ * If all buddies in the z3fold page are successfully evicted, then the
+ * z3fold page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict or an eviction handler is not registered, -EAGAIN if
+ * the retry limit was hit.
+ */
+static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
+{
+ int i, ret = 0, freechunks;
+ struct z3fold_header *zhdr;
+ struct page *page;
+ unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
+
+ spin_lock(&pool->lock);
+ if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+ retries == 0) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+ for (i = 0; i < retries; i++) {
+ page = list_last_entry(&pool->lru, struct page, lru);
+ list_del(&page->lru);
+
+ /* Protect z3fold page against free */
+ set_bit(UNDER_RECLAIM, &page->private);
+ zhdr = page_address(page);
+ if (!test_bit(PAGE_HEADLESS, &page->private)) {
+ list_del(&zhdr->buddy);
+ /*
+ * We need to encode the handles before unlocking, since
+ * we can race with free, which will set
+ * (first|last)_chunks to 0
+ */
+ first_handle = 0;
+ last_handle = 0;
+ middle_handle = 0;
+ if (zhdr->first_chunks)
+ first_handle = encode_handle(zhdr, FIRST);
+ if (zhdr->middle_chunks)
+ middle_handle = encode_handle(zhdr, MIDDLE);
+ if (zhdr->last_chunks)
+ last_handle = encode_handle(zhdr, LAST);
+ } else {
+ first_handle = encode_handle(zhdr, HEADLESS);
+ last_handle = middle_handle = 0;
+ }
+
+ spin_unlock(&pool->lock);
+
+ /* Issue the eviction callback(s) */
+ if (middle_handle) {
+ ret = pool->ops->evict(pool, middle_handle);
+ if (ret)
+ goto next;
+ }
+ if (first_handle) {
+ ret = pool->ops->evict(pool, first_handle);
+ if (ret)
+ goto next;
+ }
+ if (last_handle) {
+ ret = pool->ops->evict(pool, last_handle);
+ if (ret)
+ goto next;
+ }
+next:
+ spin_lock(&pool->lock);
+ clear_bit(UNDER_RECLAIM, &page->private);
+ if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) ||
+ (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 &&
+ zhdr->middle_chunks == 0)) {
+ /*
+ * All buddies are now free, free the z3fold page and
+ * return success.
+ */
+ clear_bit(PAGE_HEADLESS, &page->private);
+ free_z3fold_page(zhdr);
+ pool->pages_nr--;
+ spin_unlock(&pool->lock);
+ return 0;
+ } else if (zhdr->first_chunks != 0 &&
+ zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) {
+ /* Full, add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ } else if (!test_bit(PAGE_HEADLESS, &page->private)) {
+ z3fold_compact_page(zhdr);
+ /* add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ }
+
+ /* add to beginning of LRU */
+ list_add(&page->lru, &pool->lru);
+ }
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+
+/**
+ * z3fold_map() - maps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be mapped
+ *
+ * Extracts the buddy number from handle and constructs the pointer to the
+ * correct starting chunk within the page.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ void *addr;
+ enum buddy buddy;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_z3fold_header(handle);
+ addr = zhdr;
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ goto out;
+
+ buddy = handle_to_buddy(handle);
+ switch (buddy) {
+ case FIRST:
+ addr += ZHDR_SIZE_ALIGNED;
+ break;
+ case MIDDLE:
+ addr += zhdr->start_middle << CHUNK_SHIFT;
+ set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ break;
+ case LAST:
+ addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ break;
+ default:
+ pr_err("unknown buddy id %d\n", buddy);
+ WARN_ON(1);
+ addr = NULL;
+ break;
+ }
+out:
+ spin_unlock(&pool->lock);
+ return addr;
+}
+
+/**
+ * z3fold_unmap() - unmaps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be unmapped
+ */
+static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ enum buddy buddy;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_z3fold_header(handle);
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ spin_unlock(&pool->lock);
+ return;
+ }
+
+ buddy = handle_to_buddy(handle);
+ if (buddy == MIDDLE)
+ clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ spin_unlock(&pool->lock);
+}
+
+/**
+ * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * @pool: pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool. The pool lock need not be
+ * taken to access pages_nr.
+ */
+static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+{
+ return pool->pages_nr;
+}
+
+/*****************
+ * zpool
+ ****************/
+
+static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
+{
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
+}
+
+static const struct z3fold_ops z3fold_zpool_ops = {
+ .evict = z3fold_zpool_evict
+};
+
+static void *z3fold_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ struct z3fold_pool *pool;
+
+ pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
+}
+
+static void z3fold_zpool_destroy(void *pool)
+{
+ z3fold_destroy_pool(pool);
+}
+
+static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return z3fold_alloc(pool, size, gfp, handle);
+}
+static void z3fold_zpool_free(void *pool, unsigned long handle)
+{
+ z3fold_free(pool, handle);
+}
+
+static int z3fold_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = z3fold_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
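
The shrink wrapper above frees one page at a time, allowing up to 8 eviction retries per page, and reports how many pages it actually managed to reclaim. A hedged sketch of driving it from the zpool side follows; the pool is assumed to have been created with eviction-capable zpool_ops, and the requested count is illustrative.

/*
 * Hedged sketch of requesting reclaim through zpool_shrink(). Error
 * handling and the "want" value are illustrative only.
 */
#include <linux/zpool.h>
#include <linux/printk.h>

static void try_reclaim_some(struct zpool *zpool)
{
	unsigned int want = 16, got = 0;

	/* Each page may take up to 8 eviction attempts internally. */
	if (zpool_shrink(zpool, want, &got))
		pr_debug("z3fold reclaim stopped early, freed %u/%u pages\n",
			 got, want);
}
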
+
+static void *z3fold_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return z3fold_map(pool, handle);
+}
+static void z3fold_zpool_unmap(void *pool, unsigned long handle)
+{
+ z3fold_unmap(pool, handle);
+}
+
+static u64 z3fold_zpool_total_size(void *pool)
+{
+ return z3fold_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver z3fold_zpool_driver = {
+ .type = "z3fold",
+ .owner = THIS_MODULE,
+ .create = z3fold_zpool_create,
+ .destroy = z3fold_zpool_destroy,
+ .malloc = z3fold_zpool_malloc,
+ .free = z3fold_zpool_free,
+ .shrink = z3fold_zpool_shrink,
+ .map = z3fold_zpool_map,
+ .unmap = z3fold_zpool_unmap,
+ .total_size = z3fold_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-z3fold");
+
+static int __init init_z3fold(void)
+{
+ /* Make sure the z3fold header will fit in one chunk */
+ BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED);
+ zpool_register_driver(&z3fold_zpool_driver);
+
+ return 0;
+}
+
+static void __exit exit_z3fold(void)
+{
+ zpool_unregister_driver(&z3fold_zpool_driver);
+}
+
+module_init(init_z3fold);
+module_exit(exit_z3fold);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
+MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
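
Like zbud and zsmalloc, z3fold registers itself as a zpool driver and is consumed only through the zpool API. The following is a hedged end-to-end sketch of that usage from a sleepable kernel context; the pool name, sizes and smoke-test function are hypothetical and error handling is trimmed to the essentials.

/*
 * Hedged sketch: create a z3fold-backed zpool, store one buffer,
 * report its footprint and tear everything down.
 */
#include <linux/zpool.h>
#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/printk.h>

static int z3fold_smoke_test(void)
{
	struct zpool *zpool;
	unsigned long handle;
	int ret;

	zpool = zpool_create_pool("z3fold", "demo", GFP_KERNEL, NULL);
	if (!zpool)
		return -ENOMEM;

	ret = zpool_malloc(zpool, 1024, GFP_KERNEL, &handle);
	if (ret)
		goto out;

	pr_info("z3fold pool now uses %llu bytes\n",
		zpool_get_total_size(zpool));

	zpool_free(zpool, handle);
out:
	zpool_destroy_pool(zpool);
	return ret;
}
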
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index fe47fbba995a..72698db958e7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -247,7 +247,6 @@ struct zs_pool {
struct size_class **size_class;
struct kmem_cache *handle_cachep;
- gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
struct zs_pool_stats stats;
@@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
kmem_cache_destroy(pool->handle_cachep);
}
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- pool->flags & ~__GFP_HIGHMEM);
+ gfp & ~__GFP_HIGHMEM);
}
static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
- return zs_create_pool(name, gfp);
+ /*
+ * Ignore global gfp flags: zs_malloc() may be invoked from
+ * different contexts and its caller must provide a valid
+ * gfp mask.
+ */
+ return zs_create_pool(name);
}
static void zs_zpool_destroy(void *pool)
@@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool)
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- *handle = zs_malloc(pool, size);
+ *handle = zs_malloc(pool, size, gfp);
return *handle ? 0 : -1;
}
static void zs_zpool_free(void *pool, unsigned long handle)
@@ -413,26 +417,28 @@ static int is_last_page(struct page *page)
return PagePrivate2(page);
}
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+static void get_zspage_mapping(struct page *first_page,
+ unsigned int *class_idx,
enum fullness_group *fullness)
{
unsigned long m;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- m = (unsigned long)page->mapping;
+ m = (unsigned long)first_page->mapping;
*fullness = m & FULLNESS_MASK;
*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+static void set_zspage_mapping(struct page *first_page,
+ unsigned int class_idx,
enum fullness_group fullness)
{
unsigned long m;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
(fullness & FULLNESS_MASK);
- page->mapping = (struct address_space *)m;
+ first_page->mapping = (struct address_space *)m;
}
/*
@@ -567,17 +573,17 @@ static const struct file_operations zs_stat_size_ops = {
.release = single_release,
};
-static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
struct dentry *entry;
if (!zs_stat_root)
- return -ENODEV;
+ return;
entry = debugfs_create_dir(name, zs_stat_root);
if (!entry) {
pr_warn("debugfs dir <%s> creation failed\n", name);
- return -ENOMEM;
+ return;
}
pool->stat_dentry = entry;
@@ -586,10 +592,8 @@ static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
if (!entry) {
pr_warn("%s: debugfs file entry <%s> creation failed\n",
name, "classes");
- return -ENOMEM;
+ return;
}
-
- return 0;
}
static void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -607,9 +611,8 @@ static void __exit zs_stat_exit(void)
{
}
-static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
- return 0;
}
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -617,7 +620,6 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
}
#endif
-
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -625,14 +627,15 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
* the pool (not yet implemented). This function returns fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct page *page)
+static enum fullness_group get_fullness_group(struct page *first_page)
{
int inuse, max_objects;
enum fullness_group fg;
- BUG_ON(!is_first_page(page));
- inuse = page->inuse;
- max_objects = page->objects;
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+ inuse = first_page->inuse;
+ max_objects = first_page->objects;
if (inuse == 0)
fg = ZS_EMPTY;
@@ -652,12 +655,13 @@ static enum fullness_group get_fullness_group(struct page *page)
* have. This functions inserts the given zspage into the freelist
* identified by <class, fullness_group>.
*/
-static void insert_zspage(struct page *page, struct size_class *class,
- enum fullness_group fullness)
+static void insert_zspage(struct size_class *class,
+ enum fullness_group fullness,
+ struct page *first_page)
{
struct page **head;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
@@ -667,7 +671,7 @@ static void insert_zspage(struct page *page, struct size_class *class,
head = &class->fullness_list[fullness];
if (!*head) {
- *head = page;
+ *head = first_page;
return;
}
@@ -675,34 +679,35 @@ static void insert_zspage(struct page *page, struct size_class *class,
* We want to see more ZS_FULL pages and less almost
* empty/full. Put pages with higher ->inuse first.
*/
- list_add_tail(&page->lru, &(*head)->lru);
- if (page->inuse >= (*head)->inuse)
- *head = page;
+ list_add_tail(&first_page->lru, &(*head)->lru);
+ if (first_page->inuse >= (*head)->inuse)
+ *head = first_page;
}
/*
* This function removes the given zspage from the freelist identified
* by <class, fullness_group>.
*/
-static void remove_zspage(struct page *page, struct size_class *class,
- enum fullness_group fullness)
+static void remove_zspage(struct size_class *class,
+ enum fullness_group fullness,
+ struct page *first_page)
{
struct page **head;
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
head = &class->fullness_list[fullness];
- BUG_ON(!*head);
+ VM_BUG_ON_PAGE(!*head, first_page);
if (list_empty(&(*head)->lru))
*head = NULL;
- else if (*head == page)
+ else if (*head == first_page)
*head = (struct page *)list_entry((*head)->lru.next,
struct page, lru);
- list_del_init(&page->lru);
+ list_del_init(&first_page->lru);
zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
@@ -717,21 +722,19 @@ static void remove_zspage(struct page *page, struct size_class *class,
* fullness group.
*/
static enum fullness_group fix_fullness_group(struct size_class *class,
- struct page *page)
+ struct page *first_page)
{
int class_idx;
enum fullness_group currfg, newfg;
- BUG_ON(!is_first_page(page));
-
- get_zspage_mapping(page, &class_idx, &currfg);
- newfg = get_fullness_group(page);
+ get_zspage_mapping(first_page, &class_idx, &currfg);
+ newfg = get_fullness_group(first_page);
if (newfg == currfg)
goto out;
- remove_zspage(page, class, currfg);
- insert_zspage(page, class, newfg);
- set_zspage_mapping(page, class_idx, newfg);
+ remove_zspage(class, currfg, first_page);
+ insert_zspage(class, newfg, first_page);
+ set_zspage_mapping(first_page, class_idx, newfg);
out:
return newfg;
@@ -809,7 +812,7 @@ static void *location_to_obj(struct page *page, unsigned long obj_idx)
unsigned long obj;
if (!page) {
- BUG_ON(obj_idx);
+ VM_BUG_ON(obj_idx);
return NULL;
}
@@ -842,7 +845,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
void *obj)
{
if (class->huge) {
- VM_BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(page), page);
return page_private(page);
} else
return *(unsigned long *)obj;
@@ -892,8 +895,8 @@ static void free_zspage(struct page *first_page)
{
struct page *nextp, *tmp, *head_extra;
- BUG_ON(!is_first_page(first_page));
- BUG_ON(first_page->inuse);
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ VM_BUG_ON_PAGE(first_page->inuse, first_page);
head_extra = (struct page *)page_private(first_page);
@@ -914,12 +917,13 @@ static void free_zspage(struct page *first_page)
}
/* Initialize a newly allocated zspage */
-static void init_zspage(struct page *first_page, struct size_class *class)
+static void init_zspage(struct size_class *class, struct page *first_page)
{
unsigned long off = 0;
struct page *page = first_page;
- BUG_ON(!is_first_page(first_page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
while (page) {
struct page *next_page;
struct link_free *link;
@@ -1001,7 +1005,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
prev_page = page;
}
- init_zspage(first_page, class);
+ init_zspage(class, first_page);
first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
@@ -1234,11 +1238,11 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-static bool zspage_full(struct page *page)
+static bool zspage_full(struct page *first_page)
{
- BUG_ON(!is_first_page(page));
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- return page->inuse == page->objects;
+ return first_page->inuse == first_page->objects;
}
unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1274,14 +1278,12 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
struct page *pages[2];
void *ret;
- BUG_ON(!handle);
-
/*
* Because we use per-cpu mapping areas shared among the
* pools/users, we can't allow mapping in interrupt context
* because it can corrupt another users mappings.
*/
- BUG_ON(in_interrupt());
+ WARN_ON_ONCE(in_interrupt());
/* From now on, migration cannot move the object */
pin_tag(handle);
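
The hunk above relaxes BUG_ON(in_interrupt()) to WARN_ON_ONCE(), but the rule is unchanged: zs_map_object() uses per-cpu mapping areas, so it must be called from process context and the mapping must be kept short and unnested. A minimal, hedged sketch of that discipline; the reader function and its arguments are assumptions, not part of this patch.

/*
 * Hedged sketch of the zs_map_object()/zs_unmap_object() pattern:
 * process context only, short critical section, no nested mapping.
 * "pool", "handle", "buf" and "len" are assumed valid here.
 */
#include <linux/zsmalloc.h>
#include <linux/string.h>

static void read_object(struct zs_pool *pool, unsigned long handle,
			void *buf, size_t len)
{
	void *src = zs_map_object(pool, handle, ZS_MM_RO);

	memcpy(buf, src, len);
	zs_unmap_object(pool, handle);
}
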
@@ -1325,8 +1327,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
struct size_class *class;
struct mapping_area *area;
- BUG_ON(!handle);
-
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
@@ -1350,8 +1350,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
-static unsigned long obj_malloc(struct page *first_page,
- struct size_class *class, unsigned long handle)
+static unsigned long obj_malloc(struct size_class *class,
+ struct page *first_page, unsigned long handle)
{
unsigned long obj;
struct link_free *link;
@@ -1391,7 +1391,7 @@ static unsigned long obj_malloc(struct page *first_page,
* otherwise 0.
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
@@ -1400,7 +1400,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- handle = alloc_handle(pool);
+ handle = alloc_handle(pool, gfp);
if (!handle)
return 0;
@@ -1413,7 +1413,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
- first_page = alloc_zspage(class, pool->flags);
+ first_page = alloc_zspage(class, gfp);
if (unlikely(!first_page)) {
free_handle(pool, handle);
return 0;
@@ -1428,7 +1428,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
class->size, class->pages_per_zspage));
}
- obj = obj_malloc(first_page, class, handle);
+ obj = obj_malloc(class, first_page, handle);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, first_page);
record_obj(handle, obj);
@@ -1438,16 +1438,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(struct zs_pool *pool, struct size_class *class,
- unsigned long obj)
+static void obj_free(struct size_class *class, unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
- BUG_ON(!obj);
-
obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
@@ -1487,7 +1484,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
class = pool->size_class[class_idx];
spin_lock(&class->lock);
- obj_free(pool, class, obj);
+ obj_free(class, obj);
fullness = fix_fullness_group(class, first_page);
if (fullness == ZS_EMPTY) {
zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
@@ -1503,8 +1500,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long dst, unsigned long src,
- struct size_class *class)
+static void zs_object_copy(struct size_class *class, unsigned long dst,
+ unsigned long src)
{
struct page *s_page, *d_page;
unsigned long s_objidx, d_objidx;
@@ -1547,7 +1544,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
s_page = get_next_page(s_page);
- BUG_ON(!s_page);
s_addr = kmap_atomic(s_page);
d_addr = kmap_atomic(d_page);
s_size = class->size - written;
@@ -1557,7 +1553,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
if (d_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
d_page = get_next_page(d_page);
- BUG_ON(!d_page);
d_addr = kmap_atomic(d_page);
d_size = class->size - written;
d_off = 0;
@@ -1572,8 +1567,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
* Find alloced object in zspage from index object and
* return handle.
*/
-static unsigned long find_alloced_obj(struct page *page, int index,
- struct size_class *class)
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int index)
{
unsigned long head;
int offset = 0;
@@ -1623,7 +1618,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
int ret = 0;
while (1) {
- handle = find_alloced_obj(s_page, index, class);
+ handle = find_alloced_obj(class, s_page, index);
if (!handle) {
s_page = get_next_page(s_page);
if (!s_page)
@@ -1640,8 +1635,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(free_obj, used_obj, class);
+ free_obj = obj_malloc(class, d_page, handle);
+ zs_object_copy(class, free_obj, used_obj);
index++;
/*
* record_obj updates handle's value to free_obj and it will
@@ -1652,7 +1647,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
- obj_free(pool, class, used_obj);
+ obj_free(class, used_obj);
}
/* Remember last position in this iteration */
@@ -1670,7 +1665,7 @@ static struct page *isolate_target_page(struct size_class *class)
for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
page = class->fullness_list[i];
if (page) {
- remove_zspage(page, class, i);
+ remove_zspage(class, i, page);
break;
}
}
@@ -1692,10 +1687,8 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
{
enum fullness_group fullness;
- BUG_ON(!is_first_page(first_page));
-
fullness = get_fullness_group(first_page);
- insert_zspage(first_page, class, fullness);
+ insert_zspage(class, fullness, first_page);
set_zspage_mapping(first_page, class->index, fullness);
if (fullness == ZS_EMPTY) {
@@ -1720,7 +1713,7 @@ static struct page *isolate_source_page(struct size_class *class)
if (!page)
continue;
- remove_zspage(page, class, i);
+ remove_zspage(class, i, page);
break;
}
@@ -1757,8 +1750,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
spin_lock(&class->lock);
while ((src_page = isolate_source_page(class))) {
- BUG_ON(!is_first_page(src_page));
-
if (!zs_can_compact(class))
break;
@@ -1887,7 +1878,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name)
{
int i;
struct zs_pool *pool;
@@ -1957,10 +1948,8 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
prev_class = class;
}
- pool->flags = flags;
-
- if (zs_pool_stat_create(name, pool))
- goto err;
+ /* debug only, don't abort if it fails */
+ zs_pool_stat_create(pool, name);
/*
* Not critical, we still can use the pool
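
The net effect of the zsmalloc changes is an API change: the pool no longer stores allocation flags, so zs_create_pool() loses its gfp argument and every zs_malloc() call supplies its own mask. A hedged sketch of the new calling convention; the caller names, size and GFP_NOWAIT choice are illustrative.

/*
 * Hedged sketch of the post-patch zsmalloc convention: no gfp at pool
 * creation, a per-call mask at allocation time.
 */
#include <linux/zsmalloc.h>
#include <linux/gfp.h>

static struct zs_pool *make_pool(void)
{
	return zs_create_pool("demo");	/* was zs_create_pool("demo", gfp) */
}

static unsigned long store_compressed(struct zs_pool *pool, size_t clen)
{
	/* e.g. an atomic path can now ask for GFP_NOWAIT explicitly */
	return zs_malloc(pool, clen, GFP_NOWAIT | __GFP_NOWARN);
}
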
diff --git a/mm/zswap.c b/mm/zswap.c
index de0f119b1780..275b22cc8df4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -117,7 +117,7 @@ struct zswap_pool {
struct crypto_comp * __percpu *tfm;
struct kref kref;
struct list_head list;
- struct rcu_head rcu_head;
+ struct work_struct work;
struct notifier_block notifier;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -658,9 +658,11 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
return kref_get_unless_zero(&pool->kref);
}
-static void __zswap_pool_release(struct rcu_head *head)
+static void __zswap_pool_release(struct work_struct *work)
{
- struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+ struct zswap_pool *pool = container_of(work, typeof(*pool), work);
+
+ synchronize_rcu();
/* nobody should have been able to get a kref... */
WARN_ON(kref_get_unless_zero(&pool->kref));
@@ -680,7 +682,9 @@ static void __zswap_pool_empty(struct kref *kref)
WARN_ON(pool == zswap_pool_current());
list_del_rcu(&pool->list);
- call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+ INIT_WORK(&pool->work, __zswap_pool_release);
+ schedule_work(&pool->work);
spin_unlock(&zswap_pools_lock);
}