summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/tty/sysrq.c5
-rw-r--r--include/linux/oom.h14
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/power/process.c50
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/oom_kill.c132
-rw-r--r--mm/page_alloc.c17
7 files changed, 132 insertions, 91 deletions
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 0071469ecbf1..259a4d5a4e8f 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = {
static void moom_callback(struct work_struct *ignored)
{
- out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
- 0, NULL, true);
+ if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
+ GFP_KERNEL, 0, NULL, true))
+ pr_info("OOM request ignored because killer is disabled\n");
}
static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index b42b80f88c3a..d5771bed59c9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
unsigned long totalpages, const nodemask_t *nodemask,
bool force_kill);
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *mask, bool force_kill);
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);
extern bool oom_killer_disabled;
-
-static inline void oom_killer_disable(void)
-{
- oom_killer_disabled = true;
-}
-
-static inline void oom_killer_enable(void)
-{
- oom_killer_disabled = false;
-}
+extern bool oom_killer_disable(void);
+extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
diff --git a/kernel/exit.c b/kernel/exit.c
index 02b3d1ab2ec0..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
- unmark_oom_victim();
+ if (test_thread_flag(TIF_MEMDIE))
+ unmark_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3ac45f192e9f..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only)
return todo ? -EBUSY : 0;
}
-static bool __check_frozen_processes(void)
-{
- struct task_struct *g, *p;
-
- for_each_process_thread(g, p)
- if (p != current && !freezer_should_skip(p) && !frozen(p))
- return false;
-
- return true;
-}
-
-/*
- * Returns true if all freezable tasks (except for current) are frozen already
- */
-static bool check_frozen_processes(void)
-{
- bool ret;
-
- read_lock(&tasklist_lock);
- ret = __check_frozen_processes();
- read_unlock(&tasklist_lock);
- return ret;
-}
-
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
* The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
int freeze_processes(void)
{
int error;
- int oom_kills_saved;
error = __usermodehelper_disable(UMH_FREEZING);
if (error)
@@ -157,29 +132,22 @@ int freeze_processes(void)
pm_wakeup_clear();
pr_info("Freezing user space processes ... ");
pm_freezing = true;
- oom_kills_saved = oom_kills_count();
error = try_to_freeze_tasks(true);
if (!error) {
__usermodehelper_set_disable_depth(UMH_DISABLED);
- oom_killer_disable();
-
- /*
- * There might have been an OOM kill while we were
- * freezing tasks and the killed task might be still
- * on the way out so we have to double check for race.
- */
- if (oom_kills_count() != oom_kills_saved &&
- !check_frozen_processes()) {
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- pr_cont("OOM in progress.");
- error = -EBUSY;
- } else {
- pr_cont("done.");
- }
+ pr_cont("done.");
}
pr_cont("\n");
BUG_ON(in_atomic());
+ /*
+ * Now that the whole userspace is frozen we need to disbale
+ * the OOM killer to disallow any further interference with
+ * killable tasks.
+ */
+ if (!error && !oom_killer_disable())
+ error = -EBUSY;
+
if (error)
thaw_processes();
return error;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe4d258ef32b..fbf64e6f64e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (!memcg)
return false;
- if (!handle)
+ if (!handle || oom_killer_disabled)
goto cleanup;
owait.memcg = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3cbd76b8c13b..b8df76ee2be3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
/*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
*/
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
-int oom_kills_count(void)
-{
- return atomic_read(&oom_kills);
-}
-
-void note_oom_kill(void)
-{
- atomic_inc(&oom_kills);
-}
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
/**
* mark_tsk_oom_victim - marks the given taks as OOM victim.
* @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after
+ * oom has been disabled already.
*/
void mark_tsk_oom_victim(struct task_struct *tsk)
{
- set_tsk_thread_flag(tsk, TIF_MEMDIE);
-
+ WARN_ON(oom_killer_disabled);
+ /* OOM killer might race with memcg OOM */
+ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
/*
* Make sure that the task is woken up from uninterruptible sleep
* if it is frozen because OOM killer wouldn't be able to free
@@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
* that TIF_MEMDIE tasks should be ignored.
*/
__thaw_task(tsk);
+ atomic_inc(&oom_victims);
}
/**
* unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
*/
void unmark_oom_victim(void)
{
- clear_thread_flag(TIF_MEMDIE);
+ if (!test_and_clear_thread_flag(TIF_MEMDIE))
+ return;
+
+ down_read(&oom_sem);
+ /*
+ * There is no need to signal the lasst oom_victim if there
+ * is nobody who cares.
+ */
+ if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+ wake_up_all(&oom_victims_wait);
+ up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be consulted with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+ /*
+ * Make sure to not race with an ongoing OOM killer
+ * and that the current is not the victim.
+ */
+ down_write(&oom_sem);
+ if (test_thread_flag(TIF_MEMDIE)) {
+ up_write(&oom_sem);
+ return false;
+ }
+
+ oom_killer_disabled = true;
+ up_write(&oom_sem);
+
+ wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+ return true;
+}
+
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+ down_write(&oom_sem);
+ oom_killer_disabled = false;
+ up_write(&oom_sem);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
}
/**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
@@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
@@ -718,6 +771,32 @@ out:
schedule_timeout_killable(1);
}
+/**
+ * out_of_memory - tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
+ * when it returns false. Otherwise returns true.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask, bool force_kill)
+{
+ bool ret = false;
+
+ down_read(&oom_sem);
+ if (!oom_killer_disabled) {
+ __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+ ret = true;
+ }
+ up_read(&oom_sem);
+
+ return ret;
+}
+
/*
* The pagefault handler calls here because it is out of memory, so kill a
* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -727,12 +806,25 @@ void pagefault_out_of_memory(void)
{
struct zonelist *zonelist;
+ down_read(&oom_sem);
if (mem_cgroup_oom_synchronize(true))
- return;
+ goto unlock;
zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
- out_of_memory(NULL, 0, 0, NULL, false);
+ if (!oom_killer_disabled)
+ __out_of_memory(NULL, 0, 0, NULL, false);
+ else
+ /*
+ * There shouldn't be any user tasks runable while the
+ * OOM killer is disabled so the current task has to
+ * be a racing OOM victim for which oom_killer_disable()
+ * is waiting for.
+ */
+ WARN_ON(test_thread_flag(TIF_MEMDIE));
+
oom_zonelist_unlock(zonelist, GFP_KERNEL);
}
+unlock:
+ up_read(&oom_sem);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 641d5a9a8617..134e25525044 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
PB_migrate, PB_migrate_end);
}
-bool oom_killer_disabled __read_mostly;
-
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
@@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 0;
- if (oom_killer_disabled)
- return NULL;
-
/*
* Acquire the per-zone oom lock for each zone. If that
* fails, somebody else is making progress for us.
@@ -2331,14 +2326,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/*
- * PM-freezer should be notified that there might be an OOM killer on
- * its way to kill and wake somebody up. This is too early and we might
- * end up not killing anything but false positives are acceptable.
- * See freeze_processes.
- */
- note_oom_kill();
-
- /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
@@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false);
- *did_some_progress = 1;
+ if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+ *did_some_progress = 1;
out:
oom_zonelist_unlock(ac->zonelist, gfp_mask);
return page;