summaryrefslogtreecommitdiff
path: root/mm/swapfile.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c177
1 files changed, 86 insertions, 91 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 996afa8131c8..e898c879a434 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/completion.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
@@ -99,11 +100,10 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0);
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
- if (type >= READ_ONCE(nr_swapfiles))
+ if (type >= MAX_SWAPFILES)
return NULL;
- smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
- return READ_ONCE(swap_info[type]);
+ return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
static inline unsigned char swap_count(unsigned char ent)
@@ -452,10 +452,10 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
unsigned int idx)
{
/*
- * If scan_swap_map() can't find a free cluster, it will check
+ * If scan_swap_map_slots() can't find a free cluster, it will check
* si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map(), mark the swap entries bad (occupied). It
- * will be cleared after discard
+ * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
+ * It will be cleared after discard
*/
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);
@@ -511,6 +511,14 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
}
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+ struct swap_info_struct *si;
+
+ si = container_of(ref, struct swap_info_struct, users);
+ complete(&si->comp);
+}
+
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
struct swap_cluster_info *ci = si->cluster_info;
@@ -580,7 +588,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
}
/*
- * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
* cluster list. Avoiding such abuse to avoid list corruption.
*/
static bool
@@ -1028,21 +1036,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-static unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
-{
- swp_entry_t entry;
- int n_ret;
-
- n_ret = scan_swap_map_slots(si, usage, 1, &entry);
-
- if (n_ret)
- return swp_offset(entry);
- else
- return 0;
-
-}
-
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
unsigned long size = swap_entry_size(entry_size);
@@ -1105,14 +1098,14 @@ start_over:
nextsi:
/*
* if we got here, it's likely that si was almost full before,
- * and since scan_swap_map() can drop the si->lock, multiple
- * callers probably all tried to get a page from the same si
- * and it filled up before we could get one; or, the si filled
- * up between us dropping swap_avail_lock and taking si->lock.
- * Since we dropped the swap_avail_lock, the swap_avail_head
- * list may have been modified; so if next is still in the
- * swap_avail_head list then try it, otherwise start over
- * if we have not gotten any slots.
+ * and since scan_swap_map_slots() can drop the si->lock,
+ * multiple callers probably all tried to get a page from the
+ * same si and it filled up before we could get one; or, the si
+ * filled up between us dropping swap_avail_lock and taking
+ * si->lock. Since we dropped the swap_avail_lock, the
+ * swap_avail_head list may have been modified; so if next is
+ * still in the swap_avail_head list then try it, otherwise
+ * start over if we have not gotten any slots.
*/
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
@@ -1128,30 +1121,6 @@ noswap:
return n_ret;
}
-/* The only caller of this function is now suspend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
- pgoff_t offset;
-
- if (!si)
- goto fail;
-
- spin_lock(&si->lock);
- if (si->flags & SWP_WRITEOK) {
- /* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, 1);
- if (offset) {
- atomic_long_dec(&nr_swap_pages);
- spin_unlock(&si->lock);
- return swp_entry(type, offset);
- }
- }
- spin_unlock(&si->lock);
-fail:
- return (swp_entry_t) {0};
-}
-
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -1270,18 +1239,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
* via preventing the swap device from being swapoff, until
* put_swap_device() is called. Otherwise return NULL.
*
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
* Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc. The caller must
- * be prepared for that. For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
*
* CPU1 CPU2
* do_swap_page()
@@ -1309,21 +1272,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
- rcu_read_lock();
- if (data_race(!(si->flags & SWP_VALID)))
- goto unlock_out;
+ if (!percpu_ref_tryget_live(&si->users))
+ goto out;
+ /*
+ * Guarantee the si->users are checked before accessing other
+ * fields of swap_info_struct.
+ *
+ * Paired with the spin_unlock() after setup_swap_info() in
+ * enable_swap_info().
+ */
+ smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
- goto unlock_out;
+ goto put_out;
return si;
bad_nofile:
pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
return NULL;
-unlock_out:
- rcu_read_unlock();
+put_out:
+ percpu_ref_put(&si->users);
return NULL;
}
@@ -1803,6 +1772,24 @@ int free_swap_and_cache(swp_entry_t entry)
}
#ifdef CONFIG_HIBERNATION
+
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
+ swp_entry_t entry = {0};
+
+ if (!si)
+ goto fail;
+
+ /* This is called for allocating swap entry, not cache */
+ spin_lock(&si->lock);
+ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
+ atomic_long_dec(&nr_swap_pages);
+ spin_unlock(&si->lock);
+fail:
+ return entry;
+}
+
/*
* Find the swap type that corresponds to given device (if any).
*
@@ -2466,7 +2453,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
static void _enable_swap_info(struct swap_info_struct *p)
{
- p->flags |= SWP_WRITEOK | SWP_VALID;
+ p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2497,10 +2484,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/*
- * Guarantee swap_map, cluster_info, etc. fields are valid
- * between get/put_swap_device() if SWP_VALID bit is set
+ * Finished initializing swap device, now it's safe to reference it.
*/
- synchronize_rcu();
+ percpu_ref_resurrect(&p->users);
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p);
@@ -2616,16 +2602,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
- spin_lock(&swap_lock);
- spin_lock(&p->lock);
- p->flags &= ~SWP_VALID; /* mark swap device as invalid */
- spin_unlock(&p->lock);
- spin_unlock(&swap_lock);
/*
- * wait for swap operations protected by get/put_swap_device()
- * to complete
+ * Wait for swap operations protected by get/put_swap_device()
+ * to complete.
+ *
+ * We need synchronize_rcu() here to protect the accessing to
+ * the swap cache data structure.
*/
+ percpu_ref_kill(&p->users);
synchronize_rcu();
+ wait_for_completion(&p->comp);
flush_work(&p->discard_work);
@@ -2641,7 +2627,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();
- /* wait for anyone still in scan_swap_map */
+ /* wait for anyone still in scan_swap_map_slots */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
spin_unlock(&p->lock);
@@ -2857,6 +2843,12 @@ static struct swap_info_struct *alloc_swap_info(void)
if (!p)
return ERR_PTR(-ENOMEM);
+ if (percpu_ref_init(&p->users, swap_users_ref_free,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+ kvfree(p);
+ return ERR_PTR(-ENOMEM);
+ }
+
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
if (!(swap_info[type]->flags & SWP_USED))
@@ -2864,19 +2856,18 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
+ percpu_ref_exit(&p->users);
kvfree(p);
return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
p->type = type;
- WRITE_ONCE(swap_info[type], p);
/*
- * Write swap_info[type] before nr_swapfiles, in case a
- * racing procfs swap_start() or swap_next() is reading them.
- * (We never shrink nr_swapfiles, we never free this entry.)
+ * Publish the swap_info_struct after initializing it.
+ * Note that kvzalloc() above zeroes all its fields.
*/
- smp_wmb();
- WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
+ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
+ nr_swapfiles++;
} else {
defer = p;
p = swap_info[type];
@@ -2891,9 +2882,13 @@ static struct swap_info_struct *alloc_swap_info(void)
plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
- kvfree(defer);
+ if (defer) {
+ percpu_ref_exit(&defer->users);
+ kvfree(defer);
+ }
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
+ init_completion(&p->comp);
return p;
}