summaryrefslogtreecommitdiff
path: root/drivers/md/dm-writecache.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm-writecache.c')
-rw-r--r--drivers/md/dm-writecache.c913
1 files changed, 579 insertions, 334 deletions
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 74f3c506f084..d8de4a3076a1 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018 Red Hat. All rights reserved.
*
@@ -13,14 +13,15 @@
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
-#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
+#include <linux/delay.h>
+#include "dm-io-tracker.h"
#define DM_MSG_PREFIX "writecache"
#define HIGH_WATERMARK 50
#define LOW_WATERMARK 45
-#define MAX_WRITEBACK_JOBS 0
+#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY 16
#define WRITEBACK_LATENCY 64
#define AUTOCOMMIT_BLOCKS_SSD 65536
@@ -28,6 +29,7 @@
#define AUTOCOMMIT_MSEC 1000
#define MAX_AGE_DIV 16
#define MAX_AGE_UNSPECIFIED -1UL
+#define PAUSE_WRITEBACK (HZ * 3)
#define BITMAP_GRANULARITY 65536
#if BITMAP_GRANULARITY < PAGE_SIZE
@@ -35,7 +37,7 @@
#define BITMAP_GRANULARITY PAGE_SIZE
#endif
-#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif
@@ -49,7 +51,7 @@ do { \
#define pmem_assign(dest, src) ((dest) = (src))
#endif
-#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
+#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif
@@ -73,23 +75,20 @@ struct wc_memory_superblock {
};
__le64 padding[8];
};
- struct wc_memory_entry entries[0];
+ struct wc_memory_entry entries[];
};
struct wc_entry {
struct rb_node rb_node;
struct list_head lru;
unsigned short wc_list_contiguous;
- bool write_in_progress
#if BITS_PER_LONG == 64
- :1
-#endif
- ;
- unsigned long index
-#if BITS_PER_LONG == 64
- :47
+ bool write_in_progress : 1;
+ unsigned long index : 47;
+#else
+ bool write_in_progress;
+ unsigned long index;
#endif
- ;
unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
uint64_t original_sector;
@@ -123,10 +122,11 @@ struct dm_writecache {
size_t freelist_high_watermark;
size_t freelist_low_watermark;
unsigned long max_age;
+ unsigned long pause;
- unsigned uncommitted_blocks;
- unsigned autocommit_blocks;
- unsigned max_writeback_jobs;
+ unsigned int uncommitted_blocks;
+ unsigned int autocommit_blocks;
+ unsigned int max_writeback_jobs;
int error;
@@ -148,9 +148,10 @@ struct dm_writecache {
size_t metadata_sectors;
size_t n_blocks;
uint64_t seq_count;
+ sector_t data_device_sectors;
void *block_start;
struct wc_entry *entries;
- unsigned block_size;
+ unsigned int block_size;
unsigned char block_size_bits;
bool pmem_mode:1;
@@ -159,20 +160,33 @@ struct dm_writecache {
bool overwrote_committed:1;
bool memory_vmapped:1;
+ bool start_sector_set:1;
bool high_wm_percent_set:1;
bool low_wm_percent_set:1;
bool max_writeback_jobs_set:1;
bool autocommit_blocks_set:1;
bool autocommit_time_set:1;
+ bool max_age_set:1;
bool writeback_fua_set:1;
bool flush_on_suspend:1;
bool cleaner:1;
+ bool cleaner_set:1;
+ bool metadata_only:1;
+ bool pause_set:1;
+
+ unsigned int high_wm_percent_value;
+ unsigned int low_wm_percent_value;
+ unsigned int autocommit_time_value;
+ unsigned int max_age_value;
+ unsigned int pause_value;
- unsigned writeback_all;
+ unsigned int writeback_all;
struct workqueue_struct *writeback_wq;
struct work_struct writeback_work;
struct work_struct flush_work;
+ struct dm_io_tracker iot;
+
struct dm_io_client *dm_io;
raw_spinlock_t endio_list_lock;
@@ -184,10 +198,23 @@ struct dm_writecache {
struct dm_kcopyd_client *dm_kcopyd;
unsigned long *dirty_bitmap;
- unsigned dirty_bitmap_size;
+ unsigned int dirty_bitmap_size;
struct bio_set bio_set;
mempool_t copy_pool;
+
+ struct {
+ unsigned long long reads;
+ unsigned long long read_hits;
+ unsigned long long writes;
+ unsigned long long write_hits_uncommitted;
+ unsigned long long write_hits_committed;
+ unsigned long long writes_around;
+ unsigned long long writes_allocate;
+ unsigned long long writes_blocked_on_freelist;
+ unsigned long long flushes;
+ unsigned long long discards;
+ } stats;
};
#define WB_LIST_INLINE 16
@@ -196,7 +223,7 @@ struct writeback_struct {
struct list_head endio_entry;
struct dm_writecache *wc;
struct wc_entry **wc_list;
- unsigned wc_list_n;
+ unsigned int wc_list_n;
struct wc_entry *wc_list_inline[WB_LIST_INLINE];
struct bio bio;
};
@@ -205,7 +232,7 @@ struct copy_struct {
struct list_head endio_entry;
struct dm_writecache *wc;
struct wc_entry *e;
- unsigned n_entries;
+ unsigned int n_entries;
int error;
};
@@ -228,9 +255,10 @@ static int persistent_memory_claim(struct dm_writecache *wc)
int r;
loff_t s;
long p, da;
- pfn_t pfn;
+ unsigned long pfn;
int id;
struct page **pages;
+ sector_t offset;
wc->memory_vmapped = false;
@@ -245,23 +273,32 @@ static int persistent_memory_claim(struct dm_writecache *wc)
goto err1;
}
+ offset = get_start_sect(wc->ssd_dev->bdev);
+ if (offset & (PAGE_SIZE / 512 - 1)) {
+ r = -EINVAL;
+ goto err1;
+ }
+ offset >>= PAGE_SHIFT - 9;
+
id = dax_read_lock();
- da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
+ da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
+ &wc->memory_map, &pfn);
if (da < 0) {
wc->memory_map = NULL;
r = da;
goto err2;
}
- if (!pfn_t_has_page(pfn)) {
+ if (!pfn_valid(pfn)) {
wc->memory_map = NULL;
r = -EOPNOTSUPP;
goto err2;
}
if (da != p) {
long i;
+
wc->memory_map = NULL;
- pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
+ pages = vmalloc_array(p, sizeof(struct page *));
if (!pages) {
r = -ENOMEM;
goto err2;
@@ -269,19 +306,22 @@ static int persistent_memory_claim(struct dm_writecache *wc)
i = 0;
do {
long daa;
- daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
- NULL, &pfn);
+
+ daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
+ p - i, DAX_ACCESS, NULL, &pfn);
if (daa <= 0) {
r = daa ? daa : -EINVAL;
goto err3;
}
- if (!pfn_t_has_page(pfn)) {
+ if (!pfn_valid(pfn)) {
r = -EOPNOTSUPP;
goto err3;
}
while (daa-- && i < p) {
- pages[i++] = pfn_t_to_page(pfn);
- pfn.val++;
+ pages[i++] = pfn_to_page(pfn);
+ pfn++;
+ if (!(i & 15))
+ cond_resched();
}
} while (i < p);
wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
@@ -289,7 +329,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
r = -ENOMEM;
goto err3;
}
- kvfree(pages);
+ vfree(pages);
wc->memory_vmapped = true;
}
@@ -300,7 +340,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
return 0;
err3:
- kvfree(pages);
+ vfree(pages);
err2:
dax_read_unlock(id);
err1:
@@ -309,7 +349,7 @@ err1:
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
- BUG();
+ return -EOPNOTSUPP;
}
#endif
@@ -327,7 +367,7 @@ static struct page *persistent_memory_page(void *addr)
return virt_to_page(addr);
}
-static unsigned persistent_memory_page_offset(void *addr)
+static unsigned int persistent_memory_page_offset(void *addr)
{
return (unsigned long)addr & (PAGE_SIZE - 1);
}
@@ -460,11 +500,12 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
COMPLETION_INITIALIZER_ONSTACK(endio.c),
ATOMIC_INIT(1),
};
- unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
- unsigned i = 0;
+ unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
+ unsigned int i = 0;
while (1) {
- unsigned j;
+ unsigned int j;
+
i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
if (unlikely(i == bitmap_bits))
break;
@@ -481,8 +522,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
region.sector += wc->start_sector;
atomic_inc(&endio.count);
- req.bi_op = REQ_OP_WRITE;
- req.bi_op_flags = REQ_SYNC;
+ req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
req.mem.type = DM_IO_VMA;
req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
req.client = wc->dm_io;
@@ -490,7 +530,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
req.notify.context = &endio;
/* writing via async dm-io (implied by notify.fn above) won't return an error */
- (void) dm_io(&req, 1, &region, NULL);
+ (void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
i = j;
}
@@ -513,22 +553,21 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
region.bdev = wc->ssd_dev->bdev;
region.sector = 0;
- region.count = PAGE_SIZE;
+ region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
if (unlikely(region.sector + region.count > wc->metadata_sectors))
region.count = wc->metadata_sectors - region.sector;
region.sector += wc->start_sector;
- req.bi_op = REQ_OP_WRITE;
- req.bi_op_flags = REQ_SYNC | REQ_FUA;
+ req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
req.mem.type = DM_IO_VMA;
req.mem.ptr.vma = (char *)wc->memory_map;
req.client = wc->dm_io;
req.notify.fn = NULL;
req.notify.context = NULL;
- r = dm_io(&req, 1, &region, NULL);
+ r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
writecache_error(wc, r, "error writing superblock");
}
@@ -536,7 +575,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
- wmb();
+ pmem_wmb();
else
ssd_commit_flushed(wc, wait_for_ios);
}
@@ -550,14 +589,13 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
region.bdev = dev->bdev;
region.sector = 0;
region.count = 0;
- req.bi_op = REQ_OP_WRITE;
- req.bi_op_flags = REQ_PREFLUSH;
+ req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
req.mem.type = DM_IO_KMEM;
req.mem.ptr.addr = NULL;
req.client = wc->dm_io;
req.notify.fn = NULL;
- r = dm_io(&req, 1, &region, NULL);
+ r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
if (unlikely(r))
writecache_error(wc, r, "error flushing metadata: %d", r);
}
@@ -584,20 +622,21 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
if (unlikely(!node)) {
if (!(flags & WFE_RETURN_FOLLOWING))
return NULL;
- if (read_original_sector(wc, e) >= block) {
- return e;
- } else {
- node = rb_next(&e->rb_node);
- if (unlikely(!node))
- return NULL;
- e = container_of(node, struct wc_entry, rb_node);
+ if (read_original_sector(wc, e) >= block)
return e;
- }
+
+ node = rb_next(&e->rb_node);
+ if (unlikely(!node))
+ return NULL;
+
+ e = container_of(node, struct wc_entry, rb_node);
+ return e;
}
}
while (1) {
struct wc_entry *e2;
+
if (flags & WFE_LOWEST_SEQ)
node = rb_prev(&e->rb_node);
else
@@ -640,6 +679,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
{
if (WC_MODE_SORT_FREELIST(wc)) {
struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
+
if (unlikely(!*node))
wc->current_free = e;
while (*node) {
@@ -665,7 +705,7 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc)
static void writecache_max_age_timer(struct timer_list *t)
{
- struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+ struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);
if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
queue_work(wc->writeback_wq, &wc->writeback_work);
@@ -679,6 +719,7 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, s
if (WC_MODE_SORT_FREELIST(wc)) {
struct rb_node *next;
+
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
@@ -730,7 +771,7 @@ static void writecache_poison_lists(struct dm_writecache *wc)
/*
* Catch incorrect access to these values while the device is suspended.
*/
- memset(&wc->tree, -1, sizeof wc->tree);
+ memset(&wc->tree, -1, sizeof(wc->tree));
wc->lru.next = LIST_POISON1;
wc->lru.prev = LIST_POISON2;
wc->freelist.next = LIST_POISON1;
@@ -755,7 +796,7 @@ static void writecache_flush(struct dm_writecache *wc)
bool need_flush_after_free;
wc->uncommitted_blocks = 0;
- del_timer(&wc->autocommit_timer);
+ timer_delete(&wc->autocommit_timer);
if (list_empty(&wc->lru))
return;
@@ -824,7 +865,8 @@ static void writecache_flush_work(struct work_struct *work)
static void writecache_autocommit_timer(struct timer_list *t)
{
- struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
+ struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);
+
if (!writecache_has_error(wc))
queue_work(wc->writeback_wq, &wc->flush_work);
}
@@ -849,10 +891,14 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
if (likely(!e->write_in_progress)) {
if (!discarded_something) {
- writecache_wait_for_ios(wc, READ);
- writecache_wait_for_ios(wc, WRITE);
+ if (!WC_MODE_PMEM(wc)) {
+ writecache_wait_for_ios(wc, READ);
+ writecache_wait_for_ios(wc, WRITE);
+ }
discarded_something = true;
}
+ if (!writecache_entry_is_committed(wc, e))
+ wc->uncommitted_blocks--;
writecache_free_entry(wc, e);
}
@@ -880,8 +926,8 @@ static void writecache_suspend(struct dm_target *ti)
struct dm_writecache *wc = ti->private;
bool flush_on_suspend;
- del_timer_sync(&wc->autocommit_timer);
- del_timer_sync(&wc->max_age_timer);
+ timer_delete_sync(&wc->autocommit_timer);
+ timer_delete_sync(&wc->max_age_timer);
wc_lock(wc);
writecache_flush(wc);
@@ -898,7 +944,8 @@ static void writecache_suspend(struct dm_target *ti)
wc_lock(wc);
if (flush_on_suspend)
wc->writeback_all--;
- while (writecache_wait_for_writeback(wc));
+ while (writecache_wait_for_writeback(wc))
+ ;
if (WC_MODE_PMEM(wc))
persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
@@ -914,11 +961,12 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
if (wc->entries)
return 0;
- wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
+ wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
if (!wc->entries)
return -ENOMEM;
for (b = 0; b < wc->n_blocks; b++) {
struct wc_entry *e = &wc->entries[b];
+
e->index = b;
e->write_in_progress = false;
cond_resched();
@@ -935,14 +983,13 @@ static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors
region.bdev = wc->ssd_dev->bdev;
region.sector = wc->start_sector;
region.count = n_sectors;
- req.bi_op = REQ_OP_READ;
- req.bi_op_flags = REQ_SYNC;
+ req.bi_opf = REQ_OP_READ | REQ_SYNC;
req.mem.type = DM_IO_VMA;
req.mem.ptr.vma = (char *)wc->memory_map;
req.client = wc->dm_io;
req.notify.fn = NULL;
- return dm_io(&req, 1, &region, NULL);
+ return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}
static void writecache_resume(struct dm_target *ti)
@@ -955,12 +1002,15 @@ static void writecache_resume(struct dm_target *ti)
wc_lock(wc);
+ wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
+
if (WC_MODE_PMEM(wc)) {
persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
} else {
r = writecache_read_metadata(wc, wc->metadata_sectors);
if (r) {
size_t sb_entries_offset;
+
writecache_error(wc, r, "unable to read metadata: %d", r);
sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
memset((char *)wc->memory_map + sb_entries_offset, -1,
@@ -978,7 +1028,8 @@ static void writecache_resume(struct dm_target *ti)
}
wc->freelist_size = 0;
- r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+ r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
+ sizeof(uint64_t));
if (r) {
writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
sb_seq_count = cpu_to_le64(0);
@@ -989,12 +1040,14 @@ static void writecache_resume(struct dm_target *ti)
for (b = 0; b < wc->n_blocks; b++) {
struct wc_entry *e = &wc->entries[b];
struct wc_memory_entry wme;
+
if (writecache_has_error(wc)) {
e->original_sector = -1;
e->seq_count = -1;
continue;
}
- r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+ r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
+ sizeof(struct wc_memory_entry));
if (r) {
writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
(unsigned long)b, r);
@@ -1009,6 +1062,7 @@ static void writecache_resume(struct dm_target *ti)
#endif
for (b = 0; b < wc->n_blocks; b++) {
struct wc_entry *e = &wc->entries[b];
+
if (!writecache_entry_is_committed(wc, e)) {
if (read_seq_count(wc, e) != -1) {
erase_this:
@@ -1054,7 +1108,7 @@ erase_this:
wc_unlock(wc);
}
-static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
if (argc != 1)
return -EINVAL;
@@ -1087,7 +1141,7 @@ static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *
return 0;
}
-static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
if (argc != 1)
return -EINVAL;
@@ -1107,7 +1161,7 @@ static void activate_cleaner(struct dm_writecache *wc)
wc->freelist_low_watermark = wc->n_blocks;
}
-static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
if (argc != 1)
return -EINVAL;
@@ -1121,8 +1175,20 @@ static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache
return 0;
}
-static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
- char *result, unsigned maxlen)
+static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
+{
+ if (argc != 1)
+ return -EINVAL;
+
+ wc_lock(wc);
+ memset(&wc->stats, 0, sizeof(wc->stats));
+ wc_unlock(wc);
+
+ return 0;
+}
+
+static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
+ char *result, unsigned int maxlen)
{
int r = -EINVAL;
struct dm_writecache *wc = ti->private;
@@ -1133,6 +1199,8 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
r = process_flush_on_suspend_mesg(argc, argv, wc);
else if (!strcasecmp(argv[0], "cleaner"))
r = process_cleaner_mesg(argc, argv, wc);
+ else if (!strcasecmp(argv[0], "clear_stats"))
+ r = process_clear_stats_mesg(argc, argv, wc);
else
DMERR("unrecognised message received: %s", argv[0]);
@@ -1178,21 +1246,22 @@ static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
void *buf;
- unsigned long flags;
- unsigned size;
+ unsigned int size;
int rw = bio_data_dir(bio);
- unsigned remaining_size = wc->block_size;
+ unsigned int remaining_size = wc->block_size;
do {
struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
- buf = bvec_kmap_irq(&bv, &flags);
+
+ buf = bvec_kmap_local(&bv);
size = bv.bv_len;
if (unlikely(size > remaining_size))
size = remaining_size;
if (rw == READ) {
int r;
- r = memcpy_mcsafe(buf, data, size);
+
+ r = copy_mc_to_kernel(buf, data, size);
flush_dcache_page(bio_page(bio));
if (unlikely(r)) {
writecache_error(wc, r, "hardware memory error when reading data: %d", r);
@@ -1203,7 +1272,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data
memcpy_flushcache_optimized(data, buf, size);
}
- bvec_kunmap_irq(buf, &flags);
+ kunmap_local(buf);
data = (char *)data + size;
remaining_size -= size;
@@ -1238,7 +1307,7 @@ static int writecache_flush_thread(void *data)
bio_end_sector(bio));
wc_unlock(wc);
bio_set_dev(bio, wc->dev->bdev);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
} else {
writecache_flush(wc);
wc_unlock(wc);
@@ -1258,194 +1327,303 @@ static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
bio_list_add(&wc->flush_list, bio);
}
-static int writecache_map(struct dm_target *ti, struct bio *bio)
-{
- struct wc_entry *e;
- struct dm_writecache *wc = ti->private;
+enum wc_map_op {
+ WC_MAP_SUBMIT,
+ WC_MAP_REMAP,
+ WC_MAP_REMAP_ORIGIN,
+ WC_MAP_RETURN,
+ WC_MAP_ERROR,
+};
- bio->bi_private = NULL;
+static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
+ struct wc_entry *e)
+{
+ if (e) {
+ sector_t next_boundary =
+ read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+ if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
+ dm_accept_partial_bio(bio, next_boundary);
+ }
+}
- wc_lock(wc);
+static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
+{
+ enum wc_map_op map_op;
+ struct wc_entry *e;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- if (writecache_has_error(wc))
- goto unlock_error;
+read_next_block:
+ wc->stats.reads++;
+ e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+ if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
+ wc->stats.read_hits++;
if (WC_MODE_PMEM(wc)) {
- writecache_flush(wc);
- if (writecache_has_error(wc))
- goto unlock_error;
- goto unlock_submit;
+ bio_copy_block(wc, bio, memory_data(wc, e));
+ if (bio->bi_iter.bi_size)
+ goto read_next_block;
+ map_op = WC_MAP_SUBMIT;
} else {
- writecache_offload_bio(wc, bio);
- goto unlock_return;
+ dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ bio_set_dev(bio, wc->ssd_dev->bdev);
+ bio->bi_iter.bi_sector = cache_sector(wc, e);
+ if (!writecache_entry_is_committed(wc, e))
+ writecache_wait_for_ios(wc, WRITE);
+ map_op = WC_MAP_REMAP;
}
+ } else {
+ writecache_map_remap_origin(wc, bio, e);
+ wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
+ map_op = WC_MAP_REMAP_ORIGIN;
}
- bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+ return map_op;
+}
- if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
- (wc->block_size / 512 - 1)) != 0)) {
- DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, wc->block_size);
- goto unlock_error;
- }
+static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
+ struct wc_entry *e, bool search_used)
+{
+ unsigned int bio_size = wc->block_size;
+ sector_t start_cache_sec = cache_sector(wc, e);
+ sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
- if (writecache_has_error(wc))
- goto unlock_error;
- if (WC_MODE_PMEM(wc)) {
- writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
- goto unlock_remap_origin;
+ while (bio_size < bio->bi_iter.bi_size) {
+ if (!search_used) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+ (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
} else {
- writecache_offload_bio(wc, bio);
- goto unlock_return;
+ struct wc_entry *f;
+ struct rb_node *next = rb_next(&e->rb_node);
+
+ if (!next)
+ break;
+ f = container_of(next, struct wc_entry, rb_node);
+ if (f != e + 1)
+ break;
+ if (read_original_sector(wc, f) !=
+ read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+ break;
+ if (unlikely(f->write_in_progress))
+ break;
+ if (writecache_entry_is_committed(wc, f))
+ wc->overwrote_committed = true;
+ e = f;
}
+ bio_size += wc->block_size;
+ current_cache_sec += wc->block_size >> SECTOR_SHIFT;
}
- if (bio_data_dir(bio) == READ) {
-read_next_block:
- e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
- if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
- if (WC_MODE_PMEM(wc)) {
- bio_copy_block(wc, bio, memory_data(wc, e));
- if (bio->bi_iter.bi_size)
- goto read_next_block;
- goto unlock_submit;
- } else {
- dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
- bio_set_dev(bio, wc->ssd_dev->bdev);
- bio->bi_iter.bi_sector = cache_sector(wc, e);
- if (!writecache_entry_is_committed(wc, e))
- writecache_wait_for_ios(wc, WRITE);
- goto unlock_remap;
+ bio_set_dev(bio, wc->ssd_dev->bdev);
+ bio->bi_iter.bi_sector = start_cache_sec;
+ dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
+ wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
+ wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
+
+ if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
+ wc->uncommitted_blocks = 0;
+ queue_work(wc->writeback_wq, &wc->flush_work);
+ } else {
+ writecache_schedule_autocommit(wc);
+ }
+}
+
+static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
+{
+ struct wc_entry *e;
+
+ do {
+ bool found_entry = false;
+ bool search_used = false;
+
+ if (writecache_has_error(wc)) {
+ wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
+ return WC_MAP_ERROR;
+ }
+ e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
+ if (e) {
+ if (!writecache_entry_is_committed(wc, e)) {
+ wc->stats.write_hits_uncommitted++;
+ search_used = true;
+ goto bio_copy;
}
- } else {
- if (e) {
- sector_t next_boundary =
- read_original_sector(wc, e) - bio->bi_iter.bi_sector;
- if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
- dm_accept_partial_bio(bio, next_boundary);
- }
+ wc->stats.write_hits_committed++;
+ if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+ wc->overwrote_committed = true;
+ search_used = true;
+ goto bio_copy;
}
- goto unlock_remap_origin;
+ found_entry = true;
+ } else {
+ if (unlikely(wc->cleaner) ||
+ (wc->metadata_only && !(bio->bi_opf & REQ_META)))
+ goto direct_write;
}
- } else {
- do {
- bool found_entry = false;
- if (writecache_has_error(wc))
- goto unlock_error;
- e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
- if (e) {
- if (!writecache_entry_is_committed(wc, e))
- goto bio_copy;
- if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
- wc->overwrote_committed = true;
- goto bio_copy;
- }
- found_entry = true;
- } else {
- if (unlikely(wc->cleaner))
- goto direct_write;
- }
- e = writecache_pop_from_freelist(wc, (sector_t)-1);
- if (unlikely(!e)) {
- if (!found_entry) {
+ e = writecache_pop_from_freelist(wc, (sector_t)-1);
+ if (unlikely(!e)) {
+ if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
- e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
- if (e) {
- sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
- BUG_ON(!next_boundary);
- if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
- dm_accept_partial_bio(bio, next_boundary);
- }
- }
- goto unlock_remap_origin;
- }
- writecache_wait_on_freelist(wc);
- continue;
+ e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+ writecache_map_remap_origin(wc, bio, e);
+ wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
+ wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
+ return WC_MAP_REMAP_ORIGIN;
}
- write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
- writecache_insert_entry(wc, e);
- wc->uncommitted_blocks++;
+ wc->stats.writes_blocked_on_freelist++;
+ writecache_wait_on_freelist(wc);
+ continue;
+ }
+ write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
+ writecache_insert_entry(wc, e);
+ wc->uncommitted_blocks++;
+ wc->stats.writes_allocate++;
bio_copy:
- if (WC_MODE_PMEM(wc)) {
- bio_copy_block(wc, bio, memory_data(wc, e));
- } else {
- unsigned bio_size = wc->block_size;
- sector_t start_cache_sec = cache_sector(wc, e);
- sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
-
- while (bio_size < bio->bi_iter.bi_size) {
- struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
- if (!f)
- break;
- write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
- (bio_size >> SECTOR_SHIFT), wc->seq_count);
- writecache_insert_entry(wc, f);
- wc->uncommitted_blocks++;
- bio_size += wc->block_size;
- current_cache_sec += wc->block_size >> SECTOR_SHIFT;
- }
+ if (WC_MODE_PMEM(wc)) {
+ bio_copy_block(wc, bio, memory_data(wc, e));
+ wc->stats.writes++;
+ } else {
+ writecache_bio_copy_ssd(wc, bio, e, search_used);
+ return WC_MAP_REMAP;
+ }
+ } while (bio->bi_iter.bi_size);
- bio_set_dev(bio, wc->ssd_dev->bdev);
- bio->bi_iter.bi_sector = start_cache_sec;
- dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+ if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
+ writecache_flush(wc);
+ else
+ writecache_schedule_autocommit(wc);
- if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
- wc->uncommitted_blocks = 0;
- queue_work(wc->writeback_wq, &wc->flush_work);
- } else {
- writecache_schedule_autocommit(wc);
- }
- goto unlock_remap;
- }
- } while (bio->bi_iter.bi_size);
+ return WC_MAP_SUBMIT;
+}
- if (unlikely(bio->bi_opf & REQ_FUA ||
- wc->uncommitted_blocks >= wc->autocommit_blocks))
- writecache_flush(wc);
- else
- writecache_schedule_autocommit(wc);
- goto unlock_submit;
+static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
+{
+ if (writecache_has_error(wc))
+ return WC_MAP_ERROR;
+
+ if (WC_MODE_PMEM(wc)) {
+ wc->stats.flushes++;
+ writecache_flush(wc);
+ if (writecache_has_error(wc))
+ return WC_MAP_ERROR;
+ else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
+ return WC_MAP_REMAP_ORIGIN;
+ return WC_MAP_SUBMIT;
}
+ /* SSD: */
+ if (dm_bio_get_target_bio_nr(bio))
+ return WC_MAP_REMAP_ORIGIN;
+ wc->stats.flushes++;
+ writecache_offload_bio(wc, bio);
+ return WC_MAP_RETURN;
+}
-unlock_remap_origin:
- bio_set_dev(bio, wc->dev->bdev);
- wc_unlock(wc);
- return DM_MAPIO_REMAPPED;
+static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
+{
+ wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
-unlock_remap:
- /* make sure that writecache_end_io decrements bio_in_progress: */
- bio->bi_private = (void *)1;
- atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
- wc_unlock(wc);
- return DM_MAPIO_REMAPPED;
+ if (writecache_has_error(wc))
+ return WC_MAP_ERROR;
-unlock_submit:
- wc_unlock(wc);
- bio_endio(bio);
- return DM_MAPIO_SUBMITTED;
+ if (WC_MODE_PMEM(wc)) {
+ writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
+ return WC_MAP_REMAP_ORIGIN;
+ }
+ /* SSD: */
+ writecache_offload_bio(wc, bio);
+ return WC_MAP_RETURN;
+}
-unlock_return:
- wc_unlock(wc);
- return DM_MAPIO_SUBMITTED;
+static int writecache_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_writecache *wc = ti->private;
+ enum wc_map_op map_op;
-unlock_error:
- wc_unlock(wc);
- bio_io_error(bio);
- return DM_MAPIO_SUBMITTED;
+ bio->bi_private = NULL;
+
+ wc_lock(wc);
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+ map_op = writecache_map_flush(wc, bio);
+ goto done;
+ }
+
+ bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+ (wc->block_size / 512 - 1)) != 0)) {
+ DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, wc->block_size);
+ map_op = WC_MAP_ERROR;
+ goto done;
+ }
+
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ map_op = writecache_map_discard(wc, bio);
+ goto done;
+ }
+
+ if (bio_data_dir(bio) == READ)
+ map_op = writecache_map_read(wc, bio);
+ else
+ map_op = writecache_map_write(wc, bio);
+done:
+ switch (map_op) {
+ case WC_MAP_REMAP_ORIGIN:
+ if (likely(wc->pause != 0)) {
+ if (bio_op(bio) == REQ_OP_WRITE) {
+ dm_iot_io_begin(&wc->iot, 1);
+ bio->bi_private = (void *)2;
+ }
+ }
+ bio_set_dev(bio, wc->dev->bdev);
+ wc_unlock(wc);
+ return DM_MAPIO_REMAPPED;
+
+ case WC_MAP_REMAP:
+ /* make sure that writecache_end_io decrements bio_in_progress: */
+ bio->bi_private = (void *)1;
+ atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
+ wc_unlock(wc);
+ return DM_MAPIO_REMAPPED;
+
+ case WC_MAP_SUBMIT:
+ wc_unlock(wc);
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+
+ case WC_MAP_RETURN:
+ wc_unlock(wc);
+ return DM_MAPIO_SUBMITTED;
+
+ case WC_MAP_ERROR:
+ wc_unlock(wc);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+
+ default:
+ BUG();
+ wc_unlock(wc);
+ return DM_MAPIO_KILL;
+ }
}
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
struct dm_writecache *wc = ti->private;
- if (bio->bi_private != NULL) {
+ if (bio->bi_private == (void *)1) {
int dir = bio_data_dir(bio);
+
if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
wake_up(&wc->bio_in_progress_wait[dir]);
+ } else if (bio->bi_private == (void *)2) {
+ dm_iot_io_end(&wc->iot, 1);
}
return 0;
}
@@ -1502,7 +1680,7 @@ static void writecache_copy_endio(int read_err, unsigned long write_err, void *p
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
- unsigned i;
+ unsigned int i;
struct writeback_struct *wb;
struct wc_entry *e;
unsigned long n_walked = 0;
@@ -1615,13 +1793,17 @@ pop_from_list:
return 0;
}
-static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
+static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
struct dm_writecache *wc = wb->wc;
- unsigned block_size = wc->block_size;
+ unsigned int block_size = wc->block_size;
void *address = memory_data(wc, e);
persistent_memory_flush_cache(address, block_size);
+
+ if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
+ return true;
+
return bio_add_page(&wb->bio, persistent_memory_page(address),
block_size, persistent_memory_page_offset(address)) != 0;
}
@@ -1649,7 +1831,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
struct wc_entry *e, *f;
struct bio *bio;
struct writeback_struct *wb;
- unsigned max_pages;
+ unsigned int max_pages;
while (wbl->size) {
wbl->size--;
@@ -1658,21 +1840,24 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
max_pages = e->wc_list_contiguous;
- bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
+ bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
+ GFP_NOIO, &wc->bio_set);
wb = container_of(bio, struct writeback_struct, bio);
wb->wc = wc;
bio->bi_end_io = writecache_writeback_endio;
- bio_set_dev(bio, wc->dev->bdev);
bio->bi_iter.bi_sector = read_original_sector(wc, e);
- if (max_pages <= WB_LIST_INLINE ||
- unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
- GFP_NOIO | __GFP_NORETRY |
- __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+
+ if (unlikely(max_pages > WB_LIST_INLINE))
+ wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
+ GFP_NOIO | __GFP_NORETRY |
+ __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
wb->wc_list = wb->wc_list_inline;
max_pages = WB_LIST_INLINE;
}
- BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
+ BUG_ON(!wc_add_block(wb, e));
wb->wc_list[0] = e;
wb->wc_list_n = 1;
@@ -1682,17 +1867,21 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
if (read_original_sector(wc, f) !=
read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
break;
- if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
+ if (!wc_add_block(wb, f))
break;
wbl->size--;
list_del(&f->lru);
wb->wc_list[wb->wc_list_n++] = f;
e = f;
}
- bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+ if (WC_MODE_FUA(wc))
+ bio->bi_opf |= REQ_FUA;
if (writecache_has_error(wc)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
+ } else if (unlikely(!bio_sectors(bio))) {
+ bio->bi_status = BLK_STS_OK;
+ bio_endio(bio);
} else {
submit_bio(bio);
}
@@ -1708,7 +1897,7 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac
struct copy_struct *c;
while (wbl->size) {
- unsigned n_sectors;
+ unsigned int n_sectors;
wbl->size--;
e = container_of(wbl->list.prev, struct wc_entry, lru);
@@ -1736,6 +1925,14 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac
e = f;
}
+ if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
+ if (to.sector >= wc->data_device_sectors) {
+ writecache_copy_endio(0, 0, c);
+ continue;
+ }
+ from.count = to.count = wc->data_device_sectors - to.sector;
+ }
+
dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
__writeback_throttle(wc, wbl);
@@ -1746,12 +1943,34 @@ static void writecache_writeback(struct work_struct *work)
{
struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
struct blk_plug plug;
- struct wc_entry *f, *uninitialized_var(g), *e = NULL;
+ struct wc_entry *f, *g, *e = NULL;
struct rb_node *node, *next_node;
struct list_head skipped;
struct writeback_list wbl;
unsigned long n_walked;
+ if (!WC_MODE_PMEM(wc)) {
+ /* Wait for any active kcopyd work on behalf of ssd writeback */
+ dm_kcopyd_client_flush(wc->dm_kcopyd);
+ }
+
+ if (likely(wc->pause != 0)) {
+ while (1) {
+ unsigned long idle;
+
+ if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
+ unlikely(dm_suspended(wc->ti)))
+ break;
+ idle = dm_iot_idle_time(&wc->iot);
+ if (idle >= wc->pause)
+ break;
+ idle = wc->pause - idle;
+ if (idle > HZ)
+ idle = HZ;
+ schedule_timeout_idle(idle);
+ }
+ }
+
wc_lock(wc);
restart:
if (writecache_has_error(wc)) {
@@ -1764,9 +1983,8 @@ restart:
goto restart;
}
- if (wc->overwrote_committed) {
+ if (wc->overwrote_committed)
writecache_wait_for_ios(wc, WRITE);
- }
n_walked = 0;
INIT_LIST_HEAD(&skipped);
@@ -1780,8 +1998,9 @@ restart:
n_walked++;
if (unlikely(n_walked > WRITEBACK_LATENCY) &&
- likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
- queue_work(wc->writeback_wq, &wc->writeback_work);
+ likely(!wc->writeback_all)) {
+ if (likely(!dm_suspended(wc->ti)))
+ queue_work(wc->writeback_wq, &wc->writeback_work);
break;
}
@@ -1794,24 +2013,22 @@ restart:
} else
e = container_of(wc->lru.prev, struct wc_entry, lru);
BUG_ON(e->write_in_progress);
- if (unlikely(!writecache_entry_is_committed(wc, e))) {
+ if (unlikely(!writecache_entry_is_committed(wc, e)))
writecache_flush(wc);
- }
+
node = rb_prev(&e->rb_node);
if (node) {
f = container_of(node, struct wc_entry, rb_node);
if (unlikely(read_original_sector(wc, f) ==
read_original_sector(wc, e))) {
BUG_ON(!f->write_in_progress);
- list_del(&e->lru);
- list_add(&e->lru, &skipped);
+ list_move(&e->lru, &skipped);
cond_resched();
continue;
}
}
wc->writeback_size++;
- list_del(&e->lru);
- list_add(&e->lru, &wbl.list);
+ list_move(&e->lru, &wbl.list);
wbl.size++;
e->write_in_progress = true;
e->wc_list_contiguous = 1;
@@ -1846,14 +2063,13 @@ restart:
// break;
wc->writeback_size++;
- list_del(&g->lru);
- list_add(&g->lru, &wbl.list);
+ list_move(&g->lru, &wbl.list);
wbl.size++;
g->write_in_progress = true;
- g->wc_list_contiguous = BIO_MAX_PAGES;
+ g->wc_list_contiguous = BIO_MAX_VECS;
f = g;
e->wc_list_contiguous++;
- if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
+ if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
if (unlikely(wc->writeback_all)) {
next_node = rb_next(&f->rb_node);
if (likely(next_node))
@@ -1888,12 +2104,13 @@ restart:
if (unlikely(wc->writeback_all)) {
wc_lock(wc);
- while (writecache_wait_for_writeback(wc));
+ while (writecache_wait_for_writeback(wc))
+ ;
wc_unlock(wc);
}
}
-static int calculate_memory_size(uint64_t device_size, unsigned block_size,
+static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
uint64_t n_blocks, offset;
@@ -1956,7 +2173,7 @@ static int init_memory(struct dm_writecache *wc)
writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
- writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
+ writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
writecache_commit_flushed(wc, false);
return 0;
@@ -1988,8 +2205,7 @@ static void writecache_dtr(struct dm_target *ti)
if (wc->ssd_dev)
dm_put_device(ti, wc->ssd_dev);
- if (wc->entries)
- vfree(wc->entries);
+ vfree(wc->entries);
if (wc->memory_map) {
if (WC_MODE_PMEM(wc))
@@ -2004,18 +2220,17 @@ static void writecache_dtr(struct dm_target *ti)
if (wc->dm_io)
dm_io_client_destroy(wc->dm_io);
- if (wc->dirty_bitmap)
- vfree(wc->dirty_bitmap);
+ vfree(wc->dirty_bitmap);
kfree(wc);
}
-static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dm_writecache *wc;
struct dm_arg_set as;
const char *string;
- unsigned opt_params;
+ unsigned int opt_params;
size_t offset, data_size;
int i, r;
char dummy;
@@ -2025,7 +2240,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct wc_memory_superblock s;
static struct dm_arg _args[] = {
- {0, 10, "Invalid number of feature args"},
+ {0, 18, "Invalid number of feature args"},
};
as.argc = argc;
@@ -2069,16 +2284,17 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
INIT_WORK(&wc->writeback_work, writecache_writeback);
INIT_WORK(&wc->flush_work, writecache_flush_work);
+ dm_iot_init(&wc->iot);
+
raw_spin_lock_init(&wc->endio_list_lock);
INIT_LIST_HEAD(&wc->endio_list);
- wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
+ wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
if (IS_ERR(wc->endio_thread)) {
r = PTR_ERR(wc->endio_thread);
wc->endio_thread = NULL;
ti->error = "Couldn't spawn endio thread";
goto bad;
}
- wake_up_process(wc->endio_thread);
/*
* Parse the mode (pmem or ssd)
@@ -2116,6 +2332,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
} else {
+ wc->pause = PAUSE_WRITEBACK;
r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
if (r) {
ti->error = "Could not allocate mempool";
@@ -2147,7 +2364,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->error = "Cache data device lookup failed";
goto bad;
}
- wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
+ wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
/*
* Parse the cache block size
@@ -2185,10 +2402,12 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
string = dm_shift_arg(&as), opt_params--;
if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
unsigned long long start_sector;
+
string = dm_shift_arg(&as), opt_params--;
if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
goto invalid_optional;
wc->start_sector = start_sector;
+ wc->start_sector_set = true;
if (wc->start_sector != start_sector ||
wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
goto invalid_optional;
@@ -2198,6 +2417,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto invalid_optional;
if (high_wm_percent < 0 || high_wm_percent > 100)
goto invalid_optional;
+ wc->high_wm_percent_value = high_wm_percent;
wc->high_wm_percent_set = true;
} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
@@ -2205,6 +2425,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto invalid_optional;
if (low_wm_percent < 0 || low_wm_percent > 100)
goto invalid_optional;
+ wc->low_wm_percent_value = low_wm_percent;
wc->low_wm_percent_set = true;
} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
@@ -2217,34 +2438,57 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto invalid_optional;
wc->autocommit_blocks_set = true;
} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
- unsigned autocommit_msecs;
+ unsigned int autocommit_msecs;
+
string = dm_shift_arg(&as), opt_params--;
if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
goto invalid_optional;
if (autocommit_msecs > 3600000)
goto invalid_optional;
wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
+ wc->autocommit_time_value = autocommit_msecs;
wc->autocommit_time_set = true;
} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
- unsigned max_age_msecs;
+ unsigned int max_age_msecs;
+
string = dm_shift_arg(&as), opt_params--;
if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
goto invalid_optional;
if (max_age_msecs > 86400000)
goto invalid_optional;
wc->max_age = msecs_to_jiffies(max_age_msecs);
+ wc->max_age_set = true;
+ wc->max_age_value = max_age_msecs;
} else if (!strcasecmp(string, "cleaner")) {
+ wc->cleaner_set = true;
wc->cleaner = true;
} else if (!strcasecmp(string, "fua")) {
if (WC_MODE_PMEM(wc)) {
wc->writeback_fua = true;
wc->writeback_fua_set = true;
- } else goto invalid_optional;
+ } else
+ goto invalid_optional;
} else if (!strcasecmp(string, "nofua")) {
if (WC_MODE_PMEM(wc)) {
wc->writeback_fua = false;
wc->writeback_fua_set = true;
- } else goto invalid_optional;
+ } else
+ goto invalid_optional;
+ } else if (!strcasecmp(string, "metadata_only")) {
+ wc->metadata_only = true;
+ } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
+ unsigned int pause_msecs;
+
+ if (WC_MODE_PMEM(wc))
+ goto invalid_optional;
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
+ goto invalid_optional;
+ if (pause_msecs > 60000)
+ goto invalid_optional;
+ wc->pause = msecs_to_jiffies(pause_msecs);
+ wc->pause_set = true;
+ wc->pause_value = pause_msecs;
} else {
invalid_optional:
r = -EINVAL;
@@ -2260,6 +2504,12 @@ invalid_optional:
}
if (WC_MODE_PMEM(wc)) {
+ if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
+ r = -EOPNOTSUPP;
+ ti->error = "Asynchronous persistent memory not supported as pmem cache";
+ goto bad;
+ }
+
r = persistent_memory_claim(wc);
if (r) {
ti->error = "Unable to map persistent memory for cache";
@@ -2272,14 +2522,13 @@ invalid_optional:
wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
bio_list_init(&wc->flush_list);
- wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
+ wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
if (IS_ERR(wc->flush_thread)) {
r = PTR_ERR(wc->flush_thread);
wc->flush_thread = NULL;
ti->error = "Couldn't spawn flush thread";
goto bad;
}
- wake_up_process(wc->flush_thread);
r = calculate_memory_size(wc->memory_map_size, wc->block_size,
&n_blocks, &n_metadata_blocks);
@@ -2329,7 +2578,7 @@ invalid_optional:
}
}
- r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+ r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
if (r) {
ti->error = "Hardware memory error when reading superblock";
goto bad;
@@ -2340,7 +2589,8 @@ invalid_optional:
ti->error = "Unable to initialize device";
goto bad;
}
- r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+ r = copy_mc_to_kernel(&s, sb(wc),
+ sizeof(struct wc_memory_superblock));
if (r) {
ti->error = "Hardware memory error when reading superblock";
goto bad;
@@ -2409,7 +2659,7 @@ overflow:
goto bad;
}
- ti->num_flush_bios = 1;
+ ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
ti->flush_supported = true;
ti->num_discard_bios = 1;
@@ -2427,28 +2677,38 @@ bad:
}
static void writecache_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+ unsigned int status_flags, char *result, unsigned int maxlen)
{
struct dm_writecache *wc = ti->private;
- unsigned extra_args;
- unsigned sz = 0;
- uint64_t x;
+ unsigned int extra_args;
+ unsigned int sz = 0;
switch (type) {
case STATUSTYPE_INFO:
- DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
+ DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
+ writecache_has_error(wc),
(unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
- (unsigned long long)wc->writeback_size);
+ (unsigned long long)wc->writeback_size,
+ wc->stats.reads,
+ wc->stats.read_hits,
+ wc->stats.writes,
+ wc->stats.write_hits_uncommitted,
+ wc->stats.write_hits_committed,
+ wc->stats.writes_around,
+ wc->stats.writes_allocate,
+ wc->stats.writes_blocked_on_freelist,
+ wc->stats.flushes,
+ wc->stats.discards);
break;
case STATUSTYPE_TABLE:
DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
wc->dev->name, wc->ssd_dev->name, wc->block_size);
extra_args = 0;
- if (wc->start_sector)
+ if (wc->start_sector_set)
extra_args += 2;
- if (wc->high_wm_percent_set && !wc->cleaner)
+ if (wc->high_wm_percent_set)
extra_args += 2;
- if (wc->low_wm_percent_set && !wc->cleaner)
+ if (wc->low_wm_percent_set)
extra_args += 2;
if (wc->max_writeback_jobs_set)
extra_args += 2;
@@ -2456,45 +2716,50 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
extra_args += 2;
if (wc->autocommit_time_set)
extra_args += 2;
- if (wc->cleaner)
+ if (wc->max_age_set)
+ extra_args += 2;
+ if (wc->cleaner_set)
extra_args++;
if (wc->writeback_fua_set)
extra_args++;
+ if (wc->metadata_only)
+ extra_args++;
+ if (wc->pause_set)
+ extra_args += 2;
DMEMIT("%u", extra_args);
- if (wc->start_sector)
+ if (wc->start_sector_set)
DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
- if (wc->high_wm_percent_set && !wc->cleaner) {
- x = (uint64_t)wc->freelist_high_watermark * 100;
- x += wc->n_blocks / 2;
- do_div(x, (size_t)wc->n_blocks);
- DMEMIT(" high_watermark %u", 100 - (unsigned)x);
- }
- if (wc->low_wm_percent_set && !wc->cleaner) {
- x = (uint64_t)wc->freelist_low_watermark * 100;
- x += wc->n_blocks / 2;
- do_div(x, (size_t)wc->n_blocks);
- DMEMIT(" low_watermark %u", 100 - (unsigned)x);
- }
+ if (wc->high_wm_percent_set)
+ DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
+ if (wc->low_wm_percent_set)
+ DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
if (wc->max_writeback_jobs_set)
DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
if (wc->autocommit_blocks_set)
DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
if (wc->autocommit_time_set)
- DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
- if (wc->max_age != MAX_AGE_UNSPECIFIED)
- DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
- if (wc->cleaner)
+ DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
+ if (wc->max_age_set)
+ DMEMIT(" max_age %u", wc->max_age_value);
+ if (wc->cleaner_set)
DMEMIT(" cleaner");
if (wc->writeback_fua_set)
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+ if (wc->metadata_only)
+ DMEMIT(" metadata_only");
+ if (wc->pause_set)
+ DMEMIT(" pause_writeback %u", wc->pause_value);
+ break;
+ case STATUSTYPE_IMA:
+ *result = '\0';
break;
}
}
static struct target_type writecache_target = {
.name = "writecache",
- .version = {1, 3, 0},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,
@@ -2507,28 +2772,8 @@ static struct target_type writecache_target = {
.iterate_devices = writecache_iterate_devices,
.io_hints = writecache_io_hints,
};
-
-static int __init dm_writecache_init(void)
-{
- int r;
-
- r = dm_register_target(&writecache_target);
- if (r < 0) {
- DMERR("register failed %d", r);
- return r;
- }
-
- return 0;
-}
-
-static void __exit dm_writecache_exit(void)
-{
- dm_unregister_target(&writecache_target);
-}
-
-module_init(dm_writecache_init);
-module_exit(dm_writecache_exit);
+module_dm(writecache);
MODULE_DESCRIPTION(DM_NAME " writecache target");
-MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
+MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");