Diffstat (limited to 'drivers/md/dm-writecache.c')
-rw-r--r--	drivers/md/dm-writecache.c	913
1 file changed, 579 insertions(+), 334 deletions(-)
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 74f3c506f084..d8de4a3076a1 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2018 Red Hat. All rights reserved.
  *
@@ -13,14 +13,15 @@
 #include <linux/dm-io.h>
 #include <linux/dm-kcopyd.h>
 #include <linux/dax.h>
-#include <linux/pfn_t.h>
 #include <linux/libnvdimm.h>
+#include <linux/delay.h>
+#include "dm-io-tracker.h"
 
 #define DM_MSG_PREFIX "writecache"
 
 #define HIGH_WATERMARK 50
 #define LOW_WATERMARK 45
-#define MAX_WRITEBACK_JOBS 0
+#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
 #define ENDIO_LATENCY 16
 #define WRITEBACK_LATENCY 64
 #define AUTOCOMMIT_BLOCKS_SSD 65536
@@ -28,6 +29,7 @@
 #define AUTOCOMMIT_MSEC 1000
 #define MAX_AGE_DIV 16
 #define MAX_AGE_UNSPECIFIED -1UL
+#define PAUSE_WRITEBACK (HZ * 3)
 
 #define BITMAP_GRANULARITY 65536
 #if BITMAP_GRANULARITY < PAGE_SIZE
@@ -35,7 +37,7 @@
 #define BITMAP_GRANULARITY PAGE_SIZE
 #endif
 
-#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
 #define DM_WRITECACHE_HAS_PMEM
 #endif
 
@@ -49,7 +51,7 @@ do {								\
 #define pmem_assign(dest, src)	((dest) = (src))
 #endif
 
-#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
+#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 #endif
 
@@ -73,23 +75,20 @@ struct wc_memory_superblock {
 		};
 		__le64 padding[8];
 	};
-	struct wc_memory_entry entries[0];
+	struct wc_memory_entry entries[];
 };
 
 struct wc_entry {
 	struct rb_node rb_node;
 	struct list_head lru;
 	unsigned short wc_list_contiguous;
-	bool write_in_progress
 #if BITS_PER_LONG == 64
-		:1
-#endif
-	;
-	unsigned long index
-#if BITS_PER_LONG == 64
-		:47
+	bool write_in_progress : 1;
+	unsigned long index : 47;
+#else
+	bool write_in_progress;
+	unsigned long index;
 #endif
-	;
 	unsigned long age;
 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 	uint64_t original_sector;
@@ -123,10 +122,11 @@ struct dm_writecache {
 	size_t freelist_high_watermark;
 	size_t freelist_low_watermark;
 	unsigned long max_age;
+	unsigned long pause;
 
-	unsigned uncommitted_blocks;
-	unsigned autocommit_blocks;
-	unsigned max_writeback_jobs;
+	unsigned int uncommitted_blocks;
+	unsigned int autocommit_blocks;
+	unsigned int max_writeback_jobs;
 
 	int error;
 
@@ -148,9 +148,10 @@ struct dm_writecache {
 	size_t metadata_sectors;
 	size_t n_blocks;
 	uint64_t seq_count;
+	sector_t data_device_sectors;
 	void *block_start;
 	struct wc_entry *entries;
-	unsigned block_size;
+	unsigned int block_size;
 	unsigned char block_size_bits;
 
 	bool pmem_mode:1;
@@ -159,20 +160,33 @@ struct dm_writecache {
 	bool overwrote_committed:1;
 	bool memory_vmapped:1;
 
+	bool start_sector_set:1;
 	bool high_wm_percent_set:1;
 	bool low_wm_percent_set:1;
 	bool max_writeback_jobs_set:1;
 	bool autocommit_blocks_set:1;
 	bool autocommit_time_set:1;
+	bool max_age_set:1;
 	bool writeback_fua_set:1;
 	bool flush_on_suspend:1;
 	bool cleaner:1;
+	bool cleaner_set:1;
+	bool metadata_only:1;
+	bool pause_set:1;
+
+	unsigned int high_wm_percent_value;
+	unsigned int low_wm_percent_value;
+	unsigned int autocommit_time_value;
+	unsigned int max_age_value;
+	unsigned int pause_value;
 
-	unsigned writeback_all;
+	unsigned int writeback_all;
 	struct workqueue_struct *writeback_wq;
 	struct work_struct writeback_work;
 	struct work_struct flush_work;
 
+	struct dm_io_tracker iot;
+
 	struct dm_io_client *dm_io;
 
 	raw_spinlock_t endio_list_lock;
@@ -184,10 +198,23 @@ struct dm_writecache {
 	struct dm_kcopyd_client *dm_kcopyd;
 	unsigned long *dirty_bitmap;
-	unsigned dirty_bitmap_size;
+	unsigned int dirty_bitmap_size;
 
 	struct bio_set bio_set;
 	mempool_t copy_pool;
+
+	struct {
+		unsigned long long reads;
+		unsigned long long read_hits;
+		unsigned long long writes;
+		unsigned long long write_hits_uncommitted;
+		unsigned long long write_hits_committed;
+		unsigned long long writes_around;
+		unsigned long long writes_allocate;
+		unsigned long long writes_blocked_on_freelist;
+		unsigned long long flushes;
+		unsigned long long discards;
+	} stats;
 };
 
 #define WB_LIST_INLINE 16
@@ -196,7 +223,7 @@ struct writeback_struct {
 	struct list_head endio_entry;
 	struct dm_writecache *wc;
 	struct wc_entry **wc_list;
-	unsigned wc_list_n;
+	unsigned int wc_list_n;
 	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
 	struct bio bio;
 };
@@ -205,7 +232,7 @@ struct copy_struct {
 	struct list_head endio_entry;
 	struct dm_writecache *wc;
 	struct wc_entry *e;
-	unsigned n_entries;
+	unsigned int n_entries;
 	int error;
 };
 
@@ -228,9 +255,10 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 	int r;
 	loff_t s;
 	long p, da;
-	pfn_t pfn;
+	unsigned long pfn;
 	int id;
 	struct page **pages;
+	sector_t offset;
 
 	wc->memory_vmapped = false;
 
@@ -245,23 +273,32 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 		goto err1;
 	}
 
+	offset = get_start_sect(wc->ssd_dev->bdev);
+	if (offset & (PAGE_SIZE / 512 - 1)) {
+		r = -EINVAL;
+		goto err1;
+	}
+	offset >>= PAGE_SHIFT - 9;
+
 	id = dax_read_lock();
 
-	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
+	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
+			       &wc->memory_map, &pfn);
 	if (da < 0) {
 		wc->memory_map = NULL;
 		r = da;
 		goto err2;
 	}
-	if (!pfn_t_has_page(pfn)) {
+	if (!pfn_valid(pfn)) {
 		wc->memory_map = NULL;
 		r = -EOPNOTSUPP;
 		goto err2;
 	}
 	if (da != p) {
 		long i;
+
 		wc->memory_map = NULL;
-		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
+		pages = vmalloc_array(p, sizeof(struct page *));
 		if (!pages) {
 			r = -ENOMEM;
 			goto err2;
@@ -269,19 +306,22 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 		i = 0;
 		do {
 			long daa;
-			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
-						NULL, &pfn);
+
+			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
+						p - i, DAX_ACCESS, NULL, &pfn);
 			if (daa <= 0) {
 				r = daa ? daa : -EINVAL;
 				goto err3;
 			}
-			if (!pfn_t_has_page(pfn)) {
+			if (!pfn_valid(pfn)) {
 				r = -EOPNOTSUPP;
 				goto err3;
 			}
			while (daa-- && i < p) {
-				pages[i++] = pfn_t_to_page(pfn);
-				pfn.val++;
+				pages[i++] = pfn_to_page(pfn);
+				pfn++;
+				if (!(i & 15))
+					cond_resched();
 			}
 		} while (i < p);
 		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
@@ -289,7 +329,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 			r = -ENOMEM;
 			goto err3;
 		}
-		kvfree(pages);
+		vfree(pages);
 		wc->memory_vmapped = true;
 	}
 
@@ -300,7 +340,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 	return 0;
 err3:
-	kvfree(pages);
+	vfree(pages);
 err2:
 	dax_read_unlock(id);
 err1:
@@ -309,7 +349,7 @@ err1:
 #else
 static int persistent_memory_claim(struct dm_writecache *wc)
 {
-	BUG();
+	return -EOPNOTSUPP;
 }
 #endif
 
@@ -327,7 +367,7 @@ static struct page *persistent_memory_page(void *addr)
 		return virt_to_page(addr);
 }
 
-static unsigned persistent_memory_page_offset(void *addr)
+static unsigned int persistent_memory_page_offset(void *addr)
 {
 	return (unsigned long)addr & (PAGE_SIZE - 1);
 }
@@ -460,11 +500,12 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 		COMPLETION_INITIALIZER_ONSTACK(endio.c),
 		ATOMIC_INIT(1),
 	};
-	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
-	unsigned i = 0;
+	unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
+	unsigned int i = 0;
 
 	while (1) {
-		unsigned j;
+		unsigned int j;
+
 		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
 		if (unlikely(i == bitmap_bits))
 			break;
@@ -481,8 +522,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 		region.sector += wc->start_sector;
 		atomic_inc(&endio.count);
-		req.bi_op = REQ_OP_WRITE;
-		req.bi_op_flags = REQ_SYNC;
+		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
 		req.mem.type = DM_IO_VMA;
 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
 		req.client = wc->dm_io;
@@ -490,7 +530,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 		req.notify.context = &endio;
 
 		/* writing via async dm-io (implied by notify.fn above) won't return an error */
-		(void) dm_io(&req, 1, &region, NULL);
+		(void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
 		i = j;
 	}
 
@@ -513,22 +553,21 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
 
 	region.bdev = wc->ssd_dev->bdev;
 	region.sector = 0;
-	region.count = PAGE_SIZE;
+	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
 
 	if (unlikely(region.sector + region.count > wc->metadata_sectors))
 		region.count = wc->metadata_sectors - region.sector;
 
 	region.sector += wc->start_sector;
 
-	req.bi_op = REQ_OP_WRITE;
-	req.bi_op_flags = REQ_SYNC | REQ_FUA;
+	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
 	req.mem.type = DM_IO_VMA;
 	req.mem.ptr.vma = (char *)wc->memory_map;
 	req.client = wc->dm_io;
 	req.notify.fn = NULL;
 	req.notify.context = NULL;
 
-	r = dm_io(&req, 1, &region, NULL);
+	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
 	if (unlikely(r))
 		writecache_error(wc, r, "error writing superblock");
 }
@@ -536,7 +575,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 {
 	if (WC_MODE_PMEM(wc))
-		wmb();
+		pmem_wmb();
 	else
 		ssd_commit_flushed(wc, wait_for_ios);
 }
@@ -550,14 +589,13 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
 	region.bdev = dev->bdev;
 	region.sector = 0;
 	region.count = 0;
-	req.bi_op = REQ_OP_WRITE;
-	req.bi_op_flags = REQ_PREFLUSH;
+	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 	req.mem.type = DM_IO_KMEM;
 	req.mem.ptr.addr = NULL;
 	req.client = wc->dm_io;
 	req.notify.fn = NULL;
 
-	r = dm_io(&req, 1, &region, NULL);
+	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
 	if (unlikely(r))
 		writecache_error(wc, r, "error flushing metadata: %d", r);
 }
@@ -584,20 +622,21 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
 		if (unlikely(!node)) {
 			if (!(flags & WFE_RETURN_FOLLOWING))
 				return NULL;
-			if (read_original_sector(wc, e) >= block) {
-				return e;
-			} else {
-				node = rb_next(&e->rb_node);
-				if (unlikely(!node))
-					return NULL;
-				e = container_of(node, struct wc_entry, rb_node);
+			if (read_original_sector(wc, e) >= block)
 				return e;
-			}
+
+			node = rb_next(&e->rb_node);
+			if (unlikely(!node))
+				return NULL;
+
+			e = container_of(node, struct wc_entry, rb_node);
+			return e;
 		}
 	}
 
 	while (1) {
 		struct wc_entry *e2;
+
 		if (flags & WFE_LOWEST_SEQ)
 			node = rb_prev(&e->rb_node);
 		else
@@ -640,6 +679,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
 {
 	if (WC_MODE_SORT_FREELIST(wc)) {
 		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
+
 		if (unlikely(!*node))
 			wc->current_free = e;
 		while (*node) {
@@ -665,7 +705,7 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc)
 
 static void writecache_max_age_timer(struct timer_list *t)
 {
-	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+	struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);
 
 	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
 		queue_work(wc->writeback_wq, &wc->writeback_work);
@@ -679,6 +719,7 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, s
 
 	if (WC_MODE_SORT_FREELIST(wc)) {
 		struct rb_node *next;
+
 		if (unlikely(!wc->current_free))
 			return NULL;
 		e = wc->current_free;
@@ -730,7 +771,7 @@ static void writecache_poison_lists(struct dm_writecache *wc)
 	/*
 	 * Catch incorrect access to these values while the device is suspended.
 	 */
-	memset(&wc->tree, -1, sizeof wc->tree);
+	memset(&wc->tree, -1, sizeof(wc->tree));
 	wc->lru.next = LIST_POISON1;
 	wc->lru.prev = LIST_POISON2;
 	wc->freelist.next = LIST_POISON1;
@@ -755,7 +796,7 @@ static void writecache_flush(struct dm_writecache *wc)
 	bool need_flush_after_free;
 
 	wc->uncommitted_blocks = 0;
-	del_timer(&wc->autocommit_timer);
+	timer_delete(&wc->autocommit_timer);
 	if (list_empty(&wc->lru))
 		return;
 
@@ -824,7 +865,8 @@ static void writecache_flush_work(struct work_struct *work)
 
 static void writecache_autocommit_timer(struct timer_list *t)
 {
-	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
+	struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);
+
 	if (!writecache_has_error(wc))
 		queue_work(wc->writeback_wq, &wc->flush_work);
 }
@@ -849,10 +891,14 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
 
 		if (likely(!e->write_in_progress)) {
 			if (!discarded_something) {
-				writecache_wait_for_ios(wc, READ);
-				writecache_wait_for_ios(wc, WRITE);
+				if (!WC_MODE_PMEM(wc)) {
+					writecache_wait_for_ios(wc, READ);
+					writecache_wait_for_ios(wc, WRITE);
+				}
 				discarded_something = true;
 			}
+			if (!writecache_entry_is_committed(wc, e))
+				wc->uncommitted_blocks--;
 			writecache_free_entry(wc, e);
 		}
 
@@ -880,8 +926,8 @@ static void writecache_suspend(struct dm_target *ti)
 	struct dm_writecache *wc = ti->private;
 	bool flush_on_suspend;
 
-	del_timer_sync(&wc->autocommit_timer);
-	del_timer_sync(&wc->max_age_timer);
+	timer_delete_sync(&wc->autocommit_timer);
+	timer_delete_sync(&wc->max_age_timer);
 
 	wc_lock(wc);
 	writecache_flush(wc);
@@ -898,7 +944,8 @@ static void writecache_suspend(struct dm_target *ti)
 	wc_lock(wc);
 	if (flush_on_suspend)
 		wc->writeback_all--;
-	while (writecache_wait_for_writeback(wc));
+	while (writecache_wait_for_writeback(wc))
+		;
 
 	if (WC_MODE_PMEM(wc))
 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
@@ -914,11 +961,12 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
 
 	if (wc->entries)
 		return 0;
-	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
+	wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
 	if (!wc->entries)
 		return -ENOMEM;
 	for (b = 0; b < wc->n_blocks; b++) {
 		struct wc_entry *e = &wc->entries[b];
+
 		e->index = b;
 		e->write_in_progress = false;
 		cond_resched();
@@ -935,14 +983,13 @@ static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors
 	region.bdev = wc->ssd_dev->bdev;
 	region.sector = wc->start_sector;
 	region.count = n_sectors;
-	req.bi_op = REQ_OP_READ;
-	req.bi_op_flags = REQ_SYNC;
+	req.bi_opf = REQ_OP_READ | REQ_SYNC;
 	req.mem.type = DM_IO_VMA;
 	req.mem.ptr.vma = (char *)wc->memory_map;
 	req.client = wc->dm_io;
 	req.notify.fn = NULL;
 
-	return dm_io(&req, 1, &region, NULL);
+	return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
 }
 
 static void writecache_resume(struct dm_target *ti)
@@ -955,12 +1002,15 @@ static void writecache_resume(struct dm_target *ti)
 
 	wc_lock(wc);
 
+	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
+
 	if (WC_MODE_PMEM(wc)) {
 		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
 	} else {
 		r = writecache_read_metadata(wc, wc->metadata_sectors);
 		if (r) {
 			size_t sb_entries_offset;
+
 			writecache_error(wc, r, "unable to read metadata: %d", r);
 			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
 			memset((char *)wc->memory_map + sb_entries_offset, -1,
@@ -978,7 +1028,8 @@ static void writecache_resume(struct dm_target *ti)
 	}
 	wc->freelist_size = 0;
 
-	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
+			      sizeof(uint64_t));
 	if (r) {
 		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
 		sb_seq_count = cpu_to_le64(0);
@@ -989,12 +1040,14 @@ static void writecache_resume(struct dm_target *ti)
 	for (b = 0; b < wc->n_blocks; b++) {
 		struct wc_entry *e = &wc->entries[b];
 		struct wc_memory_entry wme;
+
 		if (writecache_has_error(wc)) {
 			e->original_sector = -1;
 			e->seq_count = -1;
 			continue;
 		}
-		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
 		if (r) {
 			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
 					 (unsigned long)b, r);
@@ -1009,6 +1062,7 @@ static void writecache_resume(struct dm_target *ti)
 #endif
 	for (b = 0; b < wc->n_blocks; b++) {
 		struct wc_entry *e = &wc->entries[b];
+
 		if (!writecache_entry_is_committed(wc, e)) {
 			if (read_seq_count(wc, e) != -1) {
 erase_this:
@@ -1054,7 +1108,7 @@ erase_this:
 	wc_unlock(wc);
 }
 
-static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
 {
 	if (argc != 1)
 		return -EINVAL;
@@ -1087,7 +1141,7 @@ static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *
 	return 0;
 }
 
-static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
 {
 	if (argc != 1)
 		return -EINVAL;
@@ -1107,7 +1161,7 @@ static void activate_cleaner(struct dm_writecache *wc)
 	wc->freelist_low_watermark = wc->n_blocks;
 }
 
-static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
 {
 	if (argc != 1)
 		return -EINVAL;
@@ -1121,8 +1175,20 @@ static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache
 	return 0;
 }
 
-static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
-			      char *result, unsigned maxlen)
+static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
+{
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	memset(&wc->stats, 0, sizeof(wc->stats));
+	wc_unlock(wc);
+
+	return 0;
+}
+
+static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
+			      char *result, unsigned int maxlen)
 {
 	int r = -EINVAL;
 	struct dm_writecache *wc = ti->private;
@@ -1133,6 +1199,8 @@ static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
 		r = process_flush_on_suspend_mesg(argc, argv, wc);
 	else if (!strcasecmp(argv[0], "cleaner"))
 		r = process_cleaner_mesg(argc, argv, wc);
+	else if (!strcasecmp(argv[0], "clear_stats"))
+		r = process_clear_stats_mesg(argc, argv, wc);
 	else
 		DMERR("unrecognised message received: %s", argv[0]);
 
@@ -1178,21 +1246,22 @@ static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
 {
 	void *buf;
-	unsigned long flags;
-	unsigned size;
+	unsigned int size;
 	int rw = bio_data_dir(bio);
-	unsigned remaining_size = wc->block_size;
+	unsigned int remaining_size = wc->block_size;
 
 	do {
 		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
-		buf = bvec_kmap_irq(&bv, &flags);
+
+		buf = bvec_kmap_local(&bv);
 		size = bv.bv_len;
 		if (unlikely(size > remaining_size))
 			size = remaining_size;
 
 		if (rw == READ) {
 			int r;
-			r = memcpy_mcsafe(buf, data, size);
+
+			r = copy_mc_to_kernel(buf, data, size);
 			flush_dcache_page(bio_page(bio));
 			if (unlikely(r)) {
 				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
@@ -1203,7 +1272,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data
 			memcpy_flushcache_optimized(data, buf, size);
 		}
 
-		bvec_kunmap_irq(buf, &flags);
+		kunmap_local(buf);
 
 		data = (char *)data + size;
 		remaining_size -= size;
@@ -1238,7 +1307,7 @@ static int writecache_flush_thread(void *data)
 					   bio_end_sector(bio));
 			wc_unlock(wc);
 			bio_set_dev(bio, wc->dev->bdev);
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		} else {
 			writecache_flush(wc);
 			wc_unlock(wc);
@@ -1258,194 +1327,303 @@ static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
 	bio_list_add(&wc->flush_list, bio);
 }
 
-static int writecache_map(struct dm_target *ti, struct bio *bio)
-{
-	struct wc_entry *e;
-	struct dm_writecache *wc = ti->private;
+enum wc_map_op {
+	WC_MAP_SUBMIT,
+	WC_MAP_REMAP,
+	WC_MAP_REMAP_ORIGIN,
+	WC_MAP_RETURN,
+	WC_MAP_ERROR,
+};
 
-	bio->bi_private = NULL;
+static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
+					struct wc_entry *e)
+{
+	if (e) {
+		sector_t next_boundary =
+			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
+			dm_accept_partial_bio(bio, next_boundary);
+	}
+}
 
-	wc_lock(wc);
+static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
+{
+	enum wc_map_op map_op;
+	struct wc_entry *e;
 
-	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
-		if (writecache_has_error(wc))
-			goto unlock_error;
+read_next_block:
+	wc->stats.reads++;
+	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+	if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
+		wc->stats.read_hits++;
 		if (WC_MODE_PMEM(wc)) {
-			writecache_flush(wc);
-			if (writecache_has_error(wc))
-				goto unlock_error;
-			goto unlock_submit;
+			bio_copy_block(wc, bio, memory_data(wc, e));
+			if (bio->bi_iter.bi_size)
+				goto read_next_block;
+			map_op = WC_MAP_SUBMIT;
 		} else {
-			writecache_offload_bio(wc, bio);
-			goto unlock_return;
+			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+			bio_set_dev(bio, wc->ssd_dev->bdev);
+			bio->bi_iter.bi_sector = cache_sector(wc, e);
+			if (!writecache_entry_is_committed(wc, e))
+				writecache_wait_for_ios(wc, WRITE);
+			map_op = WC_MAP_REMAP;
 		}
+	} else {
+		writecache_map_remap_origin(wc, bio, e);
+		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
+		map_op = WC_MAP_REMAP_ORIGIN;
 	}
 
-	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+	return map_op;
+}
 
-	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
-				(wc->block_size / 512 - 1)) != 0)) {
-		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
-		      (unsigned long long)bio->bi_iter.bi_sector,
-		      bio->bi_iter.bi_size, wc->block_size);
-		goto unlock_error;
-	}
+static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
+				    struct wc_entry *e, bool search_used)
+{
+	unsigned int bio_size = wc->block_size;
+	sector_t start_cache_sec = cache_sector(wc, e);
+	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
 
-	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
-		if (writecache_has_error(wc))
-			goto unlock_error;
-		if (WC_MODE_PMEM(wc)) {
-			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
-			goto unlock_remap_origin;
+	while (bio_size < bio->bi_iter.bi_size) {
+		if (!search_used) {
+			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+
+			if (!f)
+				break;
+			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+							(bio_size >> SECTOR_SHIFT), wc->seq_count);
+			writecache_insert_entry(wc, f);
+			wc->uncommitted_blocks++;
 		} else {
-			writecache_offload_bio(wc, bio);
-			goto unlock_return;
+			struct wc_entry *f;
+			struct rb_node *next = rb_next(&e->rb_node);
+
+			if (!next)
+				break;
+			f = container_of(next, struct wc_entry, rb_node);
+			if (f != e + 1)
+				break;
+			if (read_original_sector(wc, f) !=
+			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+				break;
+			if (unlikely(f->write_in_progress))
+				break;
+			if (writecache_entry_is_committed(wc, f))
+				wc->overwrote_committed = true;
+			e = f;
 		}
+		bio_size += wc->block_size;
+		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
 	}
 
-	if (bio_data_dir(bio) == READ) {
-read_next_block:
-		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
-		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
-			if (WC_MODE_PMEM(wc)) {
-				bio_copy_block(wc, bio, memory_data(wc, e));
-				if (bio->bi_iter.bi_size)
-					goto read_next_block;
-				goto unlock_submit;
-			} else {
-				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
-				bio_set_dev(bio, wc->ssd_dev->bdev);
-				bio->bi_iter.bi_sector = cache_sector(wc, e);
-				if (!writecache_entry_is_committed(wc, e))
-					writecache_wait_for_ios(wc, WRITE);
-				goto unlock_remap;
+	bio_set_dev(bio, wc->ssd_dev->bdev);
+	bio->bi_iter.bi_sector = start_cache_sec;
+	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
+	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
+	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
+
+	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
+		wc->uncommitted_blocks = 0;
+		queue_work(wc->writeback_wq, &wc->flush_work);
+	} else {
+		writecache_schedule_autocommit(wc);
+	}
+}
+
+static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
+{
+	struct wc_entry *e;
+
+	do {
+		bool found_entry = false;
+		bool search_used = false;
+
+		if (writecache_has_error(wc)) {
+			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
+			return WC_MAP_ERROR;
+		}
+		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
+		if (e) {
+			if (!writecache_entry_is_committed(wc, e)) {
+				wc->stats.write_hits_uncommitted++;
+				search_used = true;
+				goto bio_copy;
 			}
-		} else {
-			if (e) {
-				sector_t next_boundary =
-					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
-				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
-					dm_accept_partial_bio(bio, next_boundary);
-				}
+			wc->stats.write_hits_committed++;
+			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+				wc->overwrote_committed = true;
+				search_used = true;
+				goto bio_copy;
 			}
-			goto unlock_remap_origin;
+			found_entry = true;
+		} else {
+			if (unlikely(wc->cleaner) ||
+			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
+				goto direct_write;
 		}
-	} else {
-		do {
-			bool found_entry = false;
-			if (writecache_has_error(wc))
-				goto unlock_error;
-			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
-			if (e) {
-				if (!writecache_entry_is_committed(wc, e))
-					goto bio_copy;
-				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
-					wc->overwrote_committed = true;
-					goto bio_copy;
-				}
-				found_entry = true;
-			} else {
-				if (unlikely(wc->cleaner))
-					goto direct_write;
-			}
-			e = writecache_pop_from_freelist(wc, (sector_t)-1);
-			if (unlikely(!e)) {
-				if (!found_entry) {
+		e = writecache_pop_from_freelist(wc, (sector_t)-1);
+		if (unlikely(!e)) {
+			if (!WC_MODE_PMEM(wc) && !found_entry) {
 direct_write:
-					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
-					if (e) {
-						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
-						BUG_ON(!next_boundary);
-						if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
-							dm_accept_partial_bio(bio, next_boundary);
-						}
-					}
-					goto unlock_remap_origin;
-				}
-				writecache_wait_on_freelist(wc);
-				continue;
+				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+				writecache_map_remap_origin(wc, bio, e);
+				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
+				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
+				return WC_MAP_REMAP_ORIGIN;
 			}
-			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
-			writecache_insert_entry(wc, e);
-			wc->uncommitted_blocks++;
+			wc->stats.writes_blocked_on_freelist++;
+			writecache_wait_on_freelist(wc);
+			continue;
+		}
+		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
+		writecache_insert_entry(wc, e);
+		wc->uncommitted_blocks++;
+		wc->stats.writes_allocate++;
 bio_copy:
-			if (WC_MODE_PMEM(wc)) {
-				bio_copy_block(wc, bio, memory_data(wc, e));
-			} else {
-				unsigned bio_size = wc->block_size;
-				sector_t start_cache_sec = cache_sector(wc, e);
-				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
-
-				while (bio_size < bio->bi_iter.bi_size) {
-					struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
-					if (!f)
-						break;
-					write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
-									(bio_size >> SECTOR_SHIFT), wc->seq_count);
-					writecache_insert_entry(wc, f);
-					wc->uncommitted_blocks++;
-					bio_size += wc->block_size;
-					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
-				}
+		if (WC_MODE_PMEM(wc)) {
+			bio_copy_block(wc, bio, memory_data(wc, e));
+			wc->stats.writes++;
+		} else {
+			writecache_bio_copy_ssd(wc, bio, e, search_used);
+			return WC_MAP_REMAP;
+		}
+	} while (bio->bi_iter.bi_size);
 
-				bio_set_dev(bio, wc->ssd_dev->bdev);
-				bio->bi_iter.bi_sector = start_cache_sec;
-				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
+		writecache_flush(wc);
+	else
+		writecache_schedule_autocommit(wc);
 
-				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
-					wc->uncommitted_blocks = 0;
-					queue_work(wc->writeback_wq, &wc->flush_work);
-				} else {
-					writecache_schedule_autocommit(wc);
-				}
-				goto unlock_remap;
-			}
-		} while (bio->bi_iter.bi_size);
+	return WC_MAP_SUBMIT;
+}
 
-		if (unlikely(bio->bi_opf & REQ_FUA ||
-			     wc->uncommitted_blocks >= wc->autocommit_blocks))
-			writecache_flush(wc);
-		else
-			writecache_schedule_autocommit(wc);
-		goto unlock_submit;
+static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
+{
+	if (writecache_has_error(wc))
+		return WC_MAP_ERROR;
+
+	if (WC_MODE_PMEM(wc)) {
+		wc->stats.flushes++;
+		writecache_flush(wc);
+		if (writecache_has_error(wc))
+			return WC_MAP_ERROR;
+		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
+			return WC_MAP_REMAP_ORIGIN;
+		return WC_MAP_SUBMIT;
 	}
+	/* SSD: */
+	if (dm_bio_get_target_bio_nr(bio))
+		return WC_MAP_REMAP_ORIGIN;
+	wc->stats.flushes++;
+	writecache_offload_bio(wc, bio);
+	return WC_MAP_RETURN;
+}
 
-unlock_remap_origin:
-	bio_set_dev(bio, wc->dev->bdev);
-	wc_unlock(wc);
-	return DM_MAPIO_REMAPPED;
+static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
+{
+	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
 
-unlock_remap:
-	/* make sure that writecache_end_io decrements bio_in_progress: */
-	bio->bi_private = (void *)1;
-	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
-	wc_unlock(wc);
-	return DM_MAPIO_REMAPPED;
+	if (writecache_has_error(wc))
+		return WC_MAP_ERROR;
 
-unlock_submit:
-	wc_unlock(wc);
-	bio_endio(bio);
-	return DM_MAPIO_SUBMITTED;
+	if (WC_MODE_PMEM(wc)) {
+		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
+		return WC_MAP_REMAP_ORIGIN;
+	}
+	/* SSD: */
+	writecache_offload_bio(wc, bio);
+	return WC_MAP_RETURN;
+}
 
-unlock_return:
-	wc_unlock(wc);
-	return DM_MAPIO_SUBMITTED;
+static int writecache_map(struct dm_target *ti, struct bio *bio)
+{
+	struct dm_writecache *wc = ti->private;
+	enum wc_map_op map_op;
 
-unlock_error:
-	wc_unlock(wc);
-	bio_io_error(bio);
-	return DM_MAPIO_SUBMITTED;
+	bio->bi_private = NULL;
+
+	wc_lock(wc);
+
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+		map_op = writecache_map_flush(wc, bio);
+		goto done;
+	}
+
+	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+	if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+		      (wc->block_size / 512 - 1)) != 0)) {
+		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
+		      (unsigned long long)bio->bi_iter.bi_sector,
+		      bio->bi_iter.bi_size, wc->block_size);
+		map_op = WC_MAP_ERROR;
+		goto done;
+	}
+
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		map_op = writecache_map_discard(wc, bio);
+		goto done;
+	}
+
+	if (bio_data_dir(bio) == READ)
+		map_op = writecache_map_read(wc, bio);
+	else
+		map_op = writecache_map_write(wc, bio);
+done:
+	switch (map_op) {
+	case WC_MAP_REMAP_ORIGIN:
+		if (likely(wc->pause != 0)) {
+			if (bio_op(bio) == REQ_OP_WRITE) {
+				dm_iot_io_begin(&wc->iot, 1);
+				bio->bi_private = (void *)2;
+			}
+		}
+		bio_set_dev(bio, wc->dev->bdev);
+		wc_unlock(wc);
+		return DM_MAPIO_REMAPPED;
+
+	case WC_MAP_REMAP:
+		/* make sure that writecache_end_io decrements bio_in_progress: */
+		bio->bi_private = (void *)1;
+		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
+		wc_unlock(wc);
+		return DM_MAPIO_REMAPPED;
+
+	case WC_MAP_SUBMIT:
+		wc_unlock(wc);
+		bio_endio(bio);
+		return DM_MAPIO_SUBMITTED;
+
+	case WC_MAP_RETURN:
+		wc_unlock(wc);
+		return DM_MAPIO_SUBMITTED;
+
+	case WC_MAP_ERROR:
+		wc_unlock(wc);
+		bio_io_error(bio);
+		return DM_MAPIO_SUBMITTED;
+
+	default:
+		BUG();
+		wc_unlock(wc);
+		return DM_MAPIO_KILL;
+	}
 }
 
 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
 {
 	struct dm_writecache *wc = ti->private;
 
-	if (bio->bi_private != NULL) {
+	if (bio->bi_private == (void *)1) {
 		int dir = bio_data_dir(bio);
+
 		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
 			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
 				wake_up(&wc->bio_in_progress_wait[dir]);
+	} else if (bio->bi_private == (void *)2) {
+		dm_iot_io_end(&wc->iot, 1);
 	}
 	return 0;
 }
@@ -1502,7 +1680,7 @@ static void writecache_copy_endio(int read_err, unsigned long write_err, void *p
 
 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
 {
-	unsigned i;
+	unsigned int i;
 	struct writeback_struct *wb;
 	struct wc_entry *e;
 	unsigned long n_walked = 0;
@@ -1615,13 +1793,17 @@ pop_from_list:
 	return 0;
 }
 
-static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
+static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
 	struct dm_writecache *wc = wb->wc;
-	unsigned block_size = wc->block_size;
+	unsigned int block_size = wc->block_size;
 	void *address = memory_data(wc, e);
 
 	persistent_memory_flush_cache(address, block_size);
+
+	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
+		return true;
+
 	return bio_add_page(&wb->bio, persistent_memory_page(address),
 			    block_size, persistent_memory_page_offset(address)) != 0;
 }
@@ -1649,7 +1831,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
 	struct wc_entry *e, *f;
 	struct bio *bio;
 	struct writeback_struct *wb;
-	unsigned max_pages;
+	unsigned int max_pages;
 
 	while (wbl->size) {
 		wbl->size--;
@@ -1658,21 +1840,24 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
 
 		max_pages = e->wc_list_contiguous;
 
-		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
+		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
+				       GFP_NOIO, &wc->bio_set);
 		wb = container_of(bio, struct writeback_struct, bio);
 		wb->wc = wc;
 		bio->bi_end_io = writecache_writeback_endio;
-		bio_set_dev(bio, wc->dev->bdev);
 		bio->bi_iter.bi_sector = read_original_sector(wc, e);
-		if (max_pages <= WB_LIST_INLINE ||
-		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
-							   GFP_NOIO | __GFP_NORETRY |
-							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
+
+		if (unlikely(max_pages > WB_LIST_INLINE))
+			wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
+						    GFP_NOIO | __GFP_NORETRY |
+						    __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+		if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
 			wb->wc_list = wb->wc_list_inline;
 			max_pages = WB_LIST_INLINE;
 		}
 
-		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
+		BUG_ON(!wc_add_block(wb, e));
 
 		wb->wc_list[0] = e;
 		wb->wc_list_n = 1;
@@ -1682,17 +1867,21 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
 			if (read_original_sector(wc, f) !=
 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
 				break;
-			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
+			if (!wc_add_block(wb, f))
 				break;
 			wbl->size--;
 			list_del(&f->lru);
 			wb->wc_list[wb->wc_list_n++] = f;
 			e = f;
 		}
-		bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+		if (WC_MODE_FUA(wc))
+			bio->bi_opf |= REQ_FUA;
 		if (writecache_has_error(wc)) {
 			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
+		} else if (unlikely(!bio_sectors(bio))) {
+			bio->bi_status = BLK_STS_OK;
+			bio_endio(bio);
 		} else {
 			submit_bio(bio);
 		}
@@ -1708,7 +1897,7 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac
 	struct copy_struct *c;
 
 	while (wbl->size) {
-		unsigned n_sectors;
+		unsigned int n_sectors;
 
 		wbl->size--;
 		e = container_of(wbl->list.prev, struct wc_entry, lru);
@@ -1736,6 +1925,14 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac
 			e = f;
 		}
 
+		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
+			if (to.sector >= wc->data_device_sectors) {
+				writecache_copy_endio(0, 0, c);
+				continue;
+			}
+			from.count = to.count = wc->data_device_sectors - to.sector;
+		}
+
 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
 
 		__writeback_throttle(wc, wbl);
@@ -1746,12 +1943,34 @@
 static void writecache_writeback(struct work_struct *work)
 {
 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
 	struct blk_plug plug;
-	struct wc_entry *f, *uninitialized_var(g), *e = NULL;
+	struct wc_entry *f, *g, *e = NULL;
 	struct rb_node *node, *next_node;
 	struct list_head skipped;
 	struct writeback_list wbl;
 	unsigned long n_walked;
 
+	if (!WC_MODE_PMEM(wc)) {
+		/* Wait for any active kcopyd work on behalf of ssd writeback */
+		dm_kcopyd_client_flush(wc->dm_kcopyd);
+	}
+
+	if (likely(wc->pause != 0)) {
+		while (1) {
+			unsigned long idle;
+
+			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
+			    unlikely(dm_suspended(wc->ti)))
+				break;
+			idle = dm_iot_idle_time(&wc->iot);
+			if (idle >= wc->pause)
+				break;
+			idle = wc->pause - idle;
+			if (idle > HZ)
+				idle = HZ;
+			schedule_timeout_idle(idle);
+		}
+	}
+
 	wc_lock(wc);
restart:
 	if (writecache_has_error(wc)) {
@@ -1764,9 +1983,8 @@ restart:
 		goto restart;
 	}
 
-	if (wc->overwrote_committed) {
+	if (wc->overwrote_committed)
 		writecache_wait_for_ios(wc, WRITE);
-	}
 
 	n_walked = 0;
 	INIT_LIST_HEAD(&skipped);
@@ -1780,8 +1998,9 @@ restart:
 
 		n_walked++;
 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
-		    likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
-			queue_work(wc->writeback_wq, &wc->writeback_work);
+		    likely(!wc->writeback_all)) {
+			if (likely(!dm_suspended(wc->ti)))
+				queue_work(wc->writeback_wq, &wc->writeback_work);
 			break;
 		}
 
@@ -1794,24 +2013,22 @@ restart:
 		} else
 			e = container_of(wc->lru.prev, struct wc_entry, lru);
 		BUG_ON(e->write_in_progress);
-		if (unlikely(!writecache_entry_is_committed(wc, e))) {
+		if (unlikely(!writecache_entry_is_committed(wc, e)))
 			writecache_flush(wc);
-		}
+
 		node = rb_prev(&e->rb_node);
 		if (node) {
 			f = container_of(node, struct wc_entry, rb_node);
 			if (unlikely(read_original_sector(wc, f) ==
 				     read_original_sector(wc, e))) {
 				BUG_ON(!f->write_in_progress);
-				list_del(&e->lru);
-				list_add(&e->lru, &skipped);
+				list_move(&e->lru, &skipped);
 				cond_resched();
 				continue;
 			}
 		}
 		wc->writeback_size++;
-		list_del(&e->lru);
-		list_add(&e->lru, &wbl.list);
+		list_move(&e->lru, &wbl.list);
 		wbl.size++;
 		e->write_in_progress = true;
 		e->wc_list_contiguous = 1;
@@ -1846,14 +2063,13 @@ restart:
 				//	break;
 
 				wc->writeback_size++;
-				list_del(&g->lru);
-				list_add(&g->lru, &wbl.list);
+				list_move(&g->lru, &wbl.list);
 				wbl.size++;
 				g->write_in_progress = true;
-				g->wc_list_contiguous = BIO_MAX_PAGES;
+				g->wc_list_contiguous = BIO_MAX_VECS;
 				f = g;
 				e->wc_list_contiguous++;
-				if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
+				if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
 					if (unlikely(wc->writeback_all)) {
 						next_node = rb_next(&f->rb_node);
 						if (likely(next_node))
@@ -1888,12 +2104,13 @@ restart:
 
 	if (unlikely(wc->writeback_all)) {
 		wc_lock(wc);
-		while (writecache_wait_for_writeback(wc));
+		while (writecache_wait_for_writeback(wc))
+			;
 		wc_unlock(wc);
 	}
 }
 
-static int calculate_memory_size(uint64_t device_size, unsigned block_size,
+static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
 				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
 {
 	uint64_t n_blocks, offset;
@@ -1956,7 +2173,7 @@ static int init_memory(struct dm_writecache *wc)
 	writecache_flush_all_metadata(wc);
 	writecache_commit_flushed(wc, false);
 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
-	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
+	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
 	writecache_commit_flushed(wc, false);
 
 	return 0;
@@ -1988,8 +2205,7 @@ static void writecache_dtr(struct dm_target *ti)
 	if (wc->ssd_dev)
 		dm_put_device(ti, wc->ssd_dev);
 
-	if (wc->entries)
-		vfree(wc->entries);
+	vfree(wc->entries);
 
 	if (wc->memory_map) {
 		if (WC_MODE_PMEM(wc))
@@ -2004,18 +2220,17 @@ static void writecache_dtr(struct dm_target *ti)
 	if (wc->dm_io)
 		dm_io_client_destroy(wc->dm_io);
 
-	if (wc->dirty_bitmap)
-		vfree(wc->dirty_bitmap);
+	vfree(wc->dirty_bitmap);
 
 	kfree(wc);
 }
 
-static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct dm_writecache *wc;
 	struct dm_arg_set as;
 	const char *string;
-	unsigned opt_params;
+	unsigned int opt_params;
 	size_t offset, data_size;
 	int i, r;
 	char dummy;
@@ -2025,7 +2240,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct wc_memory_superblock s;
 
 	static struct dm_arg _args[] = {
-		{0, 10, "Invalid number of feature args"},
+		{0, 18, "Invalid number of feature args"},
 	};
 
 	as.argc = argc;
@@ -2069,16 +2284,17 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	INIT_WORK(&wc->writeback_work, writecache_writeback);
 	INIT_WORK(&wc->flush_work, writecache_flush_work);
 
+	dm_iot_init(&wc->iot);
+
 	raw_spin_lock_init(&wc->endio_list_lock);
 	INIT_LIST_HEAD(&wc->endio_list);
-	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
+	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
 	if (IS_ERR(wc->endio_thread)) {
 		r = PTR_ERR(wc->endio_thread);
 		wc->endio_thread = NULL;
 		ti->error = "Couldn't spawn endio thread";
 		goto bad;
 	}
-	wake_up_process(wc->endio_thread);
 
 	/*
 	 * Parse the mode (pmem or ssd)
@@ -2116,6 +2332,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			goto bad;
 		}
 	} else {
+		wc->pause = PAUSE_WRITEBACK;
 		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
 		if (r) {
 			ti->error = "Could not allocate mempool";
@@ -2147,7 +2364,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cache data device lookup failed";
 		goto bad;
 	}
-	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
+	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);

 	/*
 	 * Parse the cache block size
@@ -2185,10 +2402,12 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		string = dm_shift_arg(&as), opt_params--;
 		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
 			unsigned long long start_sector;
+
 			string = dm_shift_arg(&as), opt_params--;
 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
 				goto invalid_optional;
 			wc->start_sector = start_sector;
+			wc->start_sector_set = true;
 			if (wc->start_sector != start_sector ||
 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
 				goto invalid_optional;
@@ -2198,6 +2417,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 				goto invalid_optional;
 			if (high_wm_percent < 0 || high_wm_percent > 100)
 				goto invalid_optional;
+			wc->high_wm_percent_value = high_wm_percent;
 			wc->high_wm_percent_set = true;
 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
 			string = dm_shift_arg(&as), opt_params--;
@@ -2205,6 +2425,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 				goto invalid_optional;
 			if (low_wm_percent < 0 || low_wm_percent > 100)
 				goto invalid_optional;
+			wc->low_wm_percent_value = low_wm_percent;
 			wc->low_wm_percent_set = true;
 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
 			string = dm_shift_arg(&as), opt_params--;
@@ -2217,34 +2438,57 @@ static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 				goto invalid_optional;
 			wc->autocommit_blocks_set = true;
 		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
-			unsigned autocommit_msecs;
+			unsigned int autocommit_msecs;
+
 			string = dm_shift_arg(&as), opt_params--;
 			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
 				goto invalid_optional;
 			if (autocommit_msecs > 3600000)
 				goto invalid_optional;
 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
+			wc->autocommit_time_value = autocommit_msecs;
 			wc->autocommit_time_set = true;
 		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
-			unsigned max_age_msecs;
+			unsigned int max_age_msecs;
+
 			string = dm_shift_arg(&as), opt_params--;
 			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
 				goto invalid_optional;
 			if (max_age_msecs > 86400000)
 				goto invalid_optional;
 			wc->max_age = msecs_to_jiffies(max_age_msecs);
+			wc->max_age_set = true;
+			wc->max_age_value = max_age_msecs;
 		} else if (!strcasecmp(string, "cleaner")) {
+			wc->cleaner_set = true;
 			wc->cleaner = true;
 		} else if (!strcasecmp(string, "fua")) {
 			if (WC_MODE_PMEM(wc)) {
 				wc->writeback_fua = true;
 				wc->writeback_fua_set = true;
-			} else goto invalid_optional;
+			} else
+				goto invalid_optional;
 		} else if (!strcasecmp(string, "nofua")) {
 			if (WC_MODE_PMEM(wc)) {
 				wc->writeback_fua = false;
 				wc->writeback_fua_set = true;
-			} else goto invalid_optional;
+			} else
+				goto invalid_optional;
+		} else if (!strcasecmp(string, "metadata_only")) {
+			wc->metadata_only = true;
+		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
+			unsigned int pause_msecs;
+
+			if (WC_MODE_PMEM(wc))
+				goto invalid_optional;
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
+				goto invalid_optional;
+			if (pause_msecs > 60000)
+				goto invalid_optional;
+			wc->pause = msecs_to_jiffies(pause_msecs);
+			wc->pause_set = true;
+			wc->pause_value = pause_msecs;
 		} else {
invalid_optional:
 			r = -EINVAL;
@@ -2260,6 +2504,12 @@ invalid_optional:
 	}
 
 	if (WC_MODE_PMEM(wc)) {
+		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
+			r = -EOPNOTSUPP;
+			ti->error = "Asynchronous persistent memory not supported as pmem cache";
+			goto bad;
+		}
+
 		r = persistent_memory_claim(wc);
 		if (r) {
 			ti->error = "Unable to map persistent memory for cache";
@@ -2272,14 +2522,13 @@ invalid_optional:
 		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
 
 		bio_list_init(&wc->flush_list);
-		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
+		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
 		if (IS_ERR(wc->flush_thread)) {
 			r = PTR_ERR(wc->flush_thread);
 			wc->flush_thread = NULL;
 			ti->error = "Couldn't spawn flush thread";
 			goto bad;
 		}
-		wake_up_process(wc->flush_thread);
 
 		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
 					  &n_blocks, &n_metadata_blocks);
@@ -2329,7 +2578,7 @@ invalid_optional:
 		}
 	}
 
-	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
 	if (r) {
 		ti->error = "Hardware memory error when reading superblock";
 		goto bad;
@@ -2340,7 +2589,8 @@ invalid_optional:
 			ti->error = "Unable to initialize device";
 			goto bad;
 		}
-		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
 		if (r) {
 			ti->error = "Hardware memory error when reading superblock";
 			goto bad;
@@ -2409,7 +2659,7 @@ overflow:
 		goto bad;
 	}
 
-	ti->num_flush_bios = 1;
+	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
 	ti->flush_supported = true;
 	ti->num_discard_bios = 1;
 
@@ -2427,28 +2677,38 @@ bad:
 }
 
 static void writecache_status(struct dm_target *ti, status_type_t type,
-			      unsigned status_flags, char *result, unsigned maxlen)
+			      unsigned int status_flags, char *result, unsigned int maxlen)
 {
 	struct dm_writecache *wc = ti->private;
-	unsigned extra_args;
-	unsigned sz = 0;
-	uint64_t x;
+	unsigned int extra_args;
+	unsigned int sz = 0;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
-		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
+		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
+		       writecache_has_error(wc),
 		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
-		       (unsigned long long)wc->writeback_size);
+		       (unsigned long long)wc->writeback_size,
+		       wc->stats.reads,
+		       wc->stats.read_hits,
+		       wc->stats.writes,
+		       wc->stats.write_hits_uncommitted,
+		       wc->stats.write_hits_committed,
+		       wc->stats.writes_around,
+		       wc->stats.writes_allocate,
+		       wc->stats.writes_blocked_on_freelist,
+		       wc->stats.flushes,
+		       wc->stats.discards);
 		break;
 	case STATUSTYPE_TABLE:
 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
 		extra_args = 0;
-		if (wc->start_sector)
+		if (wc->start_sector_set)
 			extra_args += 2;
-		if (wc->high_wm_percent_set && !wc->cleaner)
+		if (wc->high_wm_percent_set)
 			extra_args += 2;
-		if (wc->low_wm_percent_set && !wc->cleaner)
+		if (wc->low_wm_percent_set)
 			extra_args += 2;
 		if (wc->max_writeback_jobs_set)
 			extra_args += 2;
@@ -2456,45 +2716,50 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 			extra_args += 2;
 		if (wc->autocommit_time_set)
 			extra_args += 2;
-		if (wc->cleaner)
+		if (wc->max_age_set)
+			extra_args += 2;
+		if (wc->cleaner_set)
 			extra_args++;
 		if (wc->writeback_fua_set)
 			extra_args++;
+		if (wc->metadata_only)
+			extra_args++;
+		if (wc->pause_set)
+			extra_args += 2;
 
 		DMEMIT("%u", extra_args);
-		if (wc->start_sector)
+		if (wc->start_sector_set)
 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
-		if (wc->high_wm_percent_set && !wc->cleaner) {
-			x = (uint64_t)wc->freelist_high_watermark * 100;
-			x += wc->n_blocks / 2;
-			do_div(x, (size_t)wc->n_blocks);
-			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
-		}
-		if (wc->low_wm_percent_set && !wc->cleaner) {
-			x = (uint64_t)wc->freelist_low_watermark * 100;
-			x += wc->n_blocks / 2;
-			do_div(x, (size_t)wc->n_blocks);
-			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
-		}
+		if (wc->high_wm_percent_set)
+			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
+		if (wc->low_wm_percent_set)
+			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
 		if (wc->max_writeback_jobs_set)
 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
 		if (wc->autocommit_blocks_set)
 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
 		if (wc->autocommit_time_set)
-			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
-		if (wc->max_age != MAX_AGE_UNSPECIFIED)
-			DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
-		if (wc->cleaner)
+			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
+		if (wc->max_age_set)
+			DMEMIT(" max_age %u", wc->max_age_value);
+		if (wc->cleaner_set)
 			DMEMIT(" cleaner");
 		if (wc->writeback_fua_set)
 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+		if (wc->metadata_only)
+			DMEMIT(" metadata_only");
+		if (wc->pause_set)
+			DMEMIT(" pause_writeback %u", wc->pause_value);
+		break;
+	case STATUSTYPE_IMA:
+		*result = '\0';
 		break;
 	}
 }
 
 static struct target_type writecache_target = {
 	.name			= "writecache",
-	.version		= {1, 3, 0},
+	.version		= {1, 6, 0},
 	.module			= THIS_MODULE,
 	.ctr			= writecache_ctr,
 	.dtr			= writecache_dtr,
@@ -2507,28 +2772,8 @@ static struct target_type writecache_target = {
 	.iterate_devices	= writecache_iterate_devices,
 	.io_hints		= writecache_io_hints,
 };
-
-static int __init dm_writecache_init(void)
-{
-	int r;
-
-	r = dm_register_target(&writecache_target);
-	if (r < 0) {
-		DMERR("register failed %d", r);
-		return r;
-	}
-
-	return 0;
-}
-
-static void __exit dm_writecache_exit(void)
-{
-	dm_unregister_target(&writecache_target);
-}
-
-module_init(dm_writecache_init);
-module_exit(dm_writecache_exit);
+module_dm(writecache);
 
 MODULE_DESCRIPTION(DM_NAME " writecache target");
-MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
+MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
 MODULE_LICENSE("GPL");
