From 94dace8c85717588c2b4d116759cc3253f47d0eb Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 26 Jul 2021 13:59:49 +0200 Subject: block/rnbd-clt: Use put_cpu_ptr after get_cpu_ptr This patch replaces put_cpu_var() with put_cpu_ptr(), because get_cpu_ptr() should be paired with put_cpu_ptr(). Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210726115950.470543-2-jinpu.wang@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index e9cc413495f0..bd4a41afbbfc 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -271,7 +271,7 @@ unlock: */ if (cpu_q) *cpup = cpu_q->cpu; - put_cpu_var(sess->cpu_rr); + put_cpu_ptr(sess->cpu_rr); if (q) rnbd_clt_dev_requeue(q); -- cgit From 3087b335b5316cd180aa4c5a28abaa890905634e Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Mon, 26 Jul 2021 13:59:50 +0200 Subject: block/rnbd: Use sysfs_emit instead of s*printf function for sysfs show The sysfs_emit() function was added to be aware of the PAGE_SIZE maximum of the temporary buffer used for outputting sysfs content, so no overruns are possible. Replace all uses of the s*printf() functions in the sysfs show functions with sysfs_emit(). Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210726115950.470543-3-jinpu.wang@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 33 +++++++++++++++------------------ drivers/block/rnbd/rnbd-srv-sysfs.c | 14 +++++++------- 2 files changed, 22 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 324afdd63a96..4b93fd83bf79 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -227,17 +227,17 @@ static ssize_t state_show(struct kobject *kobj, switch (dev->dev_state) { case DEV_STATE_INIT: - return snprintf(page, PAGE_SIZE, "init\n"); + return sysfs_emit(page, "init\n"); case DEV_STATE_MAPPED: /* TODO fix cli tool before changing to proper state */ - return snprintf(page, PAGE_SIZE, "open\n"); + return sysfs_emit(page, "open\n"); case DEV_STATE_MAPPED_DISCONNECTED: /* TODO fix cli tool before changing to proper state */ - return snprintf(page, PAGE_SIZE, "closed\n"); + return sysfs_emit(page, "closed\n"); case DEV_STATE_UNMAPPED: - return snprintf(page, PAGE_SIZE, "unmapped\n"); + return sysfs_emit(page, "unmapped\n"); default: - return snprintf(page, PAGE_SIZE, "unknown\n"); + return sysfs_emit(page, "unknown\n"); } } @@ -263,7 +263,7 @@ static ssize_t mapping_path_show(struct kobject *kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname); + return sysfs_emit(page, "%s\n", dev->pathname); } static struct kobj_attribute rnbd_clt_mapping_path_attr = @@ -276,8 +276,7 @@ static ssize_t access_mode_show(struct kobject *kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return snprintf(page, PAGE_SIZE, "%s\n", - rnbd_access_mode_str(dev->access_mode)); + return sysfs_emit(page, "%s\n", rnbd_access_mode_str(dev->access_mode)); } static struct kobj_attribute rnbd_clt_access_mode = @@ -286,8 +285,8 @@ static struct kobj_attribute rnbd_clt_access_mode = static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo <normal|force> > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo <normal|force> > %s\n", + attr->attr.name); } static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj, @@ -357,9 +356,8 @@ static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, - "Usage: echo <new size in sectors> > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo <new size in sectors> > %s\n", + attr->attr.name); } static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, @@ -390,8 +388,7 @@ static struct kobj_attribute rnbd_clt_resize_dev_attr = static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo <1> > %s\n", attr->attr.name); } static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj, @@ -436,7 +433,7 @@ static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname); + return sysfs_emit(page, "%s\n", dev->sess->sessname); } static struct kobj_attribute rnbd_clt_session_attr = @@ -499,8 +496,8 @@ static ssize_t rnbd_clt_map_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, - "Usage: echo \"[dest_port=server port number] sessname=<name of the rtrs session> path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path=<full path on server side> [access_mode=<ro|rw|migration>] [nr_poll_queues=<number of queues>]\" > %s\n\naddr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n", + return sysfs_emit(page, + "Usage: echo \"[dest_port=server port number] sessname=<name of the rtrs session> path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path=<full path on server side> [access_mode=<ro|rw|migration>] [nr_poll_queues=<number of queues>]\" > %s\n\naddr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n", attr->attr.name); } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index acf5fced11ef..4db98e0e76f0 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -90,8 +90,8 @@ static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%d\n", - !(sess_dev->open_flags & FMODE_WRITE)); + return sysfs_emit(page, "%d\n", + !(sess_dev->open_flags & FMODE_WRITE)); } static struct kobj_attribute rnbd_srv_dev_session_ro_attr = @@ -105,8 +105,8 @@ static ssize_t access_mode_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", - rnbd_access_mode_str(sess_dev->access_mode)); + return sysfs_emit(page, "%s\n", + rnbd_access_mode_str(sess_dev->access_mode)); } static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr = @@ -119,7 +119,7 @@ static ssize_t mapping_path_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname); + return sysfs_emit(page, "%s\n", sess_dev->pathname); } static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = @@ -128,8 +128,8 @@ static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo 1 > %s\n", + attr->attr.name); } static ssize_t
rnbd_srv_dev_session_force_close_store(struct kobject *kobj, -- cgit From da20b58d5bbbb0d23ae9530992a37d0f0d1787a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 6 Aug 2021 12:06:01 +0100 Subject: xen-blkfront: Remove redundant assignment to variable err The variable err is being assigned a value that is never read; the assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210806110601.11386-1-colin.king@canonical.com Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d83fee21f6c5..715bfa8aca7f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1092,7 +1092,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = xlbd_reserve_minors(minor, nr_minors); if (err) return err; - err = -ENODEV; memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; -- cgit From fad7cd3310db3099f95dd34312c77740fbc455e5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 4 Aug 2021 10:12:12 +0800 Subject: nbd: add the check to prevent overflow in __nbd_ioctl() If the user specifies a large enough value for the NBD blocks option, it may trigger a signed integer overflow, which may cause nbd->config->bytesize to become a bogus value, zero in particular. UBSAN: Undefined behaviour in drivers/block/nbd.c:325:31 signed integer overflow: 1024 * 4611686155866341414 cannot be represented in type 'long long int' [...] Call trace: [...] handle_overflow+0x188/0x1dc lib/ubsan.c:192 __ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213 nbd_size_set drivers/block/nbd.c:325 [inline] __nbd_ioctl drivers/block/nbd.c:1342 [inline] nbd_ioctl+0x998/0xa10 drivers/block/nbd.c:1395 __blkdev_driver_ioctl block/ioctl.c:311 [inline] [...] Although it is not a big deal, silence the UBSAN warning by limiting the input value.
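A minimal userspace sketch of the check being added (hypothetical demo code, not the driver source; on modern compilers the kernel's check_mul_overflow() expands to the __builtin_mul_overflow() builtin used here, and the demo assumes a 64-bit build):

	#include <stdio.h>

	/* Stand-in for the NBD_SET_SIZE_BLOCKS path: refuse the
	 * multiplication instead of letting it wrap. */
	static int set_size_blocks(unsigned long arg, long long blksize)
	{
		long long bytesize;	/* plays the role of loff_t */

		/* __builtin_mul_overflow() returns true if the product does
		 * not fit in bytesize, so the caller can reject the input. */
		if (__builtin_mul_overflow((long long)arg, blksize, &bytesize))
			return -1;	/* the driver returns -EINVAL here */

		printf("bytesize = %lld\n", bytesize);
		return 0;
	}

	int main(void)
	{
		set_size_blocks(8, 1024);			/* ok: 8192 */
		set_size_blocks(4611686155866341414UL, 1024);	/* overflow: rejected */
		return 0;
	}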
Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210804021212.990223-1-libaokun1@huawei.com [axboe: dropped unlikely()] Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c38317979f74..f82264835794 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1384,6 +1384,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; + loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: @@ -1398,8 +1399,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_SIZE: return nbd_set_size(nbd, arg, config->blksize); case NBD_SET_SIZE_BLOCKS: - return nbd_set_size(nbd, arg * config->blksize, - config->blksize); + if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + return -EINVAL; + return nbd_set_size(nbd, bytesize, config->blksize); case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); return 0; -- cgit From 68c9417b193d0d174b0ada013602272177e61303 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 11 Aug 2021 14:44:23 +0200 Subject: nbd: do del_gendisk() asynchronously for NBD_DESTROY_ON_DISCONNECT Now open_mutex is used to synchronize partition operations (e.g., blk_drop_partitions() and blkdev_reread_part()); however, it breaks the nbd driver, because nbd may call del_gendisk() in nbd_release() or nbd_genl_disconnect() if NBD_CFLAG_DESTROY_ON_DISCONNECT is enabled, and a deadlock occurs, as shown below: // AB-BA dead-lock nbd_genl_disconnect blkdev_open nbd_disconnect_and_put lock bd_mutex // last ref nbd_put lock nbd_index_mutex del_gendisk nbd_open try lock nbd_index_mutex try lock bd_mutex or // AA dead-lock nbd_release lock bd_mutex nbd_put try lock bd_mutex Instead of fixing the block layer (e.g., introducing another lock), fix the nbd driver to call del_gendisk() in a kworker when NBD_DESTROY_ON_DISCONNECT is enabled. When NBD_DESTROY_ON_DISCONNECT is disabled, the nbd device will always be destroyed through module removal, and there is no risk of deadlock. To ensure that reuse of an nbd index succeeds, move the call to idr_remove() after del_gendisk(), so if the reused index is not found in nbd_index_idr, the old disk must have been deleted. Reuse the existing destroy_complete mechanism to ensure that nbd_genl_connect() waits for the completion of del_gendisk(). Also add a new workqueue for nbd removal, so nbd_cleanup() can ensure all removals complete before it exits.
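The fix is an instance of the common deferred-teardown pattern: take the final teardown off the lock-holding path and run it from a dedicated workqueue. A condensed, hypothetical sketch of that pattern (simplified names, not the literal driver code):

	#include <linux/workqueue.h>
	#include <linux/slab.h>

	/* Created at init, e.g. alloc_workqueue("mydev-del", WQ_UNBOUND, 0). */
	static struct workqueue_struct *del_wq;

	struct mydev {
		struct work_struct remove_work;	/* INIT_WORK() at device setup */
		/* gendisk, idr index, refcounts, ... */
	};

	static void mydev_remove_work(struct work_struct *work)
	{
		struct mydev *dev = container_of(work, struct mydev,
						 remove_work);

		/* Runs in a kworker that holds none of the open()/release()
		 * locks, so del_gendisk() cannot join the AB-BA cycle above. */
		/* del_gendisk(dev->disk); ... then idr_remove() and free */
		kfree(dev);
	}

	static void mydev_last_put(struct mydev *dev)
	{
		/* Instead of calling del_gendisk() inline on this path: */
		queue_work(del_wq, &dev->remove_work);
	}

destroy_workqueue(del_wq) at module exit drains any still-queued removals before returning, which is what lets the cleanup path wait for in-flight removals.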
Reported-by: syzbot+0fe7752e52337864d29b@syzkaller.appspotmail.com Fixes: c76f48eb5c08 ("block: take bd_mutex around delete_partitions in del_gendisk") Signed-off-by: Hou Tao Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-2-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f82264835794..deefb2cda9bb 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -49,6 +49,7 @@ static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); +static struct workqueue_struct *nbd_del_wq; static int nbd_total_devices = 0; struct nbd_sock { @@ -113,6 +114,7 @@ struct nbd_device { struct mutex config_lock; struct gendisk *disk; struct workqueue_struct *recv_workq; + struct work_struct remove_work; struct list_head list; struct task_struct *task_recv; @@ -233,7 +235,7 @@ static const struct device_attribute backend_attr = { .show = backend_show, }; -static void nbd_dev_remove(struct nbd_device *nbd) +static void nbd_del_disk(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; @@ -242,24 +244,60 @@ static void nbd_dev_remove(struct nbd_device *nbd) blk_cleanup_disk(disk); blk_mq_free_tag_set(&nbd->tag_set); } +} +/* + * Place this in the last just before the nbd is freed to + * make sure that the disk and the related kobject are also + * totally removed to avoid duplicate creation of the same + * one. + */ +static void nbd_notify_destroy_completion(struct nbd_device *nbd) +{ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + nbd->destroy_complete) + complete(nbd->destroy_complete); +} + +static void nbd_dev_remove_work(struct work_struct *work) +{ + struct nbd_device *nbd = + container_of(work, struct nbd_device, remove_work); + + nbd_del_disk(nbd); + + mutex_lock(&nbd_index_mutex); /* - * Place this in the last just before the nbd is freed to - * make sure that the disk and the related kobject are also - * totally removed to avoid duplicate creation of the same - * one. + * Remove from idr after del_gendisk() completes, + * so if the same id is reused, the following + * add_disk() will succeed. 
*/ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) - complete(nbd->destroy_complete); + idr_remove(&nbd_index_idr, nbd->index); + + nbd_notify_destroy_completion(nbd); + mutex_unlock(&nbd_index_mutex); kfree(nbd); } +static void nbd_dev_remove(struct nbd_device *nbd) +{ + /* Call del_gendisk() asynchrounously to prevent deadlock */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) { + queue_work(nbd_del_wq, &nbd->remove_work); + return; + } + + nbd_del_disk(nbd); + idr_remove(&nbd_index_idr, nbd->index); + nbd_notify_destroy_completion(nbd); + kfree(nbd); +} + static void nbd_put(struct nbd_device *nbd) { if (refcount_dec_and_mutex_lock(&nbd->refs, &nbd_index_mutex)) { - idr_remove(&nbd_index_idr, nbd->index); nbd_dev_remove(nbd); mutex_unlock(&nbd_index_mutex); } @@ -1681,6 +1719,7 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; + INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); nbd->destroy_complete = NULL; nbd->backend = NULL; @@ -2418,7 +2457,14 @@ static int __init nbd_init(void) if (register_blkdev(NBD_MAJOR, "nbd")) return -EIO; + nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0); + if (!nbd_del_wq) { + unregister_blkdev(NBD_MAJOR, "nbd"); + return -ENOMEM; + } + if (genl_register_family(&nbd_genl_family)) { + destroy_workqueue(nbd_del_wq); unregister_blkdev(NBD_MAJOR, "nbd"); return -EINVAL; } @@ -2436,7 +2482,10 @@ static int nbd_exit_cb(int id, void *ptr, void *data) struct list_head *list = (struct list_head *)data; struct nbd_device *nbd = ptr; - list_add_tail(&nbd->list, list); + /* Skip nbd that is being removed asynchronously */ + if (refcount_read(&nbd->refs)) + list_add_tail(&nbd->list, list); + return 0; } @@ -2459,6 +2508,9 @@ static void __exit nbd_cleanup(void) nbd_put(nbd); } + /* Also wait for nbd_dev_remove_work() completes */ + destroy_workqueue(nbd_del_wq); + idr_destroy(&nbd_index_idr); genl_unregister_family(&nbd_genl_family); unregister_blkdev(NBD_MAJOR, "nbd"); -- cgit From 3f74e0645c52a08f640380c9c46f9a3a172b9389 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:24 +0200 Subject: nbd: refactor device removal Share common code for the synchronous and workqueue based device removal, and remove the pointless use of refcount_dec_and_mutex_lock. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-3-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index deefb2cda9bb..a9883fbed924 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -259,48 +259,37 @@ static void nbd_notify_destroy_completion(struct nbd_device *nbd) complete(nbd->destroy_complete); } -static void nbd_dev_remove_work(struct work_struct *work) +static void nbd_dev_remove(struct nbd_device *nbd) { - struct nbd_device *nbd = - container_of(work, struct nbd_device, remove_work); - nbd_del_disk(nbd); - mutex_lock(&nbd_index_mutex); /* - * Remove from idr after del_gendisk() completes, - * so if the same id is reused, the following - * add_disk() will succeed. + * Remove from idr after del_gendisk() completes, so if the same ID is + * reused, the following add_disk() will succeed. 
*/ + mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, nbd->index); - nbd_notify_destroy_completion(nbd); mutex_unlock(&nbd_index_mutex); kfree(nbd); } -static void nbd_dev_remove(struct nbd_device *nbd) +static void nbd_dev_remove_work(struct work_struct *work) { - /* Call del_gendisk() asynchrounously to prevent deadlock */ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) { - queue_work(nbd_del_wq, &nbd->remove_work); - return; - } - - nbd_del_disk(nbd); - idr_remove(&nbd_index_idr, nbd->index); - nbd_notify_destroy_completion(nbd); - kfree(nbd); + nbd_dev_remove(container_of(work, struct nbd_device, remove_work)); } static void nbd_put(struct nbd_device *nbd) { - if (refcount_dec_and_mutex_lock(&nbd->refs, - &nbd_index_mutex)) { + if (!refcount_dec_and_test(&nbd->refs)) + return; + + /* Call del_gendisk() asynchrounously to prevent deadlock */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) + queue_work(nbd_del_wq, &nbd->remove_work); + else nbd_dev_remove(nbd); - mutex_unlock(&nbd_index_mutex); - } } static int nbd_disconnected(struct nbd_config *config) -- cgit From 327b501b1d94342fe17a1b6b1a40746e57ddd472 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:25 +0200 Subject: nbd: remove nbd_del_disk Fold nbd_del_disk and remove the pointless NULL check on ->disk given that it is always set for a successfully allocated nbd_device structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-4-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9883fbed924..48530fe01c0f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -235,17 +235,6 @@ static const struct device_attribute backend_attr = { .show = backend_show, }; -static void nbd_del_disk(struct nbd_device *nbd) -{ - struct gendisk *disk = nbd->disk; - - if (disk) { - del_gendisk(disk); - blk_cleanup_disk(disk); - blk_mq_free_tag_set(&nbd->tag_set); - } -} - /* * Place this in the last just before the nbd is freed to * make sure that the disk and the related kobject are also @@ -261,7 +250,11 @@ static void nbd_notify_destroy_completion(struct nbd_device *nbd) static void nbd_dev_remove(struct nbd_device *nbd) { - nbd_del_disk(nbd); + struct gendisk *disk = nbd->disk; + + del_gendisk(disk); + blk_cleanup_disk(disk); + blk_mq_free_tag_set(&nbd->tag_set); /* * Remove from idr after del_gendisk() completes, so if the same ID is -- cgit From 7bdc00cf7e369b3be17f26e5643da28de98d9d6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:26 +0200 Subject: nbd: return the allocated nbd_device from nbd_dev_add Return the device we just allocated instead of doing an extra search for it in the caller. 
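The refactor leans on the kernel's usual pointer-or-errno convention from <linux/err.h>. A small hypothetical sketch of the resulting calling shape (names invented for illustration):

	#include <linux/err.h>
	#include <linux/slab.h>

	struct mydev {
		int index;
		/* tag set, gendisk, ... */
	};

	/* Returns the new object, or an errno encoded in the pointer --
	 * never a bare index that the caller must look up again. */
	static struct mydev *mydev_add(int index)
	{
		struct mydev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

		if (!dev)
			return ERR_PTR(-ENOMEM);
		dev->index = index;
		/* ... allocate tag set, disk, idr entry ... */
		return dev;
	}

	static int mydev_create(int index)
	{
		struct mydev *dev = mydev_add(index);

		if (IS_ERR(dev))
			return PTR_ERR(dev);	/* propagate -ENOMEM etc. */
		/* use dev directly; no idr_find() needed */
		return 0;
	}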
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-5-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 48530fe01c0f..a81b95c66dbf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1683,7 +1683,7 @@ static const struct blk_mq_ops nbd_mq_ops = { .timeout = nbd_xmit_timeout, }; -static int nbd_dev_add(int index) +static struct nbd_device *nbd_dev_add(int index) { struct nbd_device *nbd; struct gendisk *disk; @@ -1755,7 +1755,7 @@ static int nbd_dev_add(int index) sprintf(disk->disk_name, "nbd%d", index); add_disk(disk); nbd_total_devices++; - return index; + return nbd; out_free_idr: idr_remove(&nbd_index_idr, index); @@ -1764,7 +1764,7 @@ out_free_tags: out_free_nbd: kfree(nbd); out: - return err; + return ERR_PTR(err); } static int find_free_cb(int id, void *ptr, void *data) @@ -1850,25 +1850,22 @@ again: if (index == -1) { ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); if (ret == 0) { - int new_index; - new_index = nbd_dev_add(-1); - if (new_index < 0) { + nbd = nbd_dev_add(-1); + if (IS_ERR(nbd)) { mutex_unlock(&nbd_index_mutex); printk(KERN_ERR "nbd: failed to add new device\n"); - return new_index; + return PTR_ERR(nbd); } - nbd = idr_find(&nbd_index_idr, new_index); } } else { nbd = idr_find(&nbd_index_idr, index); if (!nbd) { - ret = nbd_dev_add(index); - if (ret < 0) { + nbd = nbd_dev_add(index); + if (IS_ERR(nbd)) { mutex_unlock(&nbd_index_mutex); printk(KERN_ERR "nbd: failed to add new device\n"); - return ret; + return PTR_ERR(nbd); } - nbd = idr_find(&nbd_index_idr, index); } } if (!nbd) { -- cgit From 6177b56c96ff3b5e23d47f6b6c8630f31145da93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:27 +0200 Subject: nbd: refactor device search and allocation in nbd_genl_connect Use idr_for_each_entry instead of the awkward callback to find an existing device for the index == -1 case, and de-duplicate the device allocation if no existing device was found. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-6-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a81b95c66dbf..8c0e334bdfbf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1767,18 +1767,6 @@ out: return ERR_PTR(err); } -static int find_free_cb(int id, void *ptr, void *data) -{ - struct nbd_device *nbd = ptr; - struct nbd_device **found = data; - - if (!refcount_read(&nbd->config_refs)) { - *found = nbd; - return 1; - } - return 0; -} - /* Netlink interface. 
*/ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { [NBD_ATTR_INDEX] = { .type = NLA_U32 }, @@ -1848,31 +1836,26 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) again: mutex_lock(&nbd_index_mutex); if (index == -1) { - ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); - if (ret == 0) { - nbd = nbd_dev_add(-1); - if (IS_ERR(nbd)) { - mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: failed to add new device\n"); - return PTR_ERR(nbd); + struct nbd_device *tmp; + int id; + + idr_for_each_entry(&nbd_index_idr, tmp, id) { + if (!refcount_read(&tmp->config_refs)) { + nbd = tmp; + break; } } } else { nbd = idr_find(&nbd_index_idr, index); - if (!nbd) { - nbd = nbd_dev_add(index); - if (IS_ERR(nbd)) { - mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: failed to add new device\n"); - return PTR_ERR(nbd); - } - } } + if (!nbd) { - printk(KERN_ERR "nbd: couldn't find device at index %d\n", - index); - mutex_unlock(&nbd_index_mutex); - return -EINVAL; + nbd = nbd_dev_add(index); + if (IS_ERR(nbd)) { + mutex_unlock(&nbd_index_mutex); + pr_err("nbd: failed to add new device\n"); + return PTR_ERR(nbd); + } } if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && -- cgit From 6e4df4c6488165637b95b9701cc862a42a3836ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:28 +0200 Subject: nbd: reduce the nbd_index_mutex scope nbd_index_mutex is currently held over add_disk and inside ->open, which leads to lock order reversals. Refactor the device creation code path so that nbd_dev_add is called without nbd_index_mutex lock held and only takes it for the IDR insertation. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-7-hch@lst.de [axboe: fix whitespace] Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 55 +++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8c0e334bdfbf..0fe82626bf70 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1683,7 +1683,7 @@ static const struct blk_mq_ops nbd_mq_ops = { .timeout = nbd_xmit_timeout, }; -static struct nbd_device *nbd_dev_add(int index) +static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct nbd_device *nbd; struct gendisk *disk; @@ -1709,6 +1709,7 @@ static struct nbd_device *nbd_dev_add(int index) if (err) goto out_free_nbd; + mutex_lock(&nbd_index_mutex); if (index >= 0) { err = idr_alloc(&nbd_index_idr, nbd, index, index + 1, GFP_KERNEL); @@ -1719,6 +1720,7 @@ static struct nbd_device *nbd_dev_add(int index) if (err >= 0) index = err; } + mutex_unlock(&nbd_index_mutex); if (err < 0) goto out_free_tags; nbd->index = index; @@ -1745,7 +1747,7 @@ static struct nbd_device *nbd_dev_add(int index) mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); - refcount_set(&nbd->refs, 1); + refcount_set(&nbd->refs, refs); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; disk->first_minor = index << part_shift; @@ -1849,34 +1851,35 @@ again: nbd = idr_find(&nbd_index_idr, index); } - if (!nbd) { - nbd = nbd_dev_add(index); - if (IS_ERR(nbd)) { + if (nbd) { + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { + nbd->destroy_complete = &destroy_complete; mutex_unlock(&nbd_index_mutex); - pr_err("nbd: failed to add new device\n"); - return PTR_ERR(nbd); + + /* wait until the nbd 
device is completely destroyed */ + wait_for_completion(&destroy_complete); + goto again; } - } - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { - nbd->destroy_complete = &destroy_complete; + if (!refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + if (index == -1) + goto again; + pr_err("nbd: device at index %d is going down\n", + index); + return -EINVAL; + } mutex_unlock(&nbd_index_mutex); - - /* Wait untill the the nbd stuff is totally destroyed */ - wait_for_completion(&destroy_complete); - goto again; - } - - if (!refcount_inc_not_zero(&nbd->refs)) { + } else { mutex_unlock(&nbd_index_mutex); - if (index == -1) - goto again; - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); - return -EINVAL; + + nbd = nbd_dev_add(index, 2); + if (IS_ERR(nbd)) { + pr_err("nbd: failed to add new device\n"); + return PTR_ERR(nbd); + } } - mutex_unlock(&nbd_index_mutex); mutex_lock(&nbd->config_lock); if (refcount_read(&nbd->config_refs)) { @@ -2432,10 +2435,8 @@ static int __init nbd_init(void) } nbd_dbg_init(); - mutex_lock(&nbd_index_mutex); for (i = 0; i < nbds_max; i++) - nbd_dev_add(i); - mutex_unlock(&nbd_index_mutex); + nbd_dev_add(i, 1); return 0; } -- cgit From 9ea9b9c48387edc101d56349492ad9c0492ff78d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Aug 2021 15:23:08 +0200 Subject: remove the lightnvm subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lightnvm supports the OCSSD 1.x and 2.0 specs which were early attempts to produce Open Channel SSDs and never made it into the NVMe spec proper. They have since been superceeded by NVMe enhancements such as ZNS support. Remove the support per the deprecation schedule. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210812132308.38486-1-hch@lst.de Reviewed-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/lightnvm/Kconfig | 44 - drivers/lightnvm/Makefile | 11 - drivers/lightnvm/core.c | 1440 ------------------------- drivers/lightnvm/pblk-cache.c | 137 --- drivers/lightnvm/pblk-core.c | 2151 -------------------------------------- drivers/lightnvm/pblk-gc.c | 726 ------------- drivers/lightnvm/pblk-init.c | 1324 ----------------------- drivers/lightnvm/pblk-map.c | 210 ---- drivers/lightnvm/pblk-rb.c | 858 --------------- drivers/lightnvm/pblk-read.c | 474 --------- drivers/lightnvm/pblk-recovery.c | 874 ---------------- drivers/lightnvm/pblk-rl.c | 254 ----- drivers/lightnvm/pblk-sysfs.c | 728 ------------- drivers/lightnvm/pblk-trace.h | 145 --- drivers/lightnvm/pblk-write.c | 665 ------------ drivers/lightnvm/pblk.h | 1358 ------------------------ drivers/nvme/host/Makefile | 1 - drivers/nvme/host/core.c | 13 - drivers/nvme/host/ioctl.c | 4 +- drivers/nvme/host/lightnvm.c | 1274 ---------------------- drivers/nvme/host/nvme.h | 26 - drivers/nvme/host/pci.c | 6 - 24 files changed, 1 insertion(+), 12725 deletions(-) delete mode 100644 drivers/lightnvm/Kconfig delete mode 100644 drivers/lightnvm/Makefile delete mode 100644 drivers/lightnvm/core.c delete mode 100644 drivers/lightnvm/pblk-cache.c delete mode 100644 drivers/lightnvm/pblk-core.c delete mode 100644 drivers/lightnvm/pblk-gc.c delete mode 100644 drivers/lightnvm/pblk-init.c delete mode 100644 drivers/lightnvm/pblk-map.c delete mode 100644 drivers/lightnvm/pblk-rb.c delete mode 100644 drivers/lightnvm/pblk-read.c delete mode 100644 drivers/lightnvm/pblk-recovery.c delete mode 100644 drivers/lightnvm/pblk-rl.c delete mode 100644 drivers/lightnvm/pblk-sysfs.c delete mode 100644 drivers/lightnvm/pblk-trace.h delete mode 100644 drivers/lightnvm/pblk-write.c delete mode 100644 drivers/lightnvm/pblk.h delete mode 100644 drivers/nvme/host/lightnvm.c (limited to 'drivers') diff --git a/drivers/Kconfig b/drivers/Kconfig index 8bad63417a50..30d2db37cc87 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -51,8 +51,6 @@ source "drivers/net/Kconfig" source "drivers/isdn/Kconfig" -source "drivers/lightnvm/Kconfig" - # input before char - char/joystick depends on it. As does USB. source "drivers/input/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 27c018bdf4de..be5d40ae1488 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -70,7 +70,6 @@ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ obj-$(CONFIG_PARPORT) += parport/ -obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DAX) += dax/ diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig deleted file mode 100644 index 04caa0f2d445..000000000000 --- a/drivers/lightnvm/Kconfig +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Open-Channel SSD NVM configuration -# - -menuconfig NVM - bool "Open-Channel SSD target support (DEPRECATED)" - depends on BLOCK - help - Say Y here to get to enable Open-channel SSDs. - - Open-Channel SSDs implement a set of extension to SSDs, that - exposes direct access to the underlying non-volatile memory. - - If you say N, all options in this submenu will be skipped and disabled - only do this if you know what you are doing. 
- - This code is deprecated and will be removed in Linux 5.15. - -if NVM - -config NVM_PBLK - tristate "Physical Block Device Open-Channel SSD target" - select CRC32 - help - Allows an open-channel SSD to be exposed as a block device to the - host. The target assumes the device exposes raw flash and must be - explicitly managed by the host. - - Please note the disk format is considered EXPERIMENTAL for now. - -if NVM_PBLK - -config NVM_PBLK_DEBUG - bool "PBlk Debug Support" - default n - help - Enables debug support for pblk. This includes extra checks, more - vocal error messages, and extra tracking fields in the pblk sysfs - entries. - -endif # NVM_PBLK_DEBUG - -endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile deleted file mode 100644 index 97d9d7c71550..000000000000 --- a/drivers/lightnvm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Open-Channel SSDs. -# - -obj-$(CONFIG_NVM) := core.o -obj-$(CONFIG_NVM_PBLK) += pblk.o -pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ - pblk-write.o pblk-cache.o pblk-read.o \ - pblk-gc.o pblk-recovery.o pblk-map.o \ - pblk-rl.o pblk-sysfs.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c deleted file mode 100644 index cf8a75494833..000000000000 --- a/drivers/lightnvm/core.c +++ /dev/null @@ -1,1440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 IT University of Copenhagen. All rights reserved. - * Initial release: Matias Bjorling - */ - -#define pr_fmt(fmt) "nvm: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static LIST_HEAD(nvm_tgt_types); -static DECLARE_RWSEM(nvm_tgtt_lock); -static LIST_HEAD(nvm_devices); -static DECLARE_RWSEM(nvm_lock); - -/* Map between virtual and physical channel and lun */ -struct nvm_ch_map { - int ch_off; - int num_lun; - int *lun_offs; -}; - -struct nvm_dev_map { - struct nvm_ch_map *chnls; - int num_ch; -}; - -static void nvm_free(struct kref *ref); - -static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) -{ - struct nvm_target *tgt; - - list_for_each_entry(tgt, &dev->targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; - - return NULL; -} - -static bool nvm_target_exists(const char *name) -{ - struct nvm_dev *dev; - struct nvm_target *tgt; - bool ret = false; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - list_for_each_entry(tgt, &dev->targets, list) { - if (!strcmp(name, tgt->disk->disk_name)) { - ret = true; - mutex_unlock(&dev->mlock); - goto out; - } - } - mutex_unlock(&dev->mlock); - } - -out: - up_write(&nvm_lock); - return ret; -} - -static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) { - if (test_and_set_bit(i, dev->lun_map)) { - pr_err("lun %d already allocated\n", i); - goto err; - } - } - - return 0; -err: - while (--i >= lun_begin) - clear_bit(i, dev->lun_map); - - return -EBUSY; -} - -static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, - int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) - WARN_ON(!test_and_clear_bit(i, dev->lun_map)); -} - -static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_map = tgt_dev->map; - int i, j; - - for (i = 0; i < dev_map->num_ch; i++) { - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs = 
ch_map->lun_offs; - int ch = i + ch_map->ch_off; - - if (clear) { - for (j = 0; j < ch_map->num_lun; j++) { - int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.num_lun) + lun; - - WARN_ON(!test_and_clear_bit(lunid, - dev->lun_map)); - } - } - - kfree(ch_map->lun_offs); - } - - kfree(dev_map->chnls); - kfree(dev_map); - - kfree(tgt_dev->luns); - kfree(tgt_dev); -} - -static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, - u16 lun_begin, u16 lun_end, - u16 op) -{ - struct nvm_tgt_dev *tgt_dev = NULL; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_dev_map *dev_map; - struct ppa_addr *luns; - int num_lun = lun_end - lun_begin + 1; - int luns_left = num_lun; - int num_ch = num_lun / dev->geo.num_lun; - int num_ch_mod = num_lun % dev->geo.num_lun; - int bch = lun_begin / dev->geo.num_lun; - int blun = lun_begin % dev->geo.num_lun; - int lunid = 0; - int lun_balanced = 1; - int sec_per_lun, prev_num_lun; - int i, j; - - num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1; - - dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!dev_map) - goto err_dev; - - dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL); - if (!dev_map->chnls) - goto err_chnls; - - luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL); - if (!luns) - goto err_luns; - - prev_num_lun = (luns_left > dev->geo.num_lun) ? - dev->geo.num_lun : luns_left; - for (i = 0; i < num_ch; i++) { - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; - int *lun_roffs = ch_rmap->lun_offs; - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.num_lun) ? - dev->geo.num_lun : luns_left; - - if (lun_balanced && prev_num_lun != luns_in_chnl) - lun_balanced = 0; - - ch_map->ch_off = ch_rmap->ch_off = bch; - ch_map->num_lun = luns_in_chnl; - - lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_offs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) { - luns[lunid].ppa = 0; - luns[lunid].a.ch = i; - luns[lunid++].a.lun = j; - - lun_offs[j] = blun; - lun_roffs[j + blun] = blun; - } - - ch_map->lun_offs = lun_offs; - - /* when starting a new channel, lun offset is reset */ - blun = 0; - luns_left -= luns_in_chnl; - } - - dev_map->num_ch = num_ch; - - tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); - if (!tgt_dev) - goto err_ch; - - /* Inherit device geometry from parent */ - memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); - - /* Target device only owns a portion of the physical device */ - tgt_dev->geo.num_ch = num_ch; - tgt_dev->geo.num_lun = (lun_balanced) ? 
prev_num_lun : -1; - tgt_dev->geo.all_luns = num_lun; - tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk; - - tgt_dev->geo.op = op; - - sec_per_lun = dev->geo.clba * dev->geo.num_chk; - tgt_dev->geo.total_secs = num_lun * sec_per_lun; - - tgt_dev->q = dev->q; - tgt_dev->map = dev_map; - tgt_dev->luns = luns; - tgt_dev->parent = dev; - - return tgt_dev; -err_ch: - while (--i >= 0) - kfree(dev_map->chnls[i].lun_offs); - kfree(luns); -err_luns: - kfree(dev_map->chnls); -err_chnls: - kfree(dev_map); -err_dev: - return tgt_dev; -} - -static struct nvm_tgt_type *__nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - list_for_each_entry(tt, &nvm_tgt_types, list) - if (!strcmp(name, tt->name)) - return tt; - - return NULL; -} - -static struct nvm_tgt_type *nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - down_write(&nvm_tgtt_lock); - tt = __nvm_find_target_type(name); - up_write(&nvm_tgtt_lock); - - return tt; -} - -static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, - int lun_end) -{ - if (lun_begin > lun_end || lun_end >= geo->all_luns) { - pr_err("lun out of bound (%u:%u > %u)\n", - lun_begin, lun_end, geo->all_luns - 1); - return -EINVAL; - } - - return 0; -} - -static int __nvm_config_simple(struct nvm_dev *dev, - struct nvm_ioctl_create_simple *s) -{ - struct nvm_geo *geo = &dev->geo; - - if (s->lun_begin == -1 && s->lun_end == -1) { - s->lun_begin = 0; - s->lun_end = geo->all_luns - 1; - } - - return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); -} - -static int __nvm_config_extended(struct nvm_dev *dev, - struct nvm_ioctl_create_extended *e) -{ - if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { - e->lun_begin = 0; - e->lun_end = dev->geo.all_luns - 1; - } - - /* op not set falls into target's default */ - if (e->op == 0xFFFF) { - e->op = NVM_TARGET_DEFAULT_OP; - } else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) { - pr_err("invalid over provisioning value\n"); - return -EINVAL; - } - - return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end); -} - -static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) -{ - struct nvm_ioctl_create_extended e; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - struct nvm_tgt_dev *tgt_dev; - void *targetdata; - unsigned int mdts; - int ret; - - switch (create->conf.type) { - case NVM_CONFIG_TYPE_SIMPLE: - ret = __nvm_config_simple(dev, &create->conf.s); - if (ret) - return ret; - - e.lun_begin = create->conf.s.lun_begin; - e.lun_end = create->conf.s.lun_end; - e.op = NVM_TARGET_DEFAULT_OP; - break; - case NVM_CONFIG_TYPE_EXTENDED: - ret = __nvm_config_extended(dev, &create->conf.e); - if (ret) - return ret; - - e = create->conf.e; - break; - default: - pr_err("config type not valid\n"); - return -EINVAL; - } - - tt = nvm_find_target_type(create->tgttype); - if (!tt) { - pr_err("target type %s not found\n", create->tgttype); - return -EINVAL; - } - - if ((tt->flags & NVM_TGT_F_HOST_L2P) != (dev->geo.dom & NVM_RSP_L2P)) { - pr_err("device is incompatible with target L2P type.\n"); - return -EINVAL; - } - - if (nvm_target_exists(create->tgtname)) { - pr_err("target name already exists (%s)\n", - create->tgtname); - return -EINVAL; - } - - ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); - if (ret) - return ret; - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) { - ret = -ENOMEM; - goto err_reserve; - } - - tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); - if (!tgt_dev) { - 
pr_err("could not create target device\n"); - ret = -ENOMEM; - goto err_t; - } - - tdisk = blk_alloc_disk(dev->q->node); - if (!tdisk) { - ret = -ENOMEM; - goto err_dev; - } - - strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = tt->bops; - - targetdata = tt->init(tgt_dev, tdisk, create->flags); - if (IS_ERR(targetdata)) { - ret = PTR_ERR(targetdata); - goto err_init; - } - - tdisk->private_data = targetdata; - tdisk->queue->queuedata = targetdata; - - mdts = (dev->geo.csecs >> 9) * NVM_MAX_VLBA; - if (dev->geo.mdts) { - mdts = min_t(u32, dev->geo.mdts, - (dev->geo.csecs >> 9) * NVM_MAX_VLBA); - } - blk_queue_max_hw_sectors(tdisk->queue, mdts); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - if (tt->sysfs_init && tt->sysfs_init(tdisk)) { - ret = -ENOMEM; - goto err_sysfs; - } - - t->type = tt; - t->disk = tdisk; - t->dev = tgt_dev; - - mutex_lock(&dev->mlock); - list_add_tail(&t->list, &dev->targets); - mutex_unlock(&dev->mlock); - - __module_get(tt->owner); - - return 0; -err_sysfs: - if (tt->exit) - tt->exit(targetdata, true); -err_init: - blk_cleanup_disk(tdisk); -err_dev: - nvm_remove_tgt_dev(tgt_dev, 0); -err_t: - kfree(t); -err_reserve: - nvm_release_luns_err(dev, e.lun_begin, e.lun_end); - return ret; -} - -static void __nvm_remove_target(struct nvm_target *t, bool graceful) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - - del_gendisk(tdisk); - - if (tt->sysfs_exit) - tt->sysfs_exit(tdisk); - - if (tt->exit) - tt->exit(tdisk->private_data, graceful); - - nvm_remove_tgt_dev(t->dev, 1); - blk_cleanup_disk(tdisk); - module_put(t->type->owner); - - list_del(&t->list); - kfree(t); -} - -/** - * nvm_remove_tgt - Removes a target from the media manager - * @remove: ioctl structure with target name to remove. 
- * - * Returns: - * 0: on success - * 1: on not found - * <0: on error - */ -static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) -{ - struct nvm_target *t = NULL; - struct nvm_dev *dev; - - down_read(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - t = nvm_find_target(dev, remove->tgtname); - if (t) { - mutex_unlock(&dev->mlock); - break; - } - mutex_unlock(&dev->mlock); - } - up_read(&nvm_lock); - - if (!t) { - pr_err("failed to remove target %s\n", - remove->tgtname); - return 1; - } - - __nvm_remove_target(t, true); - kref_put(&dev->ref, nvm_free); - - return 0; -} - -static int nvm_register_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap; - int i, j; - - rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!rmap) - goto err_rmap; - - rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map), - GFP_KERNEL); - if (!rmap->chnls) - goto err_chnls; - - for (i = 0; i < dev->geo.num_ch; i++) { - struct nvm_ch_map *ch_rmap; - int *lun_roffs; - int luns_in_chnl = dev->geo.num_lun; - - ch_rmap = &rmap->chnls[i]; - - ch_rmap->ch_off = -1; - ch_rmap->num_lun = luns_in_chnl; - - lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_roffs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) - lun_roffs[j] = -1; - - ch_rmap->lun_offs = lun_roffs; - } - - dev->rmap = rmap; - - return 0; -err_ch: - while (--i >= 0) - kfree(rmap->chnls[i].lun_offs); -err_chnls: - kfree(rmap); -err_rmap: - return -ENOMEM; -} - -static void nvm_unregister_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap = dev->rmap; - int i; - - for (i = 0; i < dev->geo.num_ch; i++) - kfree(rmap->chnls[i].lun_offs); - - kfree(rmap->chnls); - kfree(rmap); -} - -static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev_map *dev_map = tgt_dev->map; - struct nvm_ch_map *ch_map = &dev_map->chnls[p->a.ch]; - int lun_off = ch_map->lun_offs[p->a.lun]; - - p->a.ch += ch_map->ch_off; - p->a.lun += lun_off; -} - -static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->a.ch]; - int lun_roff = ch_rmap->lun_offs[p->a.lun]; - - p->a.ch -= ch_rmap->ch_off; - p->a.lun -= lun_roff; -} - -static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - nvm_map_to_dev(tgt_dev, &ppa_list[i]); - ppa_list[i] = generic_to_dev_addr(tgt_dev->parent, ppa_list[i]); - } -} - -static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa_list[i] = dev_to_generic_addr(tgt_dev->parent, ppa_list[i]); - nvm_map_to_tgt(tgt_dev, &ppa_list[i]); - } -} - -static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_tgt_to_dev(tgt_dev, ppa_list, rqd->nr_ppas); -} - -static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_dev_to_tgt(tgt_dev, ppa_list, rqd->nr_ppas); -} - -int nvm_register_tgt_type(struct nvm_tgt_type *tt) -{ - int ret = 0; - - down_write(&nvm_tgtt_lock); - if (__nvm_find_target_type(tt->name)) - ret = -EEXIST; - else - list_add(&tt->list, &nvm_tgt_types); - up_write(&nvm_tgtt_lock); - - return ret; -} 
-EXPORT_SYMBOL(nvm_register_tgt_type); - -void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) -{ - if (!tt) - return; - - down_write(&nvm_tgtt_lock); - list_del(&tt->list); - up_write(&nvm_tgtt_lock); -} -EXPORT_SYMBOL(nvm_unregister_tgt_type); - -void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, - dma_addr_t *dma_handler) -{ - return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags, - dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_alloc); - -void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) -{ - dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_free); - -static struct nvm_dev *nvm_find_nvm_dev(const char *name) -{ - struct nvm_dev *dev; - - list_for_each_entry(dev, &nvm_devices, devices) - if (!strcmp(name, dev->name)) - return dev; - - return NULL; -} - -static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->pln_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - } - - return 0; -} - -static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, - struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} - -static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd) -{ - int flags = 0; - - if (geo->version == NVM_OCSSD_SPEC_20) - return 0; - - if (rqd->is_seq) - flags |= geo->pln_mode >> 1; - - if (rqd->opcode == NVM_OP_PREAD) - flags |= (NVM_IO_SCRAMBLE_ENABLE | NVM_IO_SUSPEND); - else if (rqd->opcode == NVM_OP_PWRITE) - flags |= NVM_IO_SCRAMBLE_ENABLE; - - return flags; -} - -int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - /* In case of error, fail with right address format */ - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - return ret; -} -EXPORT_SYMBOL(nvm_submit_io); - -static void nvm_sync_end_io(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->private; - - complete(waiting); -} - -static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rqd->end_io = nvm_sync_end_io; - rqd->private = &wait; - - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - return ret; - - wait_for_completion_io(&wait); - - return 0; -} - -int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - ret = nvm_submit_io_wait(dev, rqd, 
buf); - - return ret; -} -EXPORT_SYMBOL(nvm_submit_io_sync); - -void nvm_end_io(struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *tgt_dev = rqd->dev; - - /* Convert address space */ - if (tgt_dev) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - - if (rqd->end_io) - rqd->end_io(rqd); -} -EXPORT_SYMBOL(nvm_end_io); - -static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - if (!dev->ops->submit_io) - return -ENODEV; - - rqd->dev = NULL; - rqd->flags = nvm_set_flags(&dev->geo, rqd); - - return nvm_submit_io_wait(dev, rqd, NULL); -} - -static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) -{ - struct nvm_rq rqd = { NULL }; - struct bio bio; - struct bio_vec bio_vec; - struct page *page; - int ret; - - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - bio_init(&bio, &bio_vec, 1); - bio_add_page(&bio, page, PAGE_SIZE, 0); - bio_set_op_attrs(&bio, REQ_OP_READ, 0); - - rqd.bio = &bio; - rqd.opcode = NVM_OP_PREAD; - rqd.is_seq = 1; - rqd.nr_ppas = 1; - rqd.ppa_addr = generic_to_dev_addr(dev, ppa); - - ret = nvm_submit_io_sync_raw(dev, &rqd); - __free_page(page); - if (ret) - return ret; - - return rqd.error; -} - -/* - * Scans a 1.2 chunk first and last page to determine if its state. - * If the chunk is found to be open, also scan it to update the write - * pointer. - */ -static int nvm_bb_chunk_scan(struct nvm_dev *dev, struct ppa_addr ppa, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, pg, pl; - - /* sense first page */ - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) /* valid data */ - meta->state = NVM_CHK_ST_OPEN; - else if (ret > 0) { - /* - * If empty page, the chunk is free, else it is an - * actual io error. In that case, mark it offline. - */ - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - meta->state = NVM_CHK_ST_FREE; - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - goto scan; - default: - return -ret; /* other io error */ - } - } - - /* sense last page */ - ppa.g.pg = geo->num_pg - 1; - ppa.g.pl = geo->num_pln - 1; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { /* Chunk fully written */ - meta->state = NVM_CHK_ST_CLOSED; - meta->wp = geo->clba; - return 0; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - break; - default: - return -ret; /* other io error */ - } - } - -scan: - /* - * chunk is open, we scan sequentially to update the write pointer. - * We make the assumption that targets write data across all planes - * before moving to the next page. - */ - for (pg = 0; pg < geo->num_pg; pg++) { - for (pl = 0; pl < geo->num_pln; pl++) { - ppa.g.pg = pg; - ppa.g.pl = pl; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { - meta->wp += geo->ws_min; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->wp += geo->ws_min; - break; - default: - return -ret; /* other io error */ - } - } - } - } - - return 0; -} - -/* - * folds a bad block list from its plane representation to its - * chunk representation. - * - * If any of the planes status are bad or grown bad, the chunk is marked - * offline. 
If not bad, the first plane state acts as the chunk state. - */ -static int nvm_bb_to_chunk(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, blk, pl, offset, blktype; - - for (blk = 0; blk < geo->num_chk; blk++) { - offset = blk * geo->pln_mode; - blktype = blks[offset]; - - for (pl = 0; pl < geo->pln_mode; pl++) { - if (blks[offset + pl] & - (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { - blktype = blks[offset + pl]; - break; - } - } - - ppa.g.blk = blk; - - meta->wp = 0; - meta->type = NVM_CHK_TP_W_SEQ; - meta->wi = 0; - meta->slba = generic_to_dev_addr(dev, ppa).ppa; - meta->cnlb = dev->geo.clba; - - if (blktype == NVM_BLK_T_FREE) { - ret = nvm_bb_chunk_scan(dev, ppa, meta); - if (ret) - return ret; - } else { - meta->state = NVM_CHK_ST_OFFLINE; - } - - meta++; - } - - return 0; -} - -static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - u8 *blks; - int ch, lun, nr_blks; - int ret = 0; - - ppa.ppa = slba; - ppa = dev_to_generic_addr(dev, ppa); - - if (ppa.g.blk != 0) - return -EINVAL; - - if ((nchks % geo->num_chk) != 0) - return -EINVAL; - - nr_blks = geo->num_chk * geo->pln_mode; - - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - for (ch = ppa.g.ch; ch < geo->num_ch; ch++) { - for (lun = ppa.g.lun; lun < geo->num_lun; lun++) { - struct ppa_addr ppa_gen, ppa_dev; - - if (!nchks) - goto done; - - ppa_gen.ppa = 0; - ppa_gen.g.ch = ch; - ppa_gen.g.lun = lun; - ppa_dev = generic_to_dev_addr(dev, ppa_gen); - - ret = dev->ops->get_bb_tbl(dev, ppa_dev, blks); - if (ret) - goto done; - - ret = nvm_bb_to_chunk(dev, ppa_gen, blks, nr_blks, - meta); - if (ret) - goto done; - - meta += geo->num_chk; - nchks -= geo->num_chk; - } - } -done: - kfree(blks); - return ret; -} - -int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_dev *dev = tgt_dev->parent; - - nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); - - if (dev->geo.version == NVM_OCSSD_SPEC_12) - return nvm_get_bb_meta(dev, (sector_t)ppa.ppa, nchks, meta); - - return dev->ops->get_chk_meta(dev, (sector_t)ppa.ppa, nchks, meta); -} -EXPORT_SYMBOL_GPL(nvm_get_chunk_meta); - -int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_rq rqd; - int ret; - - if (dev->geo.version == NVM_OCSSD_SPEC_20) - return 0; - - if (nr_ppas > NVM_MAX_VLBA) { - pr_err("unable to update all blocks atomically\n"); - return -EINVAL; - } - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); - nvm_rq_tgt_to_dev(tgt_dev, &rqd); - - ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(tgt_dev, &rqd); - if (ret) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL_GPL(nvm_set_chunk_meta); - -static int nvm_core_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret; - - dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), - sizeof(unsigned long), GFP_KERNEL); - if (!dev->lun_map) - return -ENOMEM; - - INIT_LIST_HEAD(&dev->area_list); - INIT_LIST_HEAD(&dev->targets); - mutex_init(&dev->mlock); - spin_lock_init(&dev->lock); - - ret = nvm_register_map(dev); - if (ret) - goto err_fmtype; - - return 0; -err_fmtype: - kfree(dev->lun_map); - return ret; -} - -static void nvm_free(struct kref *ref) -{ 
- struct nvm_dev *dev = container_of(ref, struct nvm_dev, ref); - - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - - if (dev->rmap) - nvm_unregister_map(dev); - - kfree(dev->lun_map); - kfree(dev); -} - -static int nvm_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret = -EINVAL; - - if (dev->ops->identity(dev)) { - pr_err("device could not be identified\n"); - goto err; - } - - pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id, - geo->minor_ver_id, geo->vmnt); - - ret = nvm_core_init(dev); - if (ret) { - pr_err("could not initialize core structures.\n"); - goto err; - } - - pr_info("registered %s [%u/%u/%u/%u/%u]\n", - dev->name, dev->geo.ws_min, dev->geo.ws_opt, - dev->geo.num_chk, dev->geo.all_luns, - dev->geo.num_ch); - return 0; -err: - pr_err("failed to initialize nvm\n"); - return ret; -} - -struct nvm_dev *nvm_alloc_dev(int node) -{ - struct nvm_dev *dev; - - dev = kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); - if (dev) - kref_init(&dev->ref); - - return dev; -} -EXPORT_SYMBOL(nvm_alloc_dev); - -int nvm_register(struct nvm_dev *dev) -{ - int ret, exp_pool_size; - - pr_warn_once("lightnvm support is deprecated and will be removed in Linux 5.15.\n"); - - if (!dev->q || !dev->ops) { - kref_put(&dev->ref, nvm_free); - return -EINVAL; - } - - ret = nvm_init(dev); - if (ret) { - kref_put(&dev->ref, nvm_free); - return ret; - } - - exp_pool_size = max_t(int, PAGE_SIZE, - (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos))); - exp_pool_size = round_up(exp_pool_size, PAGE_SIZE); - - dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", - exp_pool_size); - if (!dev->dma_pool) { - pr_err("could not create dma pool\n"); - kref_put(&dev->ref, nvm_free); - return -ENOMEM; - } - - /* register device with a supported media manager */ - down_write(&nvm_lock); - list_add(&dev->devices, &nvm_devices); - up_write(&nvm_lock); - - return 0; -} -EXPORT_SYMBOL(nvm_register); - -void nvm_unregister(struct nvm_dev *dev) -{ - struct nvm_target *t, *tmp; - - mutex_lock(&dev->mlock); - list_for_each_entry_safe(t, tmp, &dev->targets, list) { - if (t->dev->parent != dev) - continue; - __nvm_remove_target(t, false); - kref_put(&dev->ref, nvm_free); - } - mutex_unlock(&dev->mlock); - - down_write(&nvm_lock); - list_del(&dev->devices); - up_write(&nvm_lock); - - kref_put(&dev->ref, nvm_free); -} -EXPORT_SYMBOL(nvm_unregister); - -static int __nvm_configure_create(struct nvm_ioctl_create *create) -{ - struct nvm_dev *dev; - int ret; - - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(create->dev); - up_write(&nvm_lock); - - if (!dev) { - pr_err("device not found\n"); - return -EINVAL; - } - - kref_get(&dev->ref); - ret = nvm_create_tgt(dev, create); - if (ret) - kref_put(&dev->ref, nvm_free); - - return ret; -} - -static long nvm_ioctl_info(struct file *file, void __user *arg) -{ - struct nvm_ioctl_info *info; - struct nvm_tgt_type *tt; - int tgt_iter = 0; - - info = memdup_user(arg, sizeof(struct nvm_ioctl_info)); - if (IS_ERR(info)) - return PTR_ERR(info); - - info->version[0] = NVM_VERSION_MAJOR; - info->version[1] = NVM_VERSION_MINOR; - info->version[2] = NVM_VERSION_PATCH; - - down_write(&nvm_tgtt_lock); - list_for_each_entry(tt, &nvm_tgt_types, list) { - struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; - - tgt->version[0] = tt->version[0]; - tgt->version[1] = tt->version[1]; - tgt->version[2] = tt->version[2]; - strncpy(tgt->tgtname, tt->name, NVM_TTYPE_NAME_MAX); - - tgt_iter++; - } - - info->tgtsize = tgt_iter; - up_write(&nvm_tgtt_lock); - - if 
(copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { - kfree(info); - return -EFAULT; - } - - kfree(info); - return 0; -} - -static long nvm_ioctl_get_devices(struct file *file, void __user *arg) -{ - struct nvm_ioctl_get_devices *devices; - struct nvm_dev *dev; - int i = 0; - - devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - struct nvm_ioctl_device_info *info = &devices->info[i]; - - strlcpy(info->devname, dev->name, sizeof(info->devname)); - - /* kept for compatibility */ - info->bmversion[0] = 1; - info->bmversion[1] = 0; - info->bmversion[2] = 0; - strlcpy(info->bmname, "gennvm", sizeof(info->bmname)); - i++; - - if (i >= ARRAY_SIZE(devices->info)) { - pr_err("max %zd devices can be reported.\n", - ARRAY_SIZE(devices->info)); - break; - } - } - up_write(&nvm_lock); - - devices->nr_devices = i; - - if (copy_to_user(arg, devices, - sizeof(struct nvm_ioctl_get_devices))) { - kfree(devices); - return -EFAULT; - } - - kfree(devices); - return 0; -} - -static long nvm_ioctl_dev_create(struct file *file, void __user *arg) -{ - struct nvm_ioctl_create create; - - if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) - return -EFAULT; - - if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && - create.conf.e.rsv != 0) { - pr_err("reserved config field in use\n"); - return -EINVAL; - } - - create.dev[DISK_NAME_LEN - 1] = '\0'; - create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; - create.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (create.flags != 0) { - __u32 flags = create.flags; - - /* Check for valid flags */ - if (flags & NVM_TARGET_FACTORY) - flags &= ~NVM_TARGET_FACTORY; - - if (flags) { - pr_err("flag not supported\n"); - return -EINVAL; - } - } - - return __nvm_configure_create(&create); -} - -static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) -{ - struct nvm_ioctl_remove remove; - - if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove))) - return -EFAULT; - - remove.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (remove.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return nvm_remove_tgt(&remove); -} - -/* kept for compatibility reasons */ -static long nvm_ioctl_dev_init(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_init init; - - if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) - return -EFAULT; - - if (init.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return 0; -} - -/* Kept for compatibility reasons */ -static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_factory fact; - - if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) - return -EFAULT; - - fact.dev[DISK_NAME_LEN - 1] = '\0'; - - if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) - return -EINVAL; - - return 0; -} - -static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case NVM_INFO: - return nvm_ioctl_info(file, argp); - case NVM_GET_DEVICES: - return nvm_ioctl_get_devices(file, argp); - case NVM_DEV_CREATE: - return nvm_ioctl_dev_create(file, argp); - case NVM_DEV_REMOVE: - return nvm_ioctl_dev_remove(file, argp); - case NVM_DEV_INIT: - return nvm_ioctl_dev_init(file, argp); - case NVM_DEV_FACTORY: - return nvm_ioctl_dev_factory(file, argp); - } - return 0; -} - -static const 
struct file_operations _ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = nvm_ctl_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice _nvm_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lightnvm", - .nodename = "lightnvm/control", - .fops = &_ctl_fops, -}; -builtin_misc_device(_nvm_misc); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c deleted file mode 100644 index f185f1a00008..000000000000 --- a/drivers/lightnvm/pblk-cache.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-cache.c - pblk's write cache - */ - -#include "pblk.h" - -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags) -{ - struct pblk_w_ctx w_ctx; - sector_t lba = pblk_get_lba(bio); - unsigned long start_time; - unsigned int bpos, pos; - int nr_entries = pblk_get_secs(bio); - int i, ret; - - start_time = bio_start_io_acct(bio); - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. - */ -retry: - ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); - switch (ret) { - case NVM_IO_REQUEUE: - io_schedule(); - goto retry; - case NVM_IO_ERR: - pblk_pipeline_stop(pblk); - bio_io_error(bio); - goto out; - } - - pblk_ppa_set_empty(&w_ctx.ppa); - w_ctx.flags = flags; - if (bio->bi_opf & REQ_PREFLUSH) { - w_ctx.flags |= PBLK_FLUSH_ENTRY; - pblk_write_kick(pblk); - } - - if (unlikely(!bio_has_data(bio))) - goto out; - - for (i = 0; i < nr_entries; i++) { - void *data = bio_data(bio); - - w_ctx.lba = lba + i; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i); - pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos); - - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - - atomic64_add(nr_entries, &pblk->user_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_entries, &pblk->inflight_writes); - atomic_long_add(nr_entries, &pblk->req_writes); -#endif - - pblk_rl_inserted(&pblk->rl, nr_entries); - -out: - bio_end_io_acct(bio, start_time); - pblk_write_should_kick(pblk); - - if (ret == NVM_IO_DONE) - bio_endio(bio); -} - -/* - * On GC the incoming lbas are not necessarily sequential. Also, some of the - * lbas might not be valid entries, which are marked as empty by the GC thread - */ -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct pblk_w_ctx w_ctx; - unsigned int bpos, pos; - void *data = gc_rq->data; - int i, valid_entries; - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. 
- */ -retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { - io_schedule(); - goto retry; - } - - w_ctx.flags = PBLK_IOTYPE_GC; - pblk_ppa_set_empty(&w_ctx.ppa); - - for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { - if (gc_rq->lba_list[i] == ADDR_EMPTY) - continue; - - w_ctx.lba = gc_rq->lba_list[i]; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, - gc_rq->paddr_list[i], pos); - - data += PBLK_EXPOSED_PAGE_SIZE; - valid_entries++; - } - - WARN_ONCE(gc_rq->secs_to_gc != valid_entries, - "pblk: inconsistent GC write\n"); - - atomic64_add(valid_entries, &pblk->gc_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_entries, &pblk->inflight_writes); - atomic_long_add(valid_entries, &pblk->recov_gc_writes); -#endif - - pblk_write_should_kick(pblk); - return NVM_IO_OK; -} diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c deleted file mode 100644 index 33d39d3dd343..000000000000 --- a/drivers/lightnvm/pblk-core.c +++ /dev/null @@ -1,2151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-core.c - pblk's core functionality - * - */ - -#define CREATE_TRACE_POINTS - -#include "pblk.h" -#include "pblk-trace.h" - -static void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_chunk_meta(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, *ppa); - pos = pblk_ppa_to_pos(&dev->geo, *ppa); - - pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, - struct ppa_addr ppa_addr) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa; - int pos = pblk_ppa_to_pos(geo, ppa_addr); - - pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); - atomic_long_inc(&pblk->erase_failed); - - atomic_dec(&line->blk_in_line); - if (test_and_set_bit(pos, line->blk_bitmap)) - pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", - line->id, pos); - - /* Not necessary to mark bad blocks on 2.0 spec. 
*/ - if (geo->version == NVM_OCSSD_SPEC_20) - return; - - ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC); - if (!ppa) - return; - - *ppa = ppa_addr; - pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, - GFP_ATOMIC, pblk->bb_wq); -} - -static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, rqd->ppa_addr); - pos = pblk_ppa_to_pos(geo, rqd->ppa_addr); - chunk = &line->chks[pos]; - - atomic_dec(&line->left_seblks); - - if (rqd->error) { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_FAILED); - - chunk->state = NVM_CHK_ST_OFFLINE; - pblk_mark_bb(pblk, line, rqd->ppa_addr); - } else { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_DONE); - - chunk->state = NVM_CHK_ST_FREE; - } - - trace_pblk_chunk_state(pblk_disk_name(pblk), &rqd->ppa_addr, - chunk->state); - - atomic_dec(&pblk->inflight_io); -} - -/* Erase completion assumes that only one block is erased at a time */ -static void pblk_end_io_erase(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - - __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, &pblk->e_rq_pool); -} - -/* - * Get information for all chunks from the device. - * - * The caller is responsible for freeing (vmalloc) the returned structure - */ -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *meta; - struct ppa_addr ppa; - unsigned long len; - int ret; - - ppa.ppa = 0; - - len = geo->all_chunks * sizeof(*meta); - meta = vzalloc(len); - if (!meta) - return ERR_PTR(-ENOMEM); - - ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta); - if (ret) { - vfree(meta); - return ERR_PTR(-EIO); - } - - return meta; -} - -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *meta, - struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ch_off = ppa.m.grp * geo->num_chk * geo->num_lun; - int lun_off = ppa.m.pu * geo->num_chk; - int chk_off = ppa.m.chk; - - return meta + ch_off + lun_off + chk_off; -} - -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - - /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P - * table is modified with reclaimed sectors, a check is done to ensure - * that newer updates are not overwritten.
- */ - spin_lock(&line->lock); - WARN_ON(line->state == PBLK_LINESTATE_FREE); - - if (test_and_set_bit(paddr, line->invalid_bitmap)) { - WARN_ONCE(1, "pblk: double invalidate\n"); - spin_unlock(&line->lock); - return; - } - le32_add_cpu(line->vsc, -1); - - if (line->state == PBLK_LINESTATE_CLOSED) - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - if (move_list) { - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC) { - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - return; - } - spin_unlock(&line->lock); - - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - } -} - -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - u64 paddr; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); - BUG_ON(pblk_ppa_empty(ppa)); -#endif - - line = pblk_ppa_to_line(pblk, ppa); - paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); - - __pblk_map_invalidate(pblk, line, paddr); -} - -static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, - unsigned int nr_secs) -{ - sector_t lba; - - spin_lock(&pblk->trans_lock); - for (lba = slba; lba < slba + nr_secs; lba++) { - struct ppa_addr ppa; - - ppa = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); - - pblk_ppa_set_empty(&ppa); - pblk_trans_map_set(pblk, lba, ppa); - } - spin_unlock(&pblk->trans_lock); -} - -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_meta_list); - if (!rqd->meta_list) - return -ENOMEM; - - if (rqd->nr_ppas == 1) - return 0; - - rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk); - rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk); - - return 0; -} - -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - if (rqd->meta_list) - nvm_dev_dma_free(dev->parent, rqd->meta_list, - rqd->dma_meta_list); -} - -/* Caller must guarantee that the request is a valid type */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) -{ - mempool_t *pool; - struct nvm_rq *rqd; - int rq_size; - - switch (type) { - case PBLK_WRITE: - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - rq_size = pblk_w_rq_size; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - rq_size = pblk_g_rq_size; - break; - default: - pool = &pblk->e_rq_pool; - rq_size = pblk_g_rq_size; - } - - rqd = mempool_alloc(pool, GFP_KERNEL); - memset(rqd, 0, rq_size); - - return rqd; -} - -/* Typically used on completion path. 
Cannot guarantee request consistency */ -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) -{ - mempool_t *pool; - - switch (type) { - case PBLK_WRITE: - kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); - fallthrough; - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - break; - case PBLK_ERASE: - pool = &pblk->e_rq_pool; - break; - default: - pblk_err(pblk, "trying to free unknown rqd type\n"); - return; - } - - pblk_free_rqd_meta(pblk, rqd); - mempool_free(rqd, pool); -} - -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages) -{ - struct bio_vec *bv; - struct page *page; - int i, e, nbv = 0; - - for (i = 0; i < bio->bi_vcnt; i++) { - bv = &bio->bi_io_vec[i]; - page = bv->bv_page; - for (e = 0; e < bv->bv_len; e += PBLK_EXPOSED_PAGE_SIZE, nbv++) - if (nbv >= off) - mempool_free(page++, &pblk->page_bio_pool); - } -} - -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages) -{ - struct request_queue *q = pblk->dev->q; - struct page *page; - int i, ret; - - for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(&pblk->page_bio_pool, flags); - - ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); - if (ret != PBLK_EXPOSED_PAGE_SIZE) { - pblk_err(pblk, "could not add page to bio\n"); - mempool_free(page, &pblk->page_bio_pool); - goto err; - } - } - - return 0; -err: - pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i); - return -1; -} - -void pblk_write_kick(struct pblk *pblk) -{ - wake_up_process(pblk->writer_ts); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); -} - -void pblk_write_timer_fn(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, wtimer); - - /* kick the write thread every tick to flush outstanding data */ - pblk_write_kick(pblk); -} - -void pblk_write_should_kick(struct pblk *pblk) -{ - unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); - - if (secs_avail >= pblk->min_write_pgs_data) - pblk_write_kick(pblk); -} - -static void pblk_wait_for_meta(struct pblk *pblk) -{ - do { - if (!atomic_read(&pblk->inflight_io)) - break; - - schedule(); - } while (1); -} - -static void pblk_flush_writer(struct pblk *pblk) -{ - pblk_rb_flush(&pblk->rwb); - do { - if (!pblk_rb_sync_count(&pblk->rwb)) - break; - - pblk_write_kick(pblk); - schedule(); - } while (1); -} - -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data) - * (pblk->min_write_pgs - pblk->min_write_pgs_data); - int vsc = le32_to_cpu(*line->vsc) + packed_meta; - - lockdep_assert_held(&line->lock); - - if (line->w_err_gc->has_write_err) { - if (line->gc_group != PBLK_LINEGC_WERR) { - line->gc_group = PBLK_LINEGC_WERR; - move_list = &l_mg->gc_werr_list; - pblk_rl_werr_line_in(&pblk->rl); - } - } else if (!vsc) { - if (line->gc_group != PBLK_LINEGC_FULL) { - line->gc_group = PBLK_LINEGC_FULL; - move_list = &l_mg->gc_full_list; - } - } else if (vsc < lm->high_thrs) { - if (line->gc_group != PBLK_LINEGC_HIGH) { - line->gc_group = PBLK_LINEGC_HIGH; - move_list = &l_mg->gc_high_list; - } - } else if (vsc < lm->mid_thrs) { - if (line->gc_group != PBLK_LINEGC_MID) { - line->gc_group = PBLK_LINEGC_MID; - move_list = &l_mg->gc_mid_list; - } - } else if (vsc < line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_LOW) { - 
line->gc_group = PBLK_LINEGC_LOW; - move_list = &l_mg->gc_low_list; - } - } else if (vsc == line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_EMPTY) { - line->gc_group = PBLK_LINEGC_EMPTY; - move_list = &l_mg->gc_empty_list; - } - } else { - line->state = PBLK_LINESTATE_CORRUPT; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - line->gc_group = PBLK_LINEGC_NONE; - move_list = &l_mg->corrupt_list; - pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", - line->id, vsc, - line->sec_in_line, - lm->high_thrs, lm->mid_thrs); - } - - return move_list; -} - -void pblk_discard(struct pblk *pblk, struct bio *bio) -{ - sector_t slba = pblk_get_lba(bio); - sector_t nr_secs = pblk_get_secs(bio); - - pblk_invalidate_range(pblk, slba, nr_secs); -} - -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - atomic_long_inc(&pblk->write_failed); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - /* Empty page read is not necessarily an error (e.g., L2P recovery) */ - if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { - atomic_long_inc(&pblk->read_empty); - return; - } - - switch (rqd->error) { - case NVM_RSP_WARN_HIGHECC: - atomic_long_inc(&pblk->read_high_ecc); - break; - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_ERR_FAILCRC: - atomic_long_inc(&pblk->read_failed); - break; - default: - pblk_err(pblk, "unknown read error:%d\n", rqd->error); - } -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) -{ - pblk->sec_per_write = sec_per_write; -} - -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - return nvm_submit_io(dev, rqd, buf); -} - -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - struct ppa_addr *ppa = &ppa_list[i]; - struct nvm_chk_meta *chunk = pblk_dev_ppa_to_chunk(pblk, *ppa); - u64 caddr = pblk_dev_ppa_to_chunk_addr(pblk, *ppa); - - if (caddr == 0) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_OPEN); - else if (caddr == (chunk->cnlb - 1)) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_CLOSED); - } -} - -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - int ret; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - ret = nvm_submit_io_sync(dev, rqd, buf); - - if (trace_pblk_chunk_state_enabled() && !ret && - rqd->opcode == NVM_OP_PWRITE) - pblk_check_chunk_state_update(pblk, rqd); - - return ret; -} - -static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd, - void *buf) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int ret; - - pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io_sync(pblk, rqd, buf); - pblk_up_chunk(pblk, ppa_list[0]); - - return ret; -} - -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta) -{ - int max = pblk->sec_per_write; - int min = pblk->min_write_pgs; - int secs_to_sync = 0; - - if (skip_meta && 
pblk->min_write_pgs_data != pblk->min_write_pgs) - min = max = pblk->min_write_pgs_data; - - if (secs_avail >= max) - secs_to_sync = max; - else if (secs_avail >= min) - secs_to_sync = min * (secs_avail / min); - else if (secs_to_flush) - secs_to_sync = min; - - return secs_to_sync; -} - -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - spin_lock(&line->lock); - addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - line->cur_sec = addr - nr_secs; - - for (i = 0; i < nr_secs; i++, line->cur_sec--) - WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); - spin_unlock(&line->lock); -} - -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - lockdep_assert_held(&line->lock); - - /* logic error: ppa out-of-bounds. Prevent generating bad address */ - if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { - WARN(1, "pblk: page allocation out of bounds\n"); - nr_secs = pblk->lm.sec_per_line - line->cur_sec; - } - - line->cur_sec = addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - for (i = 0; i < nr_secs; i++, line->cur_sec++) - WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap)); - - return addr; -} - -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - - /* Lock needed in case a write fails and a recovery needs to remap - * failed write buffer entries - */ - spin_lock(&line->lock); - addr = __pblk_alloc_page(pblk, line, nr_secs); - line->left_msecs -= nr_secs; - WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n"); - spin_unlock(&line->lock); - - return addr; -} - -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) -{ - u64 paddr; - - spin_lock(&line->lock); - paddr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - spin_unlock(&line->lock); - - return paddr; -} - -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int bit; - - /* This usually only happens on bad lines */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (bit >= lm->blk_per_line) - return -1; - - return bit * geo->ws_opt; -} - -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = pblk_line_smeta_start(pblk, line); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - ret = pblk_submit_io_sync(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - __le64 
addr_empty = cpu_to_le64(ADDR_EMPTY); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PWRITE; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - rqd.meta_list, i); - - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - meta->lba = lba_list[paddr] = addr_empty; - } - - ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - pblk_log_write_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *ppa_list_buf, *meta_list; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = line->emeta_ssec; - dma_addr_t dma_ppa_list, dma_meta_list; - int min = pblk->min_write_pgs; - int left_ppas = lm->emeta_sec[0]; - int line_id = line->id; - int rq_ppas, rq_len; - int i, j; - int ret; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list_buf = meta_list + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - -next_rq: - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - rq_len = rq_ppas * geo->csecs; - - rqd.meta_list = meta_list; - rqd.ppa_list = ppa_list_buf; - rqd.dma_meta_list = dma_meta_list; - rqd.dma_ppa_list = dma_ppa_list; - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = rq_ppas; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < rqd.nr_ppas; ) { - struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line_id); - int pos = pblk_ppa_to_pos(geo, ppa); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd.is_seq = 1; - - while (test_bit(pos, line->blk_bitmap)) { - paddr += min; - if (pblk_boundary_paddr_checks(pblk, paddr)) { - ret = -EINTR; - goto free_rqd_dma; - } - - ppa = addr_to_gen_ppa(pblk, paddr, line_id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - ret = -EINTR; - goto free_rqd_dma; - } - - for (j = 0; j < min; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id); - } - - ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto free_rqd_dma; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - goto free_rqd_dma; - } - - emeta_buf += rq_len; - left_ppas -= rq_ppas; - if (left_ppas) - goto next_rq; - -free_rqd_dma: - nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return ret; -} - -static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr ppa) -{ - rqd->opcode = NVM_OP_ERASE; - rqd->ppa_addr = ppa; - rqd->nr_ppas = 1; - rqd->is_seq = 1; - rqd->bio = NULL; -} - -static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq rqd = {NULL}; - int ret; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), &ppa, - PBLK_CHUNK_RESET_START); - - pblk_setup_e_rq(pblk, &rqd, ppa); - - /* The write 
thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. - */ - ret = pblk_submit_io_sync(pblk, &rqd, NULL); - rqd.private = pblk; - __pblk_end_io_erase(pblk, &rqd); - - return ret; -} - -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr ppa; - int ret, bit = -1; - - /* Erase only good blocks, one at a time */ - do { - spin_lock(&line->lock); - bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, - bit + 1); - if (bit >= lm->blk_per_line) { - spin_unlock(&line->lock); - break; - } - - ppa = pblk->luns[bit].bppa; /* set ch and lun */ - ppa.a.blk = line->id; - - atomic_dec(&line->left_eblks); - WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); - spin_unlock(&line->lock); - - ret = pblk_blk_erase_sync(pblk, ppa); - if (ret) { - pblk_err(pblk, "failed to erase line %d\n", line->id); - return ret; - } - } while (1); - - return 0; -} - -static void pblk_line_setup_metadata(struct pblk_line *line, - struct pblk_line_mgmt *l_mg, - struct pblk_line_meta *lm) -{ - int meta_line; - - lockdep_assert_held(&l_mg->free_lock); - -retry_meta: - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - if (meta_line == PBLK_DATA_LINES) { - spin_unlock(&l_mg->free_lock); - io_schedule(); - spin_lock(&l_mg->free_lock); - goto retry_meta; - } - - set_bit(meta_line, &l_mg->meta_bitmap); - line->meta_line = meta_line; - - line->smeta = l_mg->sline_meta[meta_line]; - line->emeta = l_mg->eline_meta[meta_line]; - - memset(line->smeta, 0, lm->smeta_len); - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. 
- */ -static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, - struct pblk_line *cur) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; - int nr_blk_line; - - /* After erasing the line, new bad blocks might appear and we risk - * having an invalid line - */ - nr_blk_line = lm->blk_per_line - - bitmap_weight(line->blk_bitmap, lm->blk_per_line); - if (nr_blk_line < lm->min_blk_line) { - spin_lock(&l_mg->free_lock); - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - spin_unlock(&l_mg->free_lock); - - pblk_debug(pblk, "line %d is bad\n", line->id); - - return 0; - } - - /* Run-time metadata */ - line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); - - /* Mark LUNs allocated in this line (all for now) */ - bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); - - smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(smeta_buf->header.uuid, &pblk->instance_uuid); - smeta_buf->header.id = cpu_to_le32(line->id); - smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version_major = SMETA_VERSION_MAJOR; - smeta_buf->header.version_minor = SMETA_VERSION_MINOR; - - /* Start metadata */ - smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); - - /* Fill metadata among lines */ - if (cur) { - memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); - smeta_buf->prev_id = cpu_to_le32(cur->id); - cur->emeta->buf->next_id = cpu_to_le32(line->id); - } else { - smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); - } - - /* All smeta must be set at this point */ - smeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); - smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); - - /* End metadata */ - memcpy(&emeta_buf->header, &smeta_buf->header, - sizeof(struct line_header)); - - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - - emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); - emeta_buf->nr_valid_lbas = cpu_to_le64(0); - emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); - emeta_buf->crc = cpu_to_le32(0); - emeta_buf->prev_id = smeta_buf->prev_id; - - return 1; -} - -static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - line->map_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->map_bitmap) - return -ENOMEM; - - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - - /* will be initialized using bb info from map_bitmap */ - line->invalid_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - return -ENOMEM; - } - - return 0; -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. 
- */ -static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, - int init) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - u64 off; - int bit = -1; - int emeta_secs; - - line->sec_in_line = lm->sec_per_line; - - /* Capture bad block information on line mapping bitmaps */ - while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line, - bit + 1)) < lm->blk_per_line) { - off = bit * geo->ws_opt; - bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off, - lm->sec_per_line); - bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, - lm->sec_per_line); - line->sec_in_line -= geo->clba; - } - - /* Mark smeta metadata sectors as bad sectors */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - off = bit * geo->ws_opt; - bitmap_set(line->map_bitmap, off, lm->smeta_sec); - line->sec_in_line -= lm->smeta_sec; - line->cur_sec = off + lm->smeta_sec; - - if (init && pblk_line_smeta_write(pblk, line, off)) { - pblk_debug(pblk, "line smeta I/O failed. Retry\n"); - return 0; - } - - bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); - - /* Mark emeta metadata sectors as bad sectors. We need to consider bad - * blocks to make sure that there are enough sectors to store emeta - */ - emeta_secs = lm->emeta_sec[0]; - off = lm->sec_per_line; - while (emeta_secs) { - off -= geo->ws_opt; - if (!test_bit(off, line->invalid_bitmap)) { - bitmap_set(line->invalid_bitmap, off, geo->ws_opt); - emeta_secs -= geo->ws_opt; - } - } - - line->emeta_ssec = off; - line->sec_in_line -= lm->emeta_sec[0]; - line->nr_valid_lbas = 0; - line->left_msecs = line->sec_in_line; - *line->vsc = cpu_to_le32(line->sec_in_line); - - if (lm->sec_per_line - line->sec_in_line != - bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - pblk_err(pblk, "unexpected line %d is bad\n", line->id); - - return 0; - } - - return 1; -} - -static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int blk_to_erase = atomic_read(&line->blk_in_line); - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - /* Free chunks should not be erased */ - if (state & NVM_CHK_ST_FREE) { - set_bit(pblk_ppa_to_pos(geo, rlun->bppa), - line->erase_bitmap); - blk_to_erase--; - } - } - - return blk_to_erase; -} - -static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = atomic_read(&line->blk_in_line); - int blk_to_erase; - - /* Bad blocks do not need to be erased */ - bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); - - spin_lock(&line->lock); - - /* If we have not written to this line, we need to mark up free chunks - * as already erased - */ - if (line->state == PBLK_LINESTATE_NEW) { - blk_to_erase = pblk_prepare_new_line(pblk, line); - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } else { - blk_to_erase = blk_in_line; - } - - if (blk_in_line < lm->min_blk_line) { - 
spin_unlock(&line->lock); - return -EAGAIN; - } - - if (line->state != PBLK_LINESTATE_FREE) { - WARN(1, "pblk: corrupted line %d, state %d\n", - line->id, line->state); - spin_unlock(&line->lock); - return -EINTR; - } - - line->state = PBLK_LINESTATE_OPEN; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - atomic_set(&line->left_eblks, blk_to_erase); - atomic_set(&line->left_seblks, blk_to_erase); - - line->meta_distance = lm->meta_distance; - spin_unlock(&line->lock); - - kref_init(&line->ref); - atomic_set(&line->sec_to_update, 0); - - return 0; -} - -/* Line allocations in the recovery path are always single threaded */ -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - l_mg->data_line = line; - list_del(&line->list); - - ret = pblk_line_prepare(pblk, line); - if (ret) { - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - return ret; - } - spin_unlock(&l_mg->free_lock); - - ret = pblk_line_alloc_bitmaps(pblk, line); - if (ret) - goto fail; - - if (!pblk_line_init_bb(pblk, line, 0)) { - ret = -EINTR; - goto fail; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - return 0; - -fail: - spin_lock(&l_mg->free_lock); - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - - return ret; -} - -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -static void pblk_line_reinit(struct pblk_line *line) -{ - *line->vsc = cpu_to_le32(EMPTY_ENTRY); - - line->map_bitmap = NULL; - line->invalid_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -void pblk_line_free(struct pblk_line *line) -{ - struct pblk *pblk = line->pblk; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - mempool_free(line->invalid_bitmap, l_mg->bitmap_pool); - - pblk_line_reinit(line); -} - -struct pblk_line *pblk_line_get(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line; - int ret, bit; - - lockdep_assert_held(&l_mg->free_lock); - -retry: - if (list_empty(&l_mg->free_list)) { - pblk_err(pblk, "no free lines\n"); - return NULL; - } - - line = list_first_entry(&l_mg->free_list, struct pblk_line, list); - list_del(&line->list); - l_mg->nr_free_lines--; - - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (unlikely(bit >= lm->blk_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - - pblk_debug(pblk, "line %d is bad\n", line->id); - goto retry; - } - - ret = pblk_line_prepare(pblk, line); - if (ret) { - switch (ret) { - case -EAGAIN: - list_add(&line->list, &l_mg->bad_list); - goto retry; - case -EINTR: - list_add(&line->list, &l_mg->corrupt_list); - goto retry; - default: - pblk_err(pblk, "failed to prepare line %d\n", line->id); - list_add(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - return NULL; - } - } - - return line; -} - -static struct pblk_line *pblk_line_retry(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *retry_line; - -retry: - 
spin_lock(&l_mg->free_lock); - retry_line = pblk_line_get(pblk); - if (!retry_line) { - l_mg->data_line = NULL; - spin_unlock(&l_mg->free_lock); - return NULL; - } - - retry_line->map_bitmap = line->map_bitmap; - retry_line->invalid_bitmap = line->invalid_bitmap; - retry_line->smeta = line->smeta; - retry_line->emeta = line->emeta; - retry_line->meta_line = line->meta_line; - - pblk_line_reinit(line); - - l_mg->data_line = retry_line; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_dec(&pblk->rl, line, false); - - if (pblk_line_erase(pblk, retry_line)) - goto retry; - - return retry_line; -} - -static void pblk_set_space_limit(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - - atomic_set(&rl->rb_space, 0); -} - -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - - spin_lock(&l_mg->free_lock); - line = pblk_line_get(pblk); - if (!line) { - spin_unlock(&l_mg->free_lock); - return NULL; - } - - line->seq_nr = l_mg->d_seq_nr++; - line->type = PBLK_LINETYPE_DATA; - l_mg->data_line = line; - - pblk_line_setup_metadata(line, l_mg, &pblk->lm); - - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. - * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_set_space_limit(pblk); - - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - - if (pblk_line_alloc_bitmaps(pblk, line)) - return NULL; - - if (pblk_line_erase(pblk, line)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - } - -retry_setup: - if (!pblk_line_init_metadata(pblk, line, NULL)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, line, 1)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - - return line; -} - -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - - line = pblk_ppa_to_line(pblk, ppa); - kref_put(&line->ref, pblk_line_put_wq); -} - -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int i; - - for (i = 0; i < rqd->nr_ppas; i++) - pblk_ppa_to_line_put(pblk, ppa_list[i]); -} - -static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) -{ - lockdep_assert_held(&pblk->l_mg.free_lock); - - pblk_set_space_limit(pblk); - pblk->state = PBLK_STATE_STOPPING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); -} - -static void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct pblk_emeta *emeta = line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pblk_err(pblk, "sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - 
pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - -void __pblk_pipeline_flush(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - if (pblk->state == PBLK_STATE_RECOVERING || - pblk->state == PBLK_STATE_STOPPED) { - spin_unlock(&l_mg->free_lock); - return; - } - pblk->state = PBLK_STATE_RECOVERING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - spin_unlock(&l_mg->free_lock); - - pblk_flush_writer(pblk); - pblk_wait_for_meta(pblk); - - ret = pblk_recov_pad(pblk); - if (ret) { - pblk_err(pblk, "could not close data on teardown(%d)\n", ret); - return; - } - - flush_workqueue(pblk->bb_wq); - pblk_line_close_meta_sync(pblk); -} - -void __pblk_pipeline_stop(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - spin_lock(&l_mg->free_lock); - pblk->state = PBLK_STATE_STOPPED; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - l_mg->data_line = NULL; - l_mg->data_next = NULL; - spin_unlock(&l_mg->free_lock); -} - -void pblk_pipeline_stop(struct pblk *pblk) -{ - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); -} - -struct pblk_line *pblk_line_replace_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new = NULL; - unsigned int left_seblks; - - new = l_mg->data_next; - if (!new) - goto out; - - spin_lock(&l_mg->free_lock); - cur = l_mg->data_line; - l_mg->data_line = new; - - pblk_line_setup_metadata(new, l_mg, &pblk->lm); - spin_unlock(&l_mg->free_lock); - -retry_erase: - left_seblks = atomic_read(&new->left_seblks); - if (left_seblks) { - /* If line is not fully erased, erase it */ - if (atomic_read(&new->left_eblks)) { - if (pblk_line_erase(pblk, new)) - goto out; - } else { - io_schedule(); - } - goto retry_erase; - } - - if (pblk_line_alloc_bitmaps(pblk, new)) - return NULL; - -retry_setup: - if (!pblk_line_init_metadata(pblk, new, cur)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, new, 1)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, new, true); - - /* Allocate next line for preparation */ - spin_lock(&l_mg->free_lock); - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. 
- * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_stop_writes(pblk, new); - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - -out: - return new; -} - -static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - if (line->w_err_gc->has_gc_err) { - spin_unlock(&line->lock); - pblk_err(pblk, "line %d had errors during GC\n", line->id); - pblk_put_line_back(pblk, line); - line->w_err_gc->has_gc_err = 0; - return; - } - - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - line->gc_group = PBLK_LINEGC_NONE; - pblk_line_free(line); - - if (line->w_err_gc->has_write_err) { - pblk_rl_werr_line_out(&pblk->rl); - line->w_err_gc->has_write_err = 0; - } - - spin_unlock(&line->lock); - atomic_dec(&gc->pipeline_gc); - - spin_lock(&l_mg->free_lock); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_inc(&pblk->rl, line); -} - -static void pblk_line_put_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_put_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = line_put_ws->pblk; - struct pblk_line *line = line_put_ws->line; - - __pblk_line_put(pblk, line); - mempool_free(line_put_ws, &pblk->gen_ws_pool); -} - -void pblk_line_put(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - - __pblk_line_put(pblk, line); -} - -void pblk_line_put_wq(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - struct pblk_line_ws *line_put_ws; - - line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC); - if (!line_put_ws) - return; - - line_put_ws->pblk = pblk; - line_put_ws->line = line; - line_put_ws->priv = NULL; - - INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); - queue_work(pblk->r_end_wq, &line_put_ws->ws); -} - -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq *rqd; - int err; - - rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); - - pblk_setup_e_rq(pblk, rqd, ppa); - - rqd->end_io = pblk_end_io_erase; - rqd->private = pblk; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &ppa, PBLK_CHUNK_RESET_START); - - /* The write thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. 
- */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - pblk_err(pblk, "could not async erase line:%d,blk:%d\n", - pblk_ppa_to_line_id(ppa), - pblk_ppa_to_pos(geo, ppa)); - } - - return err; -} - -struct pblk_line *pblk_line_get_data(struct pblk *pblk) -{ - return pblk->l_mg.data_line; -} - -/* For now, always erase next line */ -struct pblk_line *pblk_line_get_erase(struct pblk *pblk) -{ - return pblk->l_mg.data_next; -} - -int pblk_line_is_full(struct pblk_line *line) -{ - return (line->left_msecs == 0); -} - -static void pblk_line_should_sync_meta(struct pblk *pblk) -{ - if (pblk_rl_is_limit(&pblk->rl)) - pblk_line_close_meta_sync(pblk); -} - -void pblk_line_close(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - int i; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), - "pblk: corrupt closed line %d\n", line->id); -#endif - - spin_lock(&l_mg->free_lock); - WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_OPEN); - line->state = PBLK_LINESTATE_CLOSED; - move_list = pblk_line_gc_list(pblk, line); - list_add_tail(&line->list, move_list); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - if (!(state & NVM_CHK_ST_OFFLINE)) - state = NVM_CHK_ST_CLOSED; - } - - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); -} - -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct wa_counters *wa = emeta_to_wa(lm, emeta_buf); - - /* No need for exact vsc value; avoid a big line lock and take approx.
*/ - memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); - memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); - - wa->user = cpu_to_le64(atomic64_read(&pblk->user_wa)); - wa->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa)); - wa->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa)); - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { - emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(emeta_buf->header.uuid, &pblk->instance_uuid); - emeta_buf->header.id = cpu_to_le32(line->id); - emeta_buf->header.type = cpu_to_le16(line->type); - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - } - - emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); - emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); - - spin_lock(&l_mg->close_lock); - spin_lock(&line->lock); - - /* Update the in-memory start address for emeta, in case it has - * shifted due to write errors - */ - if (line->emeta_ssec != line->cur_sec) - line->emeta_ssec = line->cur_sec; - - list_add_tail(&line->list, &l_mg->emeta_list); - spin_unlock(&line->lock); - spin_unlock(&l_mg->close_lock); - - pblk_line_should_sync_meta(pblk); -} - -static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - struct pblk_emeta *emeta = line->emeta; - - w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), - lba_list_size); -} - -void pblk_line_close_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - /* Write errors make the emeta start address stored in smeta invalid, - * so keep a copy of the lba list until we've gc'd the line - */ - if (w_err_gc->has_write_err) - pblk_save_lba_list(pblk, line); - - pblk_line_close(pblk, line); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq) -{ - struct pblk_line_ws *line_ws; - - line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); - if (!line_ws) { - pblk_err(pblk, "pblk: could not allocate memory\n"); - return; - } - - line_ws->pblk = pblk; - line_ws->line = line; - line_ws->priv = priv; - - INIT_WORK(&line_ws->ws, work); - queue_work(wq, &line_ws->ws); -} - -static void __pblk_down_chunk(struct pblk *pblk, int pos) -{ - struct pblk_lun *rlun = &pblk->luns[pos]; - int ret; - - /* - * Only send one inflight I/O per LUN.
-	 * granularity, all ppas in the I/O will map to the same LUN
-	 */
-
-	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
-	if (ret == -ETIME || ret == -EINTR)
-		pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
-				-ret);
-}
-
-void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa)
-{
-	struct nvm_tgt_dev *dev = pblk->dev;
-	struct nvm_geo *geo = &dev->geo;
-	int pos = pblk_ppa_to_pos(geo, ppa);
-
-	__pblk_down_chunk(pblk, pos);
-}
-
-void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa,
-		  unsigned long *lun_bitmap)
-{
-	struct nvm_tgt_dev *dev = pblk->dev;
-	struct nvm_geo *geo = &dev->geo;
-	int pos = pblk_ppa_to_pos(geo, ppa);
-
-	/* If the LUN has been locked for this same request, do not attempt to
-	 * lock it again
-	 */
-	if (test_and_set_bit(pos, lun_bitmap))
-		return;
-
-	__pblk_down_chunk(pblk, pos);
-}
-
-void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa)
-{
-	struct nvm_tgt_dev *dev = pblk->dev;
-	struct nvm_geo *geo = &dev->geo;
-	struct pblk_lun *rlun;
-	int pos = pblk_ppa_to_pos(geo, ppa);
-
-	rlun = &pblk->luns[pos];
-	up(&rlun->wr_sem);
-}
-
-void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap)
-{
-	struct nvm_tgt_dev *dev = pblk->dev;
-	struct nvm_geo *geo = &dev->geo;
-	struct pblk_lun *rlun;
-	int num_lun = geo->all_luns;
-	int bit = -1;
-
-	while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) {
-		rlun = &pblk->luns[bit];
-		up(&rlun->wr_sem);
-	}
-}
-
-void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
-{
-	struct ppa_addr ppa_l2p;
-
-	/* logic error: lba out-of-bounds. Ignore update */
-	if (!(lba < pblk->capacity)) {
-		WARN(1, "pblk: corrupted L2P map request\n");
-		return;
-	}
-
-	spin_lock(&pblk->trans_lock);
-	ppa_l2p = pblk_trans_map_get(pblk, lba);
-
-	if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p))
-		pblk_map_invalidate(pblk, ppa_l2p);
-
-	pblk_trans_map_set(pblk, lba, ppa);
-	spin_unlock(&pblk->trans_lock);
-}
-
-void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
-{
-
-#ifdef CONFIG_NVM_PBLK_DEBUG
-	/* Callers must ensure that the ppa points to a cache address */
-	BUG_ON(!pblk_addr_in_cache(ppa));
-	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
-#endif
-
-	pblk_update_map(pblk, lba, ppa);
-}
-
-int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
-		       struct pblk_line *gc_line, u64 paddr_gc)
-{
-	struct ppa_addr ppa_l2p, ppa_gc;
-	int ret = 1;
-
-#ifdef CONFIG_NVM_PBLK_DEBUG
-	/* Callers must ensure that the ppa points to a cache address */
-	BUG_ON(!pblk_addr_in_cache(ppa_new));
-	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
-#endif
-
-	/* logic error: lba out-of-bounds. 
Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return 0; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { - spin_lock(&gc_line->lock); - WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), - "pblk: corrupted GC update"); - spin_unlock(&gc_line->lock); - - ret = 0; - goto out; - } - - pblk_trans_map_set(pblk, lba, ppa_new); -out: - spin_unlock(&pblk->trans_lock); - return ret; -} - -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) -{ - struct ppa_addr ppa_l2p; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa_mapped)); -#endif - /* Invalidate and discard padded entries */ - if (lba == ADDR_EMPTY) { - atomic64_inc(&pblk->pad_wa); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->padded_wb); -#endif - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - return; - } - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - /* Do not update L2P if the cacheline has been updated. In this case, - * the mapped ppa must be invalidated - */ - if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - goto out; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); -#endif - - pblk_trans_map_set(pblk, lba, ppa_mapped); -out: - spin_unlock(&pblk->trans_lock); -} - -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache) -{ - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - struct ppa_addr ppa; - - ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); - - /* If the L2P entry maps to a line, the reference is valid */ - if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { - struct pblk_line *line = pblk_ppa_to_line(pblk, ppa); - - if (i > 0 && *from_cache) - break; - *from_cache = false; - - kref_get(&line->ref); - } else { - if (i > 0 && !*from_cache) - break; - *from_cache = true; - } - } - spin_unlock(&pblk->trans_lock); - return i; -} - -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs) -{ - u64 lba; - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - lba = lba_list[i]; - if (lba != ADDR_EMPTY) { - /* logic error: lba out-of-bounds. 
Ignore update */
-			if (!(lba < pblk->capacity)) {
-				WARN(1, "pblk: corrupted L2P map request\n");
-				continue;
-			}
-			ppas[i] = pblk_trans_map_get(pblk, lba);
-		}
-	}
-	spin_unlock(&pblk->trans_lock);
-}
-
-void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd)
-{
-	void *buffer;
-
-	if (pblk_is_oob_meta_supported(pblk)) {
-		/* Just use OOB metadata buffer as always */
-		buffer = rqd->meta_list;
-	} else {
-		/* We need to reuse the last page of the request (packed
-		 * metadata) in a similar way as traditional oob metadata
-		 */
-		buffer = page_to_virt(
-			rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
-	}
-
-	return buffer;
-}
-
-void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
-{
-	void *meta_list = rqd->meta_list;
-	void *page;
-	int i = 0;
-
-	if (pblk_is_oob_meta_supported(pblk))
-		return;
-
-	page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
-	/* We need to fill oob meta buffer with data from packed metadata */
-	for (; i < rqd->nr_ppas; i++)
-		memcpy(pblk_get_meta(pblk, meta_list, i),
-			page + (i * sizeof(struct pblk_sec_meta)),
-			sizeof(struct pblk_sec_meta));
-}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
deleted file mode 100644
index b31658be35a7..000000000000
--- a/drivers/lightnvm/pblk-gc.c
+++ /dev/null
@@ -1,726 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2016 CNEX Labs
- * Initial release: Javier Gonzalez
- *                  Matias Bjorling
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * pblk-gc.c - pblk's garbage collector
- */
-
-#include "pblk.h"
-#include "pblk-trace.h"
-#include <linux/delay.h>
-
-
-static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
-{
-	vfree(gc_rq->data);
-	kfree(gc_rq);
-}
-
-static int pblk_gc_write(struct pblk *pblk)
-{
-	struct pblk_gc *gc = &pblk->gc;
-	struct pblk_gc_rq *gc_rq, *tgc_rq;
-	LIST_HEAD(w_list);
-
-	spin_lock(&gc->w_lock);
-	if (list_empty(&gc->w_list)) {
-		spin_unlock(&gc->w_lock);
-		return 1;
-	}
-
-	list_cut_position(&w_list, &gc->w_list, gc->w_list.prev);
-	gc->w_entries = 0;
-	spin_unlock(&gc->w_lock);
-
-	list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
-		pblk_write_gc_to_cache(pblk, gc_rq);
-		list_del(&gc_rq->list);
-		kref_put(&gc_rq->line->ref, pblk_line_put);
-		pblk_gc_free_gc_rq(gc_rq);
-	}
-
-	return 0;
-}
-
-static void pblk_gc_writer_kick(struct pblk_gc *gc)
-{
-	wake_up_process(gc->gc_writer_ts);
-}
-
-void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
-{
-	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
-	struct list_head *move_list;
-
-	spin_lock(&l_mg->gc_lock);
-	spin_lock(&line->lock);
-	WARN_ON(line->state != PBLK_LINESTATE_GC);
-	line->state = PBLK_LINESTATE_CLOSED;
-	trace_pblk_line_state(pblk_disk_name(pblk), line->id,
-			line->state);
-
-	/* We need to reset gc_group in order to ensure that
-	 * pblk_line_gc_list will return the proper move_list,
-	 * since right now the current line is not on any of the
-	 * gc lists.
- */ - line->gc_group = PBLK_LINEGC_NONE; - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - list_add_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); -} - -static void pblk_gc_line_ws(struct work_struct *work) -{ - struct pblk_line_ws *gc_rq_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = gc_rq_ws->pblk; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq_ws->line; - struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; - int ret; - - up(&gc->gc_sem); - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) { - line->w_err_gc->has_gc_err = 1; - goto out; - } - - if (!gc_rq->secs_to_gc) - goto out; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - kfree(gc_rq_ws); - return; - -out: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - kfree(gc_rq_ws); -} - -static __le64 *get_lba_list_from_emeta(struct pblk *pblk, - struct pblk_line *line) -{ - struct line_emeta *emeta_buf; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - __le64 *lba_list; - int ret; - - emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta_buf) - return NULL; - - ret = pblk_line_emeta_read(pblk, line, emeta_buf); - if (ret) { - pblk_err(pblk, "line %d read emeta failed (%d)\n", - line->id, ret); - kvfree(emeta_buf); - return NULL; - } - - /* If this read fails, it means that emeta is corrupted. - * For now, leave the line untouched. - * TODO: Implement a recovery routine that scans and moves - * all sectors on the line. 
- */ - - ret = pblk_recov_check_emeta(pblk, emeta_buf); - if (ret) { - pblk_err(pblk, "inconsistent emeta (line %d)\n", - line->id); - kvfree(emeta_buf); - return NULL; - } - - lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - - if (lba_list) - memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); - - kvfree(emeta_buf); - - return lba_list; -} - -static void pblk_gc_line_prepare_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *gc_rq_ws; - struct pblk_gc_rq *gc_rq; - __le64 *lba_list; - unsigned long *invalid_bitmap; - int sec_left, nr_secs, bit; - - invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!invalid_bitmap) - goto fail_free_ws; - - if (line->w_err_gc->has_write_err) { - lba_list = line->w_err_gc->lba_list; - line->w_err_gc->lba_list = NULL; - } else { - lba_list = get_lba_list_from_emeta(pblk, line); - if (!lba_list) { - pblk_err(pblk, "could not interpret emeta (line %d)\n", - line->id); - goto fail_free_invalid_bitmap; - } - } - - spin_lock(&line->lock); - bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); - sec_left = pblk_line_vsc(line); - spin_unlock(&line->lock); - - if (sec_left < 0) { - pblk_err(pblk, "corrupted GC line (%d)\n", line->id); - goto fail_free_lba_list; - } - - bit = -1; -next_rq: - gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); - if (!gc_rq) - goto fail_free_lba_list; - - nr_secs = 0; - do { - bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, - bit + 1); - if (bit > line->emeta_ssec) - break; - - gc_rq->paddr_list[nr_secs] = bit; - gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); - } while (nr_secs < pblk->max_write_pgs); - - if (unlikely(!nr_secs)) { - kfree(gc_rq); - goto out; - } - - gc_rq->nr_secs = nr_secs; - gc_rq->line = line; - - gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); - if (!gc_rq->data) - goto fail_free_gc_rq; - - gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!gc_rq_ws) - goto fail_free_gc_data; - - gc_rq_ws->pblk = pblk; - gc_rq_ws->line = line; - gc_rq_ws->priv = gc_rq; - - /* The write GC path can be much slower than the read GC one due to - * the budget imposed by the rate-limiter. Balance in case that we get - * back pressure from the write GC path. - */ - while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) - io_schedule(); - - kref_get(&line->ref); - - INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); - - sec_left -= nr_secs; - if (sec_left > 0) - goto next_rq; - -out: - kvfree(lba_list); - kfree(line_ws); - kfree(invalid_bitmap); - - kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->read_inflight_gc); - - return; - -fail_free_gc_data: - vfree(gc_rq->data); -fail_free_gc_rq: - kfree(gc_rq); -fail_free_lba_list: - kvfree(lba_list); -fail_free_invalid_bitmap: - kfree(invalid_bitmap); -fail_free_ws: - kfree(line_ws); - - /* Line goes back to closed state, so we cannot release additional - * reference for line, since we do that only when we want to do - * gc to free line state transition. 
- */ - pblk_put_line_back(pblk, line); - atomic_dec(&gc->read_inflight_gc); - - pblk_err(pblk, "failed to GC line %d\n", line->id); -} - -static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *line_ws; - - pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); - - line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!line_ws) - return -ENOMEM; - - line_ws->pblk = pblk; - line_ws->line = line; - - atomic_inc(&gc->pipeline_gc); - INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); - queue_work(gc->gc_reader_wq, &line_ws->ws); - - return 0; -} - -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - -static void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - -static int pblk_gc_read(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - spin_lock(&gc->r_lock); - if (list_empty(&gc->r_list)) { - spin_unlock(&gc->r_lock); - return 1; - } - - line = list_first_entry(&gc->r_list, struct pblk_line, list); - list_del(&line->list); - spin_unlock(&gc->r_lock); - - pblk_gc_kick(pblk); - - if (pblk_gc_line(pblk, line)) { - pblk_err(pblk, "failed to GC line %d\n", line->id); - /* rollback */ - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - } - - return 0; -} - -static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, - struct list_head *group_list) -{ - struct pblk_line *line, *victim; - unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L; - - victim = list_first_entry(group_list, struct pblk_line, list); - - list_for_each_entry(line, group_list, list) { - if (!atomic_read(&line->sec_to_update)) - line_vsc = le32_to_cpu(*line->vsc); - if (line_vsc < victim_vsc) { - victim = line; - victim_vsc = le32_to_cpu(*victim->vsc); - } - } - - if (victim_vsc == ~0x0) - return NULL; - - return victim; -} - -static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) -{ - unsigned int nr_blocks_free, nr_blocks_need; - unsigned int werr_lines = atomic_read(&rl->werr_lines); - - nr_blocks_need = pblk_rl_high_thrs(rl); - nr_blocks_free = pblk_rl_nr_free_blks(rl); - - /* This is not critical, no need to take lock here */ - return ((werr_lines > 0) || - ((gc->gc_active) && (nr_blocks_need > nr_blocks_free))); -} - -void pblk_gc_free_full_lines(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - do { - spin_lock(&l_mg->gc_lock); - if (list_empty(&l_mg->gc_full_list)) { - spin_unlock(&l_mg->gc_lock); - return; - } - - line = list_first_entry(&l_mg->gc_full_list, - struct pblk_line, list); - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - atomic_inc(&gc->pipeline_gc); - kref_put(&line->ref, pblk_line_put); - } while (1); -} - -/* - * Lines with no valid sectors will be returned to the free list immediately. 
If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. - */ -static void pblk_gc_run(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int read_inflight_gc, gc_group = 0, prev_group = 0; - - pblk_gc_free_full_lines(pblk); - - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) - return; - -next_gc_group: - group_list = l_mg->gc_lists[gc_group++]; - - do { - spin_lock(&l_mg->gc_lock); - - line = pblk_gc_get_victim_line(pblk, group_list); - if (!line) { - spin_unlock(&l_mg->gc_lock); - break; - } - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - - read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); - pblk_gc_reader_kick(gc); - - prev_group = 1; - - /* No need to queue up more GC lines than we can handle */ - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) - break; - } while (1); - - if (!prev_group && pblk->rl.rb_state > gc_group && - gc_group < PBLK_GC_NR_LISTS) - goto next_gc_group; -} - -static void pblk_gc_timer(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, gc.gc_timer); - - pblk_gc_kick(pblk); -} - -static int pblk_gc_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - pblk_gc_run(pblk); - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_writer_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - if (!pblk_gc_write(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_reader_ts(void *data) -{ - struct pblk *pblk = data; - struct pblk_gc *gc = &pblk->gc; - - while (!kthread_should_stop()) { - if (!pblk_gc_read(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "flushing gc pipeline, %d lines left\n", - atomic_read(&gc->pipeline_gc)); -#endif - - do { - if (!atomic_read(&gc->pipeline_gc)) - break; - - schedule(); - } while (1); - - return 0; -} - -static void pblk_gc_start(struct pblk *pblk) -{ - pblk->gc.gc_active = 1; - pblk_debug(pblk, "gc start\n"); -} - -void pblk_gc_should_start(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_enabled && !gc->gc_active) { - pblk_gc_start(pblk); - pblk_gc_kick(pblk); - } -} - -void pblk_gc_should_stop(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_active && !gc->gc_forced) - gc->gc_active = 0; -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - pblk_rl_update_rates(&pblk->rl); -} - -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active) -{ - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&gc->lock); - *gc_enabled = gc->gc_enabled; - *gc_active = gc->gc_active; - spin_unlock(&gc->lock); -} - -int pblk_gc_sysfs_force(struct pblk *pblk, int force) -{ - struct pblk_gc *gc = 
&pblk->gc;
-
-	if (force < 0 || force > 1)
-		return -EINVAL;
-
-	spin_lock(&gc->lock);
-	gc->gc_forced = force;
-
-	if (force)
-		gc->gc_enabled = 1;
-	else
-		gc->gc_enabled = 0;
-	spin_unlock(&gc->lock);
-
-	pblk_gc_should_start(pblk);
-
-	return 0;
-}
-
-int pblk_gc_init(struct pblk *pblk)
-{
-	struct pblk_gc *gc = &pblk->gc;
-	int ret;
-
-	gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
-	if (IS_ERR(gc->gc_ts)) {
-		pblk_err(pblk, "could not allocate GC main kthread\n");
-		return PTR_ERR(gc->gc_ts);
-	}
-
-	gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
-							"pblk-gc-writer-ts");
-	if (IS_ERR(gc->gc_writer_ts)) {
-		pblk_err(pblk, "could not allocate GC writer kthread\n");
-		ret = PTR_ERR(gc->gc_writer_ts);
-		goto fail_free_main_kthread;
-	}
-
-	gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
-							"pblk-gc-reader-ts");
-	if (IS_ERR(gc->gc_reader_ts)) {
-		pblk_err(pblk, "could not allocate GC reader kthread\n");
-		ret = PTR_ERR(gc->gc_reader_ts);
-		goto fail_free_writer_kthread;
-	}
-
-	timer_setup(&gc->gc_timer, pblk_gc_timer, 0);
-	mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
-
-	gc->gc_active = 0;
-	gc->gc_forced = 0;
-	gc->gc_enabled = 1;
-	gc->w_entries = 0;
-	atomic_set(&gc->read_inflight_gc, 0);
-	atomic_set(&gc->pipeline_gc, 0);
-
-	/* Workqueue that reads valid sectors from a line and submits them to
-	 * the GC writer to be recycled.
-	 */
-	gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
-			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
-	if (!gc->gc_line_reader_wq) {
-		pblk_err(pblk, "could not allocate GC line reader workqueue\n");
-		ret = -ENOMEM;
-		goto fail_free_reader_kthread;
-	}
-
-	/* Workqueue that prepares lines for GC */
-	gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
-			WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-	if (!gc->gc_reader_wq) {
-		pblk_err(pblk, "could not allocate GC reader workqueue\n");
-		ret = -ENOMEM;
-		goto fail_free_reader_line_wq;
-	}
-
-	spin_lock_init(&gc->lock);
-	spin_lock_init(&gc->w_lock);
-	spin_lock_init(&gc->r_lock);
-
-	sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);
-
-	INIT_LIST_HEAD(&gc->w_list);
-	INIT_LIST_HEAD(&gc->r_list);
-
-	return 0;
-
-fail_free_reader_line_wq:
-	destroy_workqueue(gc->gc_line_reader_wq);
-fail_free_reader_kthread:
-	kthread_stop(gc->gc_reader_ts);
-fail_free_writer_kthread:
-	kthread_stop(gc->gc_writer_ts);
-fail_free_main_kthread:
-	kthread_stop(gc->gc_ts);
-
-	return ret;
-}
-
-void pblk_gc_exit(struct pblk *pblk, bool graceful)
-{
-	struct pblk_gc *gc = &pblk->gc;
-
-	gc->gc_enabled = 0;
-	del_timer_sync(&gc->gc_timer);
-	gc->gc_active = 0;
-
-	if (gc->gc_ts)
-		kthread_stop(gc->gc_ts);
-
-	if (gc->gc_reader_ts)
-		kthread_stop(gc->gc_reader_ts);
-
-	if (graceful) {
-		flush_workqueue(gc->gc_reader_wq);
-		flush_workqueue(gc->gc_line_reader_wq);
-	}
-
-	destroy_workqueue(gc->gc_reader_wq);
-	destroy_workqueue(gc->gc_line_reader_wq);
-
-	if (gc->gc_writer_ts)
-		kthread_stop(gc->gc_writer_ts);
-}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
deleted file mode 100644
index 5924f09c217b..000000000000
--- a/drivers/lightnvm/pblk-init.c
+++ /dev/null
@@ -1,1324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
- * Copyright (C) 2016 CNEX Labs
- * Initial release: Javier Gonzalez
- *                  Matias Bjorling
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software 
Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-init.c - pblk's initialization. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned int write_buffer_size; - -module_param(write_buffer_size, uint, 0644); -MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer"); - -struct pblk_global_caches { - struct kmem_cache *ws; - struct kmem_cache *rec; - struct kmem_cache *g_rq; - struct kmem_cache *w_rq; - - struct kref kref; - - struct mutex mutex; /* Ensures consistency between - * caches and kref - */ -}; - -static struct pblk_global_caches pblk_caches = { - .mutex = __MUTEX_INITIALIZER(pblk_caches.mutex), - .kref = KREF_INIT(0), -}; - -struct bio_set pblk_bio_set; - -static blk_qc_t pblk_submit_bio(struct bio *bio) -{ - struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; - - if (bio_op(bio) == REQ_OP_DISCARD) { - pblk_discard(pblk, bio); - if (!(bio->bi_opf & REQ_PREFLUSH)) { - bio_endio(bio); - return BLK_QC_T_NONE; - } - } - - /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap - * constraint. Writes can be of arbitrary size. - */ - if (bio_data_dir(bio) == READ) { - blk_queue_split(&bio); - pblk_submit_read(pblk, bio); - } else { - /* Prevent deadlock in the case of a modest LUN configuration - * and large user I/Os. Unless stalled, the rate limiter - * leaves at least 256KB available for user I/O. - */ - if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) - blk_queue_split(&bio); - - pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); - } - - return BLK_QC_T_NONE; -} - -static const struct block_device_operations pblk_bops = { - .owner = THIS_MODULE, - .submit_bio = pblk_submit_bio, -}; - - -static size_t pblk_trans_map_size(struct pblk *pblk) -{ - int entry_size = 8; - - if (pblk->addrf_len < 32) - entry_size = 4; - - return entry_size * pblk->capacity; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static u32 pblk_l2p_crc(struct pblk *pblk) -{ - size_t map_size; - u32 crc = ~(u32)0; - - map_size = pblk_trans_map_size(pblk); - crc = crc32_le(crc, pblk->trans_map, map_size); - return crc; -} -#endif - -static void pblk_l2p_free(struct pblk *pblk) -{ - vfree(pblk->trans_map); -} - -static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) -{ - struct pblk_line *line = NULL; - - if (factory_init) { - guid_gen(&pblk->instance_uuid); - } else { - line = pblk_recov_l2p(pblk); - if (IS_ERR(line)) { - pblk_err(pblk, "could not recover l2p table\n"); - return -EFAULT; - } - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - /* Free full lines directly as GC has not been started yet */ - pblk_gc_free_full_lines(pblk); - - if (!line) { - /* Configure next line for user data */ - line = pblk_line_get_first_data(pblk); - if (!line) - return -EFAULT; - } - - return 0; -} - -static int pblk_l2p_init(struct pblk *pblk, bool factory_init) -{ - sector_t i; - struct ppa_addr ppa; - size_t map_size; - int ret = 0; - - map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | - __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); - if (!pblk->trans_map) { - pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", - map_size); - return -ENOMEM; - } - - 
pblk_ppa_set_empty(&ppa); - - for (i = 0; i < pblk->capacity; i++) - pblk_trans_map_set(pblk, i, ppa); - - ret = pblk_l2p_recover(pblk, factory_init); - if (ret) - vfree(pblk->trans_map); - - return ret; -} - -static void pblk_rwb_free(struct pblk *pblk) -{ - if (pblk_rb_tear_down_check(&pblk->rwb)) - pblk_err(pblk, "write buffer error on tear down\n"); - - pblk_rb_free(&pblk->rwb); -} - -static int pblk_rwb_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - unsigned long buffer_size; - int pgs_in_buffer, threshold; - - threshold = geo->mw_cunits * geo->all_luns; - pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt) - * geo->all_luns; - - if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) - buffer_size = write_buffer_size; - else - buffer_size = pgs_in_buffer; - - return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); -} - -static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, - struct nvm_addrf_12 *dst) -{ - struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; - int power_len; - - /* Re-calculate channel and lun format to adapt to configuration */ - power_len = get_count_order(geo->num_ch); - if (1 << power_len != geo->num_ch) { - pblk_err(pblk, "supports only power-of-two channel config.\n"); - return -EINVAL; - } - dst->ch_len = power_len; - - power_len = get_count_order(geo->num_lun); - if (1 << power_len != geo->num_lun) { - pblk_err(pblk, "supports only power-of-two LUN config.\n"); - return -EINVAL; - } - dst->lun_len = power_len; - - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->sec_offset = 0; - dst->pln_offset = dst->sec_len; - dst->ch_offset = dst->pln_offset + dst->pln_len; - dst->lun_offset = dst->ch_offset + dst->ch_len; - dst->pg_offset = dst->lun_offset + dst->lun_len; - dst->blk_offset = dst->pg_offset + dst->pg_len; - - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - - return dst->blk_offset + src->blk_len; -} - -static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst, - struct pblk_addrf *udst) -{ - struct nvm_addrf *src = &geo->addrf; - - adst->ch_len = get_count_order(geo->num_ch); - adst->lun_len = get_count_order(geo->num_lun); - adst->chk_len = src->chk_len; - adst->sec_len = src->sec_len; - - adst->sec_offset = 0; - adst->ch_offset = adst->sec_len; - adst->lun_offset = adst->ch_offset + adst->ch_len; - adst->chk_offset = adst->lun_offset + adst->lun_len; - - adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset; - adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset; - adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset; - adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset; - - udst->sec_stripe = geo->ws_opt; - udst->ch_stripe = geo->num_ch; - udst->lun_stripe = geo->num_lun; - - udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe; - udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe; - - return adst->chk_offset + adst->chk_len; -} - -static int pblk_set_addrf(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo 
= &dev->geo;
-	int mod;
-
-	switch (geo->version) {
-	case NVM_OCSSD_SPEC_12:
-		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
-		if (mod) {
-			pblk_err(pblk, "bad configuration of sectors/pages\n");
-			return -EINVAL;
-		}
-
-		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
-							(void *)&pblk->addrf);
-		break;
-	case NVM_OCSSD_SPEC_20:
-		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
-							&pblk->uaddrf);
-		break;
-	default:
-		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
-							geo->version);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int pblk_create_global_caches(void)
-{
-
-	pblk_caches.ws = kmem_cache_create("pblk_blk_ws",
-				sizeof(struct pblk_line_ws), 0, 0, NULL);
-	if (!pblk_caches.ws)
-		return -ENOMEM;
-
-	pblk_caches.rec = kmem_cache_create("pblk_rec",
-				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
-	if (!pblk_caches.rec)
-		goto fail_destroy_ws;
-
-	pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
-				0, 0, NULL);
-	if (!pblk_caches.g_rq)
-		goto fail_destroy_rec;
-
-	pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
-				0, 0, NULL);
-	if (!pblk_caches.w_rq)
-		goto fail_destroy_g_rq;
-
-	return 0;
-
-fail_destroy_g_rq:
-	kmem_cache_destroy(pblk_caches.g_rq);
-fail_destroy_rec:
-	kmem_cache_destroy(pblk_caches.rec);
-fail_destroy_ws:
-	kmem_cache_destroy(pblk_caches.ws);
-
-	return -ENOMEM;
-}
-
-static int pblk_get_global_caches(void)
-{
-	int ret = 0;
-
-	mutex_lock(&pblk_caches.mutex);
-
-	if (kref_get_unless_zero(&pblk_caches.kref))
-		goto out;
-
-	ret = pblk_create_global_caches();
-	if (!ret)
-		kref_init(&pblk_caches.kref);
-
-out:
-	mutex_unlock(&pblk_caches.mutex);
-	return ret;
-}
-
-static void pblk_destroy_global_caches(struct kref *ref)
-{
-	struct pblk_global_caches *c;
-
-	c = container_of(ref, struct pblk_global_caches, kref);
-
-	kmem_cache_destroy(c->ws);
-	kmem_cache_destroy(c->rec);
-	kmem_cache_destroy(c->g_rq);
-	kmem_cache_destroy(c->w_rq);
-}
-
-static void pblk_put_global_caches(void)
-{
-	mutex_lock(&pblk_caches.mutex);
-	kref_put(&pblk_caches.kref, pblk_destroy_global_caches);
-	mutex_unlock(&pblk_caches.mutex);
-}
-
-static int pblk_core_init(struct pblk *pblk)
-{
-	struct nvm_tgt_dev *dev = pblk->dev;
-	struct nvm_geo *geo = &dev->geo;
-	int ret, max_write_ppas;
-
-	atomic64_set(&pblk->user_wa, 0);
-	atomic64_set(&pblk->pad_wa, 0);
-	atomic64_set(&pblk->gc_wa, 0);
-	pblk->user_rst_wa = 0;
-	pblk->pad_rst_wa = 0;
-	pblk->gc_rst_wa = 0;
-
-	atomic64_set(&pblk->nr_flush, 0);
-	pblk->nr_flush_rst = 0;
-
-	pblk->min_write_pgs = geo->ws_opt;
-	pblk->min_write_pgs_data = pblk->min_write_pgs;
-	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
-	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
-	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
-		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
-	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
-
-	pblk->oob_meta_size = geo->sos;
-	if (!pblk_is_oob_meta_supported(pblk)) {
-		/* For drives which do not have the OOB metadata feature,
-		 * in order to support the recovery feature we need to use
-		 * so-called packed metadata. Packed metadata will store
-		 * the same information as OOB metadata (l2p table mapping),
-		 * but in the form of a single page at the end of
-		 * every write request.
-		 */
-		if (pblk->min_write_pgs
-			* sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
-			/* We want to keep all the packed metadata on a single
-			 * page per write request. So we need to ensure that
-			 * it will fit.
-			 *
-			 * This is more like a sanity check, since there is
-			 * no device with such a big minimal write size
-			 * (above 1 megabytes).
-			 */
-			pblk_err(pblk, "Not supported min write size\n");
-			return -EINVAL;
-		}
-		/* For the packed meta approach we do some simplification.
-		 * On the read path we always issue requests whose size is
-		 * equal to max_write_pgs, with all pages filled with the
-		 * user payload except the last page, which will be
-		 * filled with packed metadata.
-		 */
-		pblk->max_write_pgs = pblk->min_write_pgs;
-		pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
-	}
-
-	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
-								GFP_KERNEL);
-	if (!pblk->pad_dist)
-		return -ENOMEM;
-
-	if (pblk_get_global_caches())
-		goto fail_free_pad_dist;
-
-	/* Internal bios can be at most the sectors signaled by the device. */
-	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
-	if (ret)
-		goto free_global_caches;
-
-	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
-				     pblk_caches.ws);
-	if (ret)
-		goto free_page_bio_pool;
-
-	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
-				     pblk_caches.rec);
-	if (ret)
-		goto free_gen_ws_pool;
-
-	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
-				     pblk_caches.g_rq);
-	if (ret)
-		goto free_rec_pool;
-
-	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
-				     pblk_caches.g_rq);
-	if (ret)
-		goto free_r_rq_pool;
-
-	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
-				     pblk_caches.w_rq);
-	if (ret)
-		goto free_e_rq_pool;
-
-	pblk->close_wq = alloc_workqueue("pblk-close-wq",
-			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
-	if (!pblk->close_wq)
-		goto free_w_rq_pool;
-
-	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
-			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
-	if (!pblk->bb_wq)
-		goto free_close_wq;
-
-	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
-			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
-	if (!pblk->r_end_wq)
-		goto free_bb_wq;
-
-	if (pblk_set_addrf(pblk))
-		goto free_r_end_wq;
-
-	INIT_LIST_HEAD(&pblk->compl_list);
-	INIT_LIST_HEAD(&pblk->resubmit_list);
-
-	return 0;
-
-free_r_end_wq:
-	destroy_workqueue(pblk->r_end_wq);
-free_bb_wq:
-	destroy_workqueue(pblk->bb_wq);
-free_close_wq:
-	destroy_workqueue(pblk->close_wq);
-free_w_rq_pool:
-	mempool_exit(&pblk->w_rq_pool);
-free_e_rq_pool:
-	mempool_exit(&pblk->e_rq_pool);
-free_r_rq_pool:
-	mempool_exit(&pblk->r_rq_pool);
-free_rec_pool:
-	mempool_exit(&pblk->rec_pool);
-free_gen_ws_pool:
-	mempool_exit(&pblk->gen_ws_pool);
-free_page_bio_pool:
-	mempool_exit(&pblk->page_bio_pool);
-free_global_caches:
-	pblk_put_global_caches();
-fail_free_pad_dist:
-	kfree(pblk->pad_dist);
-	return -ENOMEM;
-}
-
-static void pblk_core_free(struct pblk *pblk)
-{
-	if (pblk->close_wq)
-		destroy_workqueue(pblk->close_wq);
-
-	if (pblk->r_end_wq)
-		destroy_workqueue(pblk->r_end_wq);
-
-	if (pblk->bb_wq)
-		destroy_workqueue(pblk->bb_wq);
-
-	mempool_exit(&pblk->page_bio_pool);
-	mempool_exit(&pblk->gen_ws_pool);
-	mempool_exit(&pblk->rec_pool);
-	mempool_exit(&pblk->r_rq_pool);
-	mempool_exit(&pblk->e_rq_pool);
-	mempool_exit(&pblk->w_rq_pool);
-
-	pblk_put_global_caches();
-	kfree(pblk->pad_dist);
-}
-
-static void pblk_line_mg_free(struct pblk *pblk)
-{
-	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
-	int i;
-
-	kfree(l_mg->bb_template);
-	kfree(l_mg->bb_aux);
-	kfree(l_mg->vsc_list);
-
-	for (i = 0; i < PBLK_DATA_LINES; i++) {
-		kfree(l_mg->sline_meta[i]);
-		kvfree(l_mg->eline_meta[i]->buf);
-		kfree(l_mg->eline_meta[i]);
-	}
-
-
mempool_destroy(l_mg->bitmap_pool); - kmem_cache_destroy(l_mg->bitmap_cache); -} - -static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, - struct pblk_line *line) -{ - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - kfree(line->blk_bitmap); - kfree(line->erase_bitmap); - kfree(line->chks); - - kvfree(w_err_gc->lba_list); - kfree(w_err_gc); -} - -static void pblk_lines_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - int i; - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - pblk_line_free(line); - pblk_line_meta_free(l_mg, line); - } - - pblk_line_mg_free(pblk); - - kfree(pblk->luns); - kfree(pblk->lines); -} - -static int pblk_luns_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int i; - - /* TODO: Implement unbalanced LUN support */ - if (geo->num_lun < 0) { - pblk_err(pblk, "unbalanced LUN config.\n"); - return -EINVAL; - } - - pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), - GFP_KERNEL); - if (!pblk->luns) - return -ENOMEM; - - for (i = 0; i < geo->all_luns; i++) { - /* Stripe across channels */ - int ch = i % geo->num_ch; - int lun_raw = i / geo->num_ch; - int lunid = lun_raw + ch * geo->num_lun; - - rlun = &pblk->luns[i]; - rlun->bppa = dev->luns[lunid]; - - sema_init(&rlun->wr_sem, 1); - } - - return 0; -} - -/* See comment over struct line_emeta definition */ -static unsigned int calc_emeta_len(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - /* Round to sector size so that lba_list starts on its own sector */ - lm->emeta_sec[1] = DIV_ROUND_UP( - sizeof(struct line_emeta) + lm->blk_bitmap_len + - sizeof(struct wa_counters), geo->csecs); - lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs; - - /* Round to sector size so that vsc_list starts on its own sector */ - lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; - lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), - geo->csecs); - lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs; - - lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), - geo->csecs); - lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs; - - lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); - - return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); -} - -static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - sector_t provisioned; - int sec_meta, blk_meta, clba; - int minimum; - - if (geo->op == NVM_TARGET_DEFAULT_OP) - pblk->op = PBLK_DEFAULT_OP; - else - pblk->op = geo->op; - - minimum = pblk_get_min_chks(pblk); - provisioned = nr_free_chks; - provisioned *= (100 - pblk->op); - sector_div(provisioned, 100); - - if ((nr_free_chks - provisioned) < minimum) { - if (geo->op != NVM_TARGET_DEFAULT_OP) { - pblk_err(pblk, "OP too small to create a sane instance\n"); - return -EINTR; - } - - /* If the user did not specify an OP value, and PBLK_DEFAULT_OP - * is not enough, calculate and set sane value - */ - - provisioned = nr_free_chks - minimum; - pblk->op = (100 * minimum) / nr_free_chks; - pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n", - pblk->op); - } - - pblk->op_blks = nr_free_chks - provisioned; - - /* Internally pblk manages all 
free blocks, but all calculations based - * on user capacity consider only provisioned blocks - */ - pblk->rl.total_blocks = nr_free_chks; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data; - pblk->capacity = (provisioned - blk_meta) * clba; - - atomic_set(&pblk->rl.free_blocks, nr_free_chks); - atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); - - return 0; -} - -static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, - struct nvm_chk_meta *meta) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int i, nr_bad_chks = 0; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - struct nvm_chk_meta *chunk; - struct nvm_chk_meta *chunk_meta; - struct ppa_addr ppa; - int pos; - - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - chunk = &line->chks[pos]; - - ppa.m.chk = line->id; - chunk_meta = pblk_chunk_get_off(pblk, meta, ppa); - - chunk->state = chunk_meta->state; - chunk->type = chunk_meta->type; - chunk->wi = chunk_meta->wi; - chunk->slba = chunk_meta->slba; - chunk->cnlb = chunk_meta->cnlb; - chunk->wp = chunk_meta->wp; - - trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa, - chunk->state); - - if (chunk->type & NVM_CHK_TP_SZ_SPEC) { - WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n"); - continue; - } - - if (!(chunk->state & NVM_CHK_ST_OFFLINE)) - continue; - - set_bit(pos, line->blk_bitmap); - nr_bad_chks++; - } - - return nr_bad_chks; -} - -static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line, - void *chunk_meta, int line_id) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - long nr_bad_chks, chk_in_line; - - line->pblk = pblk; - line->id = line_id; - line->type = PBLK_LINETYPE_FREE; - line->state = PBLK_LINESTATE_NEW; - line->gc_group = PBLK_LINEGC_NONE; - line->vsc = &l_mg->vsc_list[line_id]; - spin_lock_init(&line->lock); - - nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta); - - chk_in_line = lm->blk_per_line - nr_bad_chks; - if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line || - chk_in_line < lm->min_blk_line) { - line->state = PBLK_LINESTATE_BAD; - list_add_tail(&line->list, &l_mg->bad_list); - return 0; - } - - atomic_set(&line->blk_in_line, chk_in_line); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - - return chk_in_line; -} - -static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - - line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->blk_bitmap) - return -ENOMEM; - - line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->erase_bitmap) - goto free_blk_bitmap; - - - line->chks = kmalloc_array(lm->blk_per_line, - sizeof(struct nvm_chk_meta), GFP_KERNEL); - if (!line->chks) - goto free_erase_bitmap; - - line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL); - if (!line->w_err_gc) - goto free_chks; - - return 0; - -free_chks: - kfree(line->chks); -free_erase_bitmap: - kfree(line->erase_bitmap); -free_blk_bitmap: - kfree(line->blk_bitmap); - return -ENOMEM; -} - -static int pblk_line_mg_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct 
pblk_line_meta *lm = &pblk->lm; - int i, bb_distance; - - l_mg->nr_lines = geo->num_chk; - l_mg->log_line = l_mg->data_line = NULL; - l_mg->l_seq_nr = l_mg->d_seq_nr = 0; - l_mg->nr_free_lines = 0; - bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - - INIT_LIST_HEAD(&l_mg->free_list); - INIT_LIST_HEAD(&l_mg->corrupt_list); - INIT_LIST_HEAD(&l_mg->bad_list); - INIT_LIST_HEAD(&l_mg->gc_full_list); - INIT_LIST_HEAD(&l_mg->gc_high_list); - INIT_LIST_HEAD(&l_mg->gc_mid_list); - INIT_LIST_HEAD(&l_mg->gc_low_list); - INIT_LIST_HEAD(&l_mg->gc_empty_list); - INIT_LIST_HEAD(&l_mg->gc_werr_list); - - INIT_LIST_HEAD(&l_mg->emeta_list); - - l_mg->gc_lists[0] = &l_mg->gc_werr_list; - l_mg->gc_lists[1] = &l_mg->gc_high_list; - l_mg->gc_lists[2] = &l_mg->gc_mid_list; - l_mg->gc_lists[3] = &l_mg->gc_low_list; - - spin_lock_init(&l_mg->free_lock); - spin_lock_init(&l_mg->close_lock); - spin_lock_init(&l_mg->gc_lock); - - l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); - if (!l_mg->vsc_list) - goto fail; - - l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_template) - goto fail_free_vsc_list; - - l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_aux) - goto fail_free_bb_template; - - /* smeta is always small enough to fit on a kmalloc memory allocation, - * emeta depends on the number of LUNs allocated to the pblk instance - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); - if (!l_mg->sline_meta[i]) - goto fail_free_smeta; - } - - l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap", - lm->sec_bitmap_len, 0, 0, NULL); - if (!l_mg->bitmap_cache) - goto fail_free_smeta; - - /* the bitmap pool is used for both valid and map bitmaps */ - l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2, - l_mg->bitmap_cache); - if (!l_mg->bitmap_pool) - goto fail_destroy_bitmap_cache; - - /* emeta allocates three different buffers for managing metadata with - * in-memory and in-media layouts - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - struct pblk_emeta *emeta; - - emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL); - if (!emeta) - goto fail_free_emeta; - - emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta->buf) { - kfree(emeta); - goto fail_free_emeta; - } - - emeta->nr_entries = lm->emeta_sec[0]; - l_mg->eline_meta[i] = emeta; - } - - for (i = 0; i < l_mg->nr_lines; i++) - l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); - - bb_distance = (geo->all_luns) * geo->ws_opt; - for (i = 0; i < lm->sec_per_line; i += bb_distance) - bitmap_set(l_mg->bb_template, i, geo->ws_opt); - - return 0; - -fail_free_emeta: - while (--i >= 0) { - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - mempool_destroy(l_mg->bitmap_pool); -fail_destroy_bitmap_cache: - kmem_cache_destroy(l_mg->bitmap_cache); -fail_free_smeta: - for (i = 0; i < PBLK_DATA_LINES; i++) - kfree(l_mg->sline_meta[i]); - kfree(l_mg->bb_aux); -fail_free_bb_template: - kfree(l_mg->bb_template); -fail_free_vsc_list: - kfree(l_mg->vsc_list); -fail: - return -ENOMEM; -} - -static int pblk_line_meta_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int smeta_len, emeta_len; - int i; - - lm->sec_per_line = geo->clba * geo->all_luns; - lm->blk_per_line = geo->all_luns; - lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * 
sizeof(long); - lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->mid_thrs = lm->sec_per_line / 2; - lm->high_thrs = lm->sec_per_line / 4; - lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; - - /* Calculate necessary pages for smeta. See comment over struct - * line_smeta definition - */ - i = 1; -add_smeta_page: - lm->smeta_sec = i * geo->ws_opt; - lm->smeta_len = lm->smeta_sec * geo->csecs; - - smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; - if (smeta_len > lm->smeta_len) { - i++; - goto add_smeta_page; - } - - /* Calculate necessary pages for emeta. See comment over struct - * line_emeta definition - */ - i = 1; -add_emeta_page: - lm->emeta_sec[0] = i * geo->ws_opt; - lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs; - - emeta_len = calc_emeta_len(pblk); - if (emeta_len > lm->emeta_len[0]) { - i++; - goto add_emeta_page; - } - - lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; - - lm->min_blk_line = 1; - if (geo->all_luns > 1) - lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + - lm->emeta_sec[0], geo->clba); - - if (lm->min_blk_line > lm->blk_per_line) { - pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", - lm->blk_per_line); - return -EINVAL; - } - - return 0; -} - -static int pblk_lines_init(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - void *chunk_meta; - int nr_free_chks = 0; - int i, ret; - - ret = pblk_line_meta_init(pblk); - if (ret) - return ret; - - ret = pblk_line_mg_init(pblk); - if (ret) - return ret; - - ret = pblk_luns_init(pblk); - if (ret) - goto fail_free_meta; - - chunk_meta = pblk_get_chunk_meta(pblk); - if (IS_ERR(chunk_meta)) { - ret = PTR_ERR(chunk_meta); - goto fail_free_luns; - } - - pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), - GFP_KERNEL); - if (!pblk->lines) { - ret = -ENOMEM; - goto fail_free_chunk_meta; - } - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - ret = pblk_alloc_line_meta(pblk, line); - if (ret) - goto fail_free_lines; - - nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } - - if (!nr_free_chks) { - pblk_err(pblk, "too many bad blocks prevent for sane instance\n"); - ret = -EINTR; - goto fail_free_lines; - } - - ret = pblk_set_provision(pblk, nr_free_chks); - if (ret) - goto fail_free_lines; - - vfree(chunk_meta); - return 0; - -fail_free_lines: - while (--i >= 0) - pblk_line_meta_free(l_mg, &pblk->lines[i]); - kfree(pblk->lines); -fail_free_chunk_meta: - vfree(chunk_meta); -fail_free_luns: - kfree(pblk->luns); -fail_free_meta: - pblk_line_mg_free(pblk); - - return ret; -} - -static int pblk_writer_init(struct pblk *pblk) -{ - pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); - if (IS_ERR(pblk->writer_ts)) { - int err = PTR_ERR(pblk->writer_ts); - - if (err != -EINTR) - pblk_err(pblk, "could not allocate writer kthread (%d)\n", - err); - return err; - } - - timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); - - return 0; -} - -static void pblk_writer_stop(struct pblk *pblk) -{ - /* The pipeline must be stopped and the write buffer emptied before the - * write thread is stopped - */ - WARN(pblk_rb_read_count(&pblk->rwb), - "Stopping not fully persisted write buffer\n"); - - WARN(pblk_rb_sync_count(&pblk->rwb), - "Stopping not fully synced write buffer\n"); - - del_timer_sync(&pblk->wtimer); - if (pblk->writer_ts) - 
kthread_stop(pblk->writer_ts); -} - -static void pblk_free(struct pblk *pblk) -{ - pblk_lines_free(pblk); - pblk_l2p_free(pblk); - pblk_rwb_free(pblk); - pblk_core_free(pblk); - - kfree(pblk); -} - -static void pblk_tear_down(struct pblk *pblk, bool graceful) -{ - if (graceful) - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); - pblk_writer_stop(pblk); - pblk_rb_sync_l2p(&pblk->rwb); - pblk_rl_free(&pblk->rl); - - pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); -} - -static void pblk_exit(void *private, bool graceful) -{ - struct pblk *pblk = private; - - pblk_gc_exit(pblk, graceful); - pblk_tear_down(pblk, graceful); - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - pblk_free(pblk); -} - -static sector_t pblk_capacity(void *private) -{ - struct pblk *pblk = private; - - return pblk->capacity * NR_PHY_IN_LOG; -} - -static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, - int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *bqueue = dev->q; - struct request_queue *tqueue = tdisk->queue; - struct pblk *pblk; - int ret; - - pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); - if (!pblk) - return ERR_PTR(-ENOMEM); - - pblk->dev = dev; - pblk->disk = tdisk; - pblk->state = PBLK_STATE_RUNNING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - pblk->gc.gc_enabled = 0; - - if (!(geo->version == NVM_OCSSD_SPEC_12 || - geo->version == NVM_OCSSD_SPEC_20)) { - pblk_err(pblk, "OCSSD version not supported (%u)\n", - geo->version); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - if (geo->ext) { - pblk_err(pblk, "extended metadata not supported\n"); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - spin_lock_init(&pblk->resubmit_lock); - spin_lock_init(&pblk->trans_lock); - spin_lock_init(&pblk->lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_set(&pblk->inflight_writes, 0); - atomic_long_set(&pblk->padded_writes, 0); - atomic_long_set(&pblk->padded_wb, 0); - atomic_long_set(&pblk->req_writes, 0); - atomic_long_set(&pblk->sub_writes, 0); - atomic_long_set(&pblk->sync_writes, 0); - atomic_long_set(&pblk->inflight_reads, 0); - atomic_long_set(&pblk->cache_reads, 0); - atomic_long_set(&pblk->sync_reads, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_gc_writes, 0); - atomic_long_set(&pblk->recov_gc_reads, 0); -#endif - - atomic_long_set(&pblk->read_failed, 0); - atomic_long_set(&pblk->read_empty, 0); - atomic_long_set(&pblk->read_high_ecc, 0); - atomic_long_set(&pblk->read_failed_gc, 0); - atomic_long_set(&pblk->write_failed, 0); - atomic_long_set(&pblk->erase_failed, 0); - - ret = pblk_core_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize core\n"); - goto fail; - } - - ret = pblk_lines_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize lines\n"); - goto fail_free_core; - } - - ret = pblk_rwb_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize write buffer\n"); - goto fail_free_lines; - } - - ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); - if (ret) { - pblk_err(pblk, "could not initialize maps\n"); - goto fail_free_rwb; - } - - ret = pblk_writer_init(pblk); - if (ret) { - if (ret != -EINTR) - pblk_err(pblk, "could not initialize write thread\n"); - goto fail_free_l2p; - } - - ret = pblk_gc_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize gc\n"); - goto fail_stop_writer; - } - - /* inherit the size from the underlying device */ - 
blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); - blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); - - blk_queue_write_cache(tqueue, true, false); - - tqueue->limits.discard_granularity = geo->clba * geo->csecs; - tqueue->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); - - pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - geo->all_luns, pblk->l_mg.nr_lines, - (unsigned long long)pblk->capacity, - pblk->rwb.nr_entries); - - wake_up_process(pblk->writer_ts); - - /* Check if we need to start GC */ - pblk_gc_should_kick(pblk); - - return pblk; - -fail_stop_writer: - pblk_writer_stop(pblk); -fail_free_l2p: - pblk_l2p_free(pblk); -fail_free_rwb: - pblk_rwb_free(pblk); -fail_free_lines: - pblk_lines_free(pblk); -fail_free_core: - pblk_core_free(pblk); -fail: - kfree(pblk); - return ERR_PTR(ret); -} - -/* physical block device target */ -static struct nvm_tgt_type tt_pblk = { - .name = "pblk", - .version = {1, 0, 0}, - - .bops = &pblk_bops, - .capacity = pblk_capacity, - - .init = pblk_init, - .exit = pblk_exit, - - .sysfs_init = pblk_sysfs_init, - .sysfs_exit = pblk_sysfs_exit, - .owner = THIS_MODULE, -}; - -static int __init pblk_module_init(void) -{ - int ret; - - ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) - return ret; - ret = nvm_register_tgt_type(&tt_pblk); - if (ret) - bioset_exit(&pblk_bio_set); - return ret; -} - -static void pblk_module_exit(void) -{ - bioset_exit(&pblk_bio_set); - nvm_unregister_tgt_type(&tt_pblk); -} - -module_init(pblk_module_init); -module_exit(pblk_module_exit); -MODULE_AUTHOR("Javier Gonzalez "); -MODULE_AUTHOR("Matias Bjorling "); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs"); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c deleted file mode 100644 index 5408e32b2f13..000000000000 --- a/drivers/lightnvm/pblk-map.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-map.c - pblk's lba-ppa mapping strategy - * - */ - -#include "pblk.h" - -static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, - struct ppa_addr *ppa_list, - unsigned long *lun_bitmap, - void *meta_list, - unsigned int valid_secs) -{ - struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta; - struct pblk_w_ctx *w_ctx; - __le64 *lba_list; - u64 paddr; - int nr_secs = pblk->min_write_pgs; - int i; - - if (!line) - return -ENOSPC; - - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - /* If we cannot allocate a new line, make sure to store metadata - * on current line and then fail - */ - line = pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - - if (!line) { - pblk_pipeline_stop(pblk); - return -ENOSPC; - } - - } - - emeta = line->emeta; - lba_list = emeta_to_lbas(pblk, emeta->buf); - - paddr = pblk_alloc_page(pblk, line, nr_secs); - - for (i = 0; i < nr_secs; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - /* ppa to be sent to the device */ - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - /* Write context for target bio completion on write buffer. Note - * that the write buffer is protected by the sync backpointer, - * and a single writer thread have access to each specific entry - * at a time. Thus, it is safe to modify the context for the - * entry we are setting up for submission without taking any - * lock or memory barrier. - */ - if (i < valid_secs) { - kref_get(&line->ref); - atomic_inc(&line->sec_to_update); - w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); - w_ctx->ppa = ppa_list[i]; - meta->lba = cpu_to_le64(w_ctx->lba); - lba_list[paddr] = cpu_to_le64(w_ctx->lba); - if (lba_list[paddr] != addr_empty) - line->nr_valid_lbas++; - else - atomic64_inc(&pblk->pad_wa); - } else { - lba_list[paddr] = addr_empty; - meta->lba = addr_empty; - __pblk_map_invalidate(pblk, line, paddr); - } - } - - pblk_down_rq(pblk, ppa_list[0], lun_bitmap); - return 0; -} - -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off) -{ - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i; - int ret; - - for (i = off; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - } - - return 0; -} - -/* only if erase_ppa is set, acquire erase semaphore */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_line *e_line, *d_line; - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i, erase_lun; - int ret; - - - for (i = 0; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? 
(valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - - erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - - spin_lock(&e_line->lock); - if (!test_bit(erase_lun, e_line->erase_bitmap)) { - set_bit(erase_lun, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - - *erase_ppa = ppa_list[i]; - erase_ppa->a.blk = e_line->id; - erase_ppa->a.reserved = 0; - - spin_unlock(&e_line->lock); - - /* Avoid evaluating e_line->left_eblks */ - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - } - spin_unlock(&e_line->lock); - } - - d_line = pblk_line_get_data(pblk); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return -ENOSPC; - - /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(pblk_ppa_empty(*erase_ppa)) && - bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { - int bit = -1; - -retry: - bit = find_next_bit(d_line->blk_bitmap, - lm->blk_per_line, bit + 1); - if (bit >= lm->blk_per_line) - return 0; - - spin_lock(&e_line->lock); - if (test_bit(bit, e_line->erase_bitmap)) { - spin_unlock(&e_line->lock); - goto retry; - } - spin_unlock(&e_line->lock); - - set_bit(bit, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ - erase_ppa->a.blk = e_line->id; - } - - return 0; -} diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c deleted file mode 100644 index 5abb1705b039..000000000000 --- a/drivers/lightnvm/pblk-rb.c +++ /dev/null @@ -1,858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * - * Based upon the circular ringbuffer. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-rb.c - pblk's write buffer - */ - -#include - -#include "pblk.h" - -static DECLARE_RWSEM(pblk_rb_lock); - -static void pblk_rb_data_free(struct pblk_rb *rb) -{ - struct pblk_rb_pages *p, *t; - - down_write(&pblk_rb_lock); - list_for_each_entry_safe(p, t, &rb->pages, list) { - free_pages((unsigned long)page_address(p->pages), p->order); - list_del(&p->list); - kfree(p); - } - up_write(&pblk_rb_lock); -} - -void pblk_rb_free(struct pblk_rb *rb) -{ - pblk_rb_data_free(rb); - vfree(rb->entries); -} - -/* - * pblk_rb_calculate_size -- calculate the size of the write buffer - */ -static unsigned int pblk_rb_calculate_size(unsigned int nr_entries, - unsigned int threshold) -{ - unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA)); - unsigned int max_sz = max(thr_sz, nr_entries); - unsigned int max_io; - - /* Alloc a write buffer that can (i) fit at least two split bios - * (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the - * threshold will be respected - */ - max_io = (1 << max((int)(get_count_order(max_sz)), - (int)(get_count_order(NVM_MAX_VLBA << 1)))); - if ((threshold + NVM_MAX_VLBA) >= max_io) - max_io <<= 1; - - return max_io; -} - -/* - * Initialize ring buffer. The data and metadata buffers must be previously - * allocated and their size must be a power of two - * (Documentation/core-api/circular-buffers.rst) - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_size) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entries; - unsigned int init_entry = 0; - unsigned int max_order = MAX_ORDER - 1; - unsigned int power_size, power_seg_sz; - unsigned int alloc_order, order, iter; - unsigned int nr_entries; - - nr_entries = pblk_rb_calculate_size(size, threshold); - entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); - if (!entries) - return -ENOMEM; - - power_size = get_count_order(nr_entries); - power_seg_sz = get_count_order(seg_size); - - down_write(&pblk_rb_lock); - rb->entries = entries; - rb->seg_size = (1 << power_seg_sz); - rb->nr_entries = (1 << power_size); - rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; - rb->back_thres = threshold; - rb->flush_point = EMPTY_ENTRY; - - spin_lock_init(&rb->w_lock); - spin_lock_init(&rb->s_lock); - - INIT_LIST_HEAD(&rb->pages); - - alloc_order = power_size; - if (alloc_order >= max_order) { - order = max_order; - iter = (1 << (alloc_order - max_order)); - } else { - order = alloc_order; - iter = 1; - } - - do { - struct pblk_rb_entry *entry; - struct pblk_rb_pages *page_set; - void *kaddr; - unsigned long set_size; - int i; - - page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL); - if (!page_set) { - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - - page_set->order = order; - page_set->pages = alloc_pages(GFP_KERNEL, order); - if (!page_set->pages) { - kfree(page_set); - pblk_rb_data_free(rb); - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - kaddr = page_address(page_set->pages); - - entry = &rb->entries[init_entry]; - entry->data = kaddr; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - - set_size = (1 << order); - for (i = 1; i < set_size; i++) { - entry = &rb->entries[init_entry]; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->data = kaddr + (i * rb->seg_size); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - bio_list_init(&entry->w_ctx.bios); - } - - 
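The page-set loop here splits one logically contiguous buffer of 2^alloc_order entries into alloc_pages() calls capped at MAX_ORDER - 1, using the order/iter branch computed just above. That arithmetic can be checked in isolation; the sketch below assumes MAX_ORDER = 11 (the usual kernel default) and an example buffer order:

#include <stdio.h>

#define MAX_ORDER 11	/* assumed; matches the common kernel default */

int main(void)
{
	unsigned int alloc_order = 14;	/* example: 2^14 ring entries */
	unsigned int max_order = MAX_ORDER - 1;
	unsigned int order, iter;

	/* same branch as pblk_rb_init(): cap each allocation, iterate */
	if (alloc_order >= max_order) {
		order = max_order;
		iter = 1u << (alloc_order - max_order);
	} else {
		order = alloc_order;
		iter = 1;
	}

	printf("%u allocation(s) of 2^%u pages\n", iter, order);
	return 0;
}

With alloc_order = 14 this yields 16 allocations of 2^10 pages, which is exactly how many times the surrounding do/while body runs.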
list_add_tail(&page_set->list, &rb->pages); - iter--; - } while (iter > 0); - up_write(&pblk_rb_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_set(&rb->inflight_flush_point, 0); -#endif - - /* - * Initialize rate-limiter, which controls access to the write buffer - * by user and GC I/O - */ - pblk_rl_init(&pblk->rl, rb->nr_entries, threshold); - - return 0; -} - -static void clean_wctx(struct pblk_w_ctx *w_ctx) -{ - int flags; - - flags = READ_ONCE(w_ctx->flags); - WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY), - "pblk: overwriting unsubmitted data\n"); - - /* Release flags on context. Protect from writes and reads */ - smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); - pblk_ppa_set_empty(&w_ctx->ppa); - w_ctx->lba = ADDR_EMPTY; -} - -#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) -#define pblk_rb_ring_space(rb, head, tail, size) \ - (CIRC_SPACE(head, tail, size)) - -/* - * Buffer space is calculated with respect to the back pointer signaling - * synchronized entries to the media. - */ -static unsigned int pblk_rb_space(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries) -{ - return (p + nr_entries) & (rb->nr_entries - 1); -} - -/* - * Buffer count is calculated with respect to the submission entry signaling the - * entries that are available to send to the media - */ -unsigned int pblk_rb_read_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int subm = READ_ONCE(rb->subm); - - return pblk_rb_ring_count(mem, subm, rb->nr_entries); -} - -unsigned int pblk_rb_sync_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_count(mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int subm; - - subm = READ_ONCE(rb->subm); - /* Commit read means updating submission pointer */ - smp_store_release(&rb->subm, pblk_rb_ptr_wrap(rb, subm, nr_entries)); - - return subm; -} - -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_line *line; - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - unsigned int user_io = 0, gc_io = 0; - unsigned int i; - int flags; - - for (i = 0; i < to_update; i++) { - entry = &rb->entries[rb->l2p_update]; - w_ctx = &entry->w_ctx; - - flags = READ_ONCE(entry->w_ctx.flags); - if (flags & PBLK_IOTYPE_USER) - user_io++; - else if (flags & PBLK_IOTYPE_GC) - gc_io++; - else - WARN(1, "pblk: unknown IO type\n"); - - pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, - entry->cacheline); - - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - clean_wctx(w_ctx); - rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1); - } - - pblk_rl_out(&pblk->rl, user_io, gc_io); - - return 0; -} - -/* - * When we move the l2p_update pointer, we update the l2p table - lookups will - * point to the physical address instead of to the cacheline in the write buffer - * from this moment on. 
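The function this comment introduces (below) advances l2p_update only when incoming entries would otherwise overwrite cache lines that the L2P table still points at. That "evict just enough" computation can be reproduced with the circ_buf.h macro semantics used throughout this file; a sketch with example pointer values:

#include <stdio.h>

/* Macro bodies follow include/linux/circ_buf.h; size is a power of two */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned int size = 8;		/* ring entries, power of two */
	unsigned int mem = 5;		/* producer pointer */
	unsigned int l2p_update = 7;	/* L2P catch-up pointer */
	unsigned int nr_entries = 4;	/* incoming write */
	unsigned int space;

	space = CIRC_SPACE(mem, l2p_update, size);
	if (space > nr_entries)
		printf("no catch-up needed (space=%u)\n", space);
	else
		printf("update %u L2P entries first\n", nr_entries - space);
	return 0;
}

With these values, space is 1, so nr_entries - space = 3 entries must be pointed at their device addresses before the write may proceed.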
- */ -static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int mem, unsigned int sync) -{ - unsigned int space, count; - int ret = 0; - - lockdep_assert_held(&rb->w_lock); - - /* Update l2p only as buffer entries are being overwritten */ - space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries); - if (space > nr_entries) - goto out; - - count = nr_entries - space; - /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, count); - -out: - return ret; -} - -/* - * Update the l2p entry for all sectors stored on the write buffer. This means - * that all future lookups to the l2p table will point to a device address, not - * to the cacheline in the write buffer. - */ -void pblk_rb_sync_l2p(struct pblk_rb *rb) -{ - unsigned int sync; - unsigned int to_update; - - spin_lock(&rb->w_lock); - - /* Protect from reads and writes */ - sync = smp_load_acquire(&rb->sync); - - to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, to_update); - - spin_unlock(&rb->w_lock); -} - -/* - * Write @nr_entries to ring buffer from @data buffer if there is enough space. - * Typically, 4KB data chunks coming from a bio will be copied to the ring - * buffer, thus the write will fail if not all incoming data can be copied. - * - */ -static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, - struct pblk_rb_entry *entry) -{ - memcpy(entry->data, data, rb->seg_size); - - entry->w_ctx.lba = w_ctx.lba; - entry->w_ctx.ppa = w_ctx.ppa; -} - -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline); - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) - entry->w_ctx.lba = ADDR_EMPTY; - - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, - unsigned int pos) -{ - struct pblk_rb_entry *entry; - unsigned int sync, flush_point; - - pblk_rb_sync_init(rb, NULL); - sync = READ_ONCE(rb->sync); - - if (pos == sync) { - pblk_rb_sync_end(rb, NULL); - return 0; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_inc(&rb->inflight_flush_point); -#endif - - flush_point = (pos == 0) ? 
(rb->nr_entries - 1) : (pos - 1); - entry = &rb->entries[flush_point]; - - /* Protect flush points */ - smp_store_release(&rb->flush_point, flush_point); - - if (bio) - bio_list_add(&entry->w_ctx.bios, bio); - - pblk_rb_sync_end(rb, NULL); - - return bio ? 1 : 0; -} - -static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - unsigned int mem; - unsigned int sync; - unsigned int threshold; - - sync = READ_ONCE(rb->sync); - mem = READ_ONCE(rb->mem); - - threshold = nr_entries + rb->back_thres; - - if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < threshold) - return 0; - - if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) - return 0; - - *pos = mem; - - return 1; -} - -static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - /* Protect from read count */ - smp_store_release(&rb->mem, pblk_rb_ptr_wrap(rb, *pos, nr_entries)); - return 1; -} - -void pblk_rb_flush(struct pblk_rb *rb) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - unsigned int mem = READ_ONCE(rb->mem); - - if (pblk_rb_flush_point_set(rb, NULL, mem)) - return; - - pblk_write_kick(pblk); -} - -static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos, struct bio *bio, - int *io_ret) -{ - unsigned int mem; - - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - mem = pblk_rb_ptr_wrap(rb, *pos, nr_entries); - *io_ret = NVM_IO_DONE; - - if (bio->bi_opf & REQ_PREFLUSH) { - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - atomic64_inc(&pblk->nr_flush); - if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) - *io_ret = NVM_IO_OK; - } - - /* Protect from read count */ - smp_store_release(&rb->mem, mem); - - return 1; -} - -/* - * Atomically check that (i) there is space on the write buffer for the - * incoming I/O, and (ii) the current I/O type has enough budget in the write - * buffer (rate-limiter). - */ -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - int io_ret; - - spin_lock(&rb->w_lock); - io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); - if (io_ret) { - spin_unlock(&rb->w_lock); - return io_ret; - } - - if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { - spin_unlock(&rb->w_lock); - return NVM_IO_REQUEUE; - } - - pblk_rl_user_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return io_ret; -} - -/* - * Look at pblk_rb_may_write_user comment - */ -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - spin_lock(&rb->w_lock); - if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) { - spin_unlock(&rb->w_lock); - return 0; - } - - if (!pblk_rb_may_write(rb, nr_entries, pos)) { - spin_unlock(&rb->w_lock); - return 0; - } - - pblk_rl_gc_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return 1; -} - -/* - * Read available entries on rb and add them to the given bio. To avoid a memory - * copy, a page reference to the write buffer is used to be added to the bio. - * - * This function is used by the write thread to form the write bio that will - * persist data on the write buffer to the media. 
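The function described here (its body follows) busy-waits on PBLK_WRITTEN_DATA, relying on the publication discipline used for w_ctx.flags throughout this file: the writer fills the entry, then publishes the flag with smp_store_release(), while readers observe it via READ_ONCE() or smp_load_acquire() depending on the path. A C11 sketch of that contract, with hypothetical names and the acquire flavor spelled out to make the ordering explicit:

#include <stdatomic.h>
#include <stdio.h>

struct entry {
	int payload;		/* plain data, published by the flag */
	atomic_int flags;	/* 0 = writable, 1 = written */
};

static void producer(struct entry *e, int value)
{
	e->payload = value;	/* fill the data first */
	atomic_store_explicit(&e->flags, 1, memory_order_release);
}

static int consumer(struct entry *e, int *out)
{
	if (atomic_load_explicit(&e->flags, memory_order_acquire) != 1)
		return 0;	/* not published yet: spin or reschedule */
	*out = e->payload;	/* safe: ordered after the flag load */
	return 1;
}

int main(void)
{
	struct entry e = {0};	/* starts writable (flags == 0) */
	int v;

	producer(&e, 42);
	if (consumer(&e, &v))
		printf("read %d\n", v);
	return 0;
}

Once the consumer observes the flag with acquire semantics, the preceding payload store is guaranteed to be visible, which is what makes the ring entry safe to hand to the write bio without further locking.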
- */ -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct request_queue *q = pblk->dev->q; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - struct pblk_rb_entry *entry; - struct page *page; - unsigned int pad = 0, to_read = nr_entries; - unsigned int i; - int flags; - - if (count < nr_entries) { - pad = nr_entries - count; - to_read = count; - } - - /* Add space for packed metadata if in use*/ - pad += (pblk->min_write_pgs - pblk->min_write_pgs_data); - - c_ctx->sentry = pos; - c_ctx->nr_valid = to_read; - c_ctx->nr_padded = pad; - - for (i = 0; i < to_read; i++) { - entry = &rb->entries[pos]; - - /* A write has been allowed into the buffer, but data is still - * being copied to it. It is ok to busy wait. - */ -try: - flags = READ_ONCE(entry->w_ctx.flags); - if (!(flags & PBLK_WRITTEN_DATA)) { - io_schedule(); - goto try; - } - - page = virt_to_page(entry->data); - if (!page) { - pblk_err(pblk, "could not allocate write bio page\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != - rb->seg_size) { - pblk_err(pblk, "could not add page to write bio\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - - pos = pblk_rb_ptr_wrap(rb, pos, 1); - } - - if (pad) { - if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { - pblk_err(pblk, "could not pad page in write bio\n"); - return NVM_IO_ERR; - } - - if (pad < pblk->min_write_pgs) - atomic64_inc(&pblk->pad_dist[pad - 1]); - else - pblk_warn(pblk, "padding more than min. sectors\n"); - - atomic64_add(pad, &pblk->pad_wa); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(pad, &pblk->padded_writes); -#endif - - return NVM_IO_OK; -} - -/* - * Copy to bio only if the lba matches the one on the given cache entry. - * Otherwise, it means that the entry has been overwritten, and the bio should - * be directed to disk. 
- */ -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - struct ppa_addr l2p_ppa; - u64 pos = pblk_addr_to_cacheline(ppa); - void *data; - int flags; - int ret = 1; - - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must ensure that the access will not cause an overflow */ - BUG_ON(pos >= rb->nr_entries); -#endif - entry = &rb->entries[pos]; - w_ctx = &entry->w_ctx; - flags = READ_ONCE(w_ctx->flags); - - spin_lock(&rb->w_lock); - spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - /* Check if the entry has been overwritten or is scheduled to be */ - if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || - flags & PBLK_WRITABLE_ENTRY) { - ret = 0; - goto out; - } - data = bio_data(bio); - memcpy(data, entry->data, rb->seg_size); - -out: - spin_unlock(&rb->w_lock); - return ret; -} - -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos) -{ - unsigned int entry = pblk_rb_ptr_wrap(rb, pos, 0); - - return &rb->entries[entry].w_ctx; -} - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags) - __acquires(&rb->s_lock) -{ - if (flags) - spin_lock_irqsave(&rb->s_lock, *flags); - else - spin_lock_irq(&rb->s_lock); - - return rb->sync; -} - -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) - __releases(&rb->s_lock) -{ - lockdep_assert_held(&rb->s_lock); - - if (flags) - spin_unlock_irqrestore(&rb->s_lock, *flags); - else - spin_unlock_irq(&rb->s_lock); -} - -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int sync, flush_point; - lockdep_assert_held(&rb->s_lock); - - sync = READ_ONCE(rb->sync); - flush_point = READ_ONCE(rb->flush_point); - - if (flush_point != EMPTY_ENTRY) { - unsigned int secs_to_flush; - - secs_to_flush = pblk_rb_ring_count(flush_point, sync, - rb->nr_entries); - if (secs_to_flush < nr_entries) { - /* Protect flush points */ - smp_store_release(&rb->flush_point, EMPTY_ENTRY); - } - } - - sync = pblk_rb_ptr_wrap(rb, sync, nr_entries); - - /* Protect from counts */ - smp_store_release(&rb->sync, sync); - - return sync; -} - -/* Calculate how many sectors to submit up to the current flush point. */ -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) -{ - unsigned int subm, sync, flush_point; - unsigned int submitted, to_flush; - - /* Protect flush points */ - flush_point = smp_load_acquire(&rb->flush_point); - if (flush_point == EMPTY_ENTRY) - return 0; - - /* Protect syncs */ - sync = smp_load_acquire(&rb->sync); - - subm = READ_ONCE(rb->subm); - submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); - - /* The sync point itself counts as a sector to sync */ - to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; - - return (submitted < to_flush) ? 
(to_flush - submitted) : 0; -} - -int pblk_rb_tear_down_check(struct pblk_rb *rb) -{ - struct pblk_rb_entry *entry; - int i; - int ret = 0; - - spin_lock(&rb->w_lock); - spin_lock_irq(&rb->s_lock); - - if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && - (rb->sync == rb->l2p_update) && - (rb->flush_point == EMPTY_ENTRY)) { - goto out; - } - - if (!rb->entries) { - ret = 1; - goto out; - } - - for (i = 0; i < rb->nr_entries; i++) { - entry = &rb->entries[i]; - - if (!entry->data) { - ret = 1; - goto out; - } - } - -out: - spin_unlock_irq(&rb->s_lock); - spin_unlock(&rb->w_lock); - - return ret; -} - -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos) -{ - return (pos & (rb->nr_entries - 1)); -} - -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos) -{ - return (pos >= rb->nr_entries); -} - -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_c_ctx *c; - ssize_t offset; - int queued_entries = 0; - - spin_lock_irq(&rb->s_lock); - list_for_each_entry(c, &pblk->compl_list, list) - queued_entries++; - spin_unlock_irq(&rb->s_lock); - - if (rb->flush_point != EMPTY_ENTRY) - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - rb->flush_point, - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - else - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - - return offset; -} diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c deleted file mode 100644 index c28537a489bc..000000000000 --- a/drivers/lightnvm/pblk-read.c +++ /dev/null @@ -1,474 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-read.c - pblk's read path - */ - -#include "pblk.h" - -/* - * There is no guarantee that the value read from cache has not been updated and - * resides at another location in the cache. We guarantee though that if the - * value is read from the cache, it belongs to the mapped lba. In order to - * guarantee and order between writes and reads are ordered, a flush must be - * issued. 
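pblk_rb_copy_to_bio() above implements this guarantee by re-checking, under the write lock, that the L2P entry still resolves to the same cache line and that the entry still carries the expected lba before copying. A simplified standalone sketch of that validate-then-copy step (the driver compares PPAs; the sketch collapses the comparison to lbas for brevity):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct cache_entry {
	unsigned long lba;	/* lba this entry was written for */
	char data[16];
};

/* Copy only if neither the entry nor the L2P mapping moved since lookup */
static bool copy_if_current(const struct cache_entry *e,
			    unsigned long want_lba, unsigned long l2p_lba,
			    char *out, size_t len)
{
	if (e->lba != want_lba || l2p_lba != want_lba)
		return false;	/* overwritten or remapped: read the media */
	memcpy(out, e->data, len);
	return true;
}

int main(void)
{
	struct cache_entry e = { .lba = 7, .data = "cached" };
	char buf[16];

	printf("hit=%d\n", copy_if_current(&e, 7, 7, buf, sizeof(buf)));
	printf("hit=%d\n", copy_if_current(&e, 7, 9, buf, sizeof(buf)));
	return 0;
}

The second call misses because the L2P lookup no longer agrees with the cached entry, so the read falls through to the device path.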
- */ -static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, - sector_t lba, struct ppa_addr ppa) -{ -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(pblk_ppa_empty(ppa)); - BUG_ON(!pblk_addr_in_cache(ppa)); -#endif - - return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa); -} - -static int pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct bio *bio, sector_t blba, - bool *from_cache) -{ - void *meta_list = rqd->meta_list; - int nr_secs, i; - -retry: - nr_secs = pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, rqd->nr_ppas, - from_cache); - - if (!*from_cache) - goto end; - - for (i = 0; i < nr_secs; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - sector_t lba = blba + i; - - if (pblk_ppa_empty(rqd->ppa_list[i])) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - } else if (pblk_addr_in_cache(rqd->ppa_list[i])) { - /* - * Try to read from write buffer. The address is later - * checked on the write buffer to prevent retrieving - * overwritten data. - */ - if (!pblk_read_from_cache(pblk, bio, lba, - rqd->ppa_list[i])) { - if (i == 0) { - /* - * We didn't call with bio_advance() - * yet, so we can just retry. - */ - goto retry; - } else { - /* - * We already call bio_advance() - * so we cannot retry and we need - * to quit that function in order - * to allow caller to handle the bio - * splitting in the current sector - * position. - */ - nr_secs = i; - goto end; - } - } - meta->lba = cpu_to_le64(lba); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - -end: - if (pblk_io_aligned(pblk, nr_secs)) - rqd->is_seq = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_secs, &pblk->inflight_reads); -#endif - - return nr_secs; -} - - -static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, - sector_t blba) -{ - void *meta_list = rqd->meta_list; - int nr_lbas = rqd->nr_ppas; - int i; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - if (lba == ADDR_EMPTY) - continue; - - if (lba != blba + i) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[i], "seq", i); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - lba, (u64)blba + i); - WARN_ON(1); - } - } -} - -/* - * There can be holes in the lba list. 
- */ -static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, - u64 *lba_list, int nr_lbas) -{ - void *meta_lba_list = rqd->meta_list; - int i, j; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0, j = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - meta_lba_list, j); - u64 lba = lba_list[i]; - u64 meta_lba; - - if (lba == ADDR_EMPTY) - continue; - - meta_lba = le64_to_cpu(meta->lba); - - if (lba != meta_lba) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[j], "rnd", j); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - meta_lba, lba); - WARN_ON(1); - } - - j++; - } - - WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n"); -} - -static void pblk_end_user_read(struct bio *bio, int error) -{ - if (error && error != NVM_RSP_WARN_HIGHECC) - bio_io_error(bio); - else - bio_endio(bio); -} - -static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, - bool put_line) -{ - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *int_bio = rqd->bio; - unsigned long start_time = r_ctx->start_time; - - bio_end_io_acct(int_bio, start_time); - - if (rqd->error) - pblk_log_read_err(pblk, rqd); - - pblk_read_check_seq(pblk, rqd, r_ctx->lba); - bio_put(int_bio); - - if (put_line) - pblk_rq_to_line_put(pblk, rqd); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); - atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); -#endif - - pblk_free_rqd(pblk, rqd, PBLK_READ); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_read(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = (struct bio *)r_ctx->private; - - pblk_end_user_read(bio, rqd->error); - __pblk_end_io_read(pblk, rqd, true); -} - -static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, - sector_t lba, bool *from_cache) -{ - struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0); - struct ppa_addr ppa; - - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -retry: - if (pblk_ppa_empty(ppa)) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - return; - } - - /* Try to read from write buffer. The address is later checked on the - * write buffer to prevent retrieving overwritten data. 
- */
-	if (pblk_addr_in_cache(ppa)) {
-		if (!pblk_read_from_cache(pblk, bio, lba, ppa)) {
-			pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache);
-			goto retry;
-		}
-
-		meta->lba = cpu_to_le64(lba);
-
-#ifdef CONFIG_NVM_PBLK_DEBUG
-		atomic_long_inc(&pblk->cache_reads);
-#endif
-	} else {
-		rqd->ppa_addr = ppa;
-	}
-}
-
-void pblk_submit_read(struct pblk *pblk, struct bio *bio)
-{
-	sector_t blba = pblk_get_lba(bio);
-	unsigned int nr_secs = pblk_get_secs(bio);
-	bool from_cache;
-	struct pblk_g_ctx *r_ctx;
-	struct nvm_rq *rqd;
-	struct bio *int_bio, *split_bio;
-	unsigned long start_time;
-
-	start_time = bio_start_io_acct(bio);
-
-	rqd = pblk_alloc_rqd(pblk, PBLK_READ);
-
-	rqd->opcode = NVM_OP_PREAD;
-	rqd->nr_ppas = nr_secs;
-	rqd->private = pblk;
-	rqd->end_io = pblk_end_io_read;
-
-	r_ctx = nvm_rq_to_pdu(rqd);
-	r_ctx->start_time = start_time;
-	r_ctx->lba = blba;
-
-	if (pblk_alloc_rqd_meta(pblk, rqd)) {
-		bio_io_error(bio);
-		pblk_free_rqd(pblk, rqd, PBLK_READ);
-		return;
-	}
-
-	/* Clone read bio to deal internally with:
-	 * -read errors when reading from drive
-	 * -bio_advance() calls during cache reads
-	 */
-	int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
-
-	if (nr_secs > 1)
-		nr_secs = pblk_read_ppalist_rq(pblk, rqd, int_bio, blba,
-						&from_cache);
-	else
-		pblk_read_rq(pblk, rqd, int_bio, blba, &from_cache);
-
-split_retry:
-	r_ctx->private = bio; /* original bio */
-	rqd->bio = int_bio; /* internal bio */
-
-	if (from_cache && nr_secs == rqd->nr_ppas) {
-		/* All data was read from the cache, so we can complete
-		 * the IO.
-		 */
-		pblk_end_user_read(bio, 0);
-		atomic_inc(&pblk->inflight_io);
-		__pblk_end_io_read(pblk, rqd, false);
-	} else if (nr_secs != rqd->nr_ppas) {
-		/* The read bio request could be partially filled by the write
-		 * buffer, but there are some holes that need to be read from
-		 * the drive. To handle this, we use the block layer mechanism
-		 * to split the request into smaller ones and chain them.
-		 */
-		split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL,
-				      &pblk_bio_set);
-		bio_chain(split_bio, bio);
-		submit_bio_noacct(bio);
-
-		/* The new bio contains the first N sectors of the previous
-		 * one, so we can keep using the existing rqd, but we need to
-		 * shrink the number of PPAs in it. The new bio is also
-		 * guaranteed to contain data either from the cache or from
-		 * the drive, never a mix of the two.
-		 */
-		bio = split_bio;
-		rqd->nr_ppas = nr_secs;
-		if (rqd->nr_ppas == 1)
-			rqd->ppa_addr = rqd->ppa_list[0];
-
-		/* Recreate int_bio - the existing one might already have some
-		 * internal fields modified.
- */ - bio_put(int_bio); - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - goto split_retry; - } else if (pblk_submit_io(pblk, rqd, NULL)) { - /* Submitting IO to drive failed, let's report an error */ - rqd->error = -ENODEV; - pblk_end_io_read(rqd); - } -} - -static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, u64 *lba_list, - u64 *paddr_list_gc, unsigned int nr_secs) -{ - struct ppa_addr ppa_list_l2p[NVM_MAX_VLBA]; - struct ppa_addr ppa_gc; - int valid_secs = 0; - int i; - - pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); - - for (i = 0; i < nr_secs; i++) { - if (lba_list[i] == ADDR_EMPTY) - continue; - - ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); - if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { - paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; - continue; - } - - rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_secs, &pblk->inflight_reads); -#endif - - return valid_secs; -} - -static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba, - u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int valid_secs = 0; - - if (lba == ADDR_EMPTY) - goto out; - - /* logic error: lba out-of-bounds */ - if (lba >= pblk->capacity) { - WARN(1, "pblk: read lba out of bounds\n"); - goto out; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) - goto out; - - rqd->ppa_addr = ppa_l2p; - valid_secs = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -out: - return valid_secs; -} - -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_rq rqd; - int ret = NVM_IO_OK; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - if (gc_rq->nr_secs > 1) { - gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list, - gc_rq->paddr_list, - gc_rq->nr_secs); - if (gc_rq->secs_to_gc == 1) - rqd.ppa_addr = rqd.ppa_list[0]; - } else { - gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list[0], - gc_rq->paddr_list[0]); - } - - if (!(gc_rq->secs_to_gc)) - goto out; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = gc_rq->secs_to_gc; - - if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) { - ret = -EIO; - goto err_free_dma; - } - - pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - atomic_long_inc(&pblk->read_failed_gc); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, &rqd, rqd.error); -#endif - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); - atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); -#endif - -out: - pblk_free_rqd_meta(pblk, &rqd); - return ret; - -err_free_dma: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c deleted file mode 100644 index 0e6f0c76e930..000000000000 --- a/drivers/lightnvm/pblk-recovery.c +++ /dev/null @@ -1,874 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of 
the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-recovery.c - pblk's recovery path - * - * The L2P recovery path is single threaded as the L2P table is updated in order - * following the line sequence ID. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) -{ - u32 crc; - - crc = pblk_calc_emeta_crc(pblk, emeta_buf); - if (le32_to_cpu(emeta_buf->crc) != crc) - return 1; - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) - return 1; - - return 0; -} - -static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - __le64 *lba_list; - u64 data_start, data_end; - u64 nr_valid_lbas, nr_lbas = 0; - u64 i; - - lba_list = emeta_to_lbas(pblk, emeta_buf); - if (!lba_list) - return 1; - - data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = line->emeta_ssec; - nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - - for (i = data_start; i < data_end; i++) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, i, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - /* Do not update bad blocks */ - if (test_bit(pos, line->blk_bitmap)) - continue; - - if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) { - spin_lock(&line->lock); - if (test_and_set_bit(i, line->invalid_bitmap)) - WARN_ONCE(1, "pblk: rec. double invalidate:\n"); - else - le32_add_cpu(line->vsc, -1); - spin_unlock(&line->lock); - - continue; - } - - pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa); - nr_lbas++; - } - - if (nr_valid_lbas != nr_lbas) - pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", - line->id, nr_valid_lbas, nr_lbas); - - line->left_msecs = 0; - - return 0; -} - -static void pblk_update_line_wp(struct pblk *pblk, struct pblk_line *line, - u64 written_secs) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - for (i = 0; i < written_secs; i += pblk->min_write_pgs) - __pblk_alloc_page(pblk, line, pblk->min_write_pgs); - - spin_lock(&l_mg->free_lock); - if (written_secs > line->left_msecs) { - /* - * We have all data sectors written - * and some emeta sectors written too. - */ - line->left_msecs = 0; - } else { - /* We have only some data sectors written. 
*/ - line->left_msecs -= written_secs; - } - spin_unlock(&l_mg->free_lock); -} - -static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - u64 written_secs = 0; - int valid_chunks = 0; - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct nvm_chk_meta *chunk = &line->chks[i]; - - if (chunk->state & NVM_CHK_ST_OFFLINE) - continue; - - written_secs += chunk->wp; - valid_chunks++; - } - - if (lm->blk_per_line - nr_bb != valid_chunks) - pblk_err(pblk, "recovery line %d is bad\n", line->id); - - pblk_update_line_wp(pblk, line, written_secs - lm->smeta_sec); - - return written_secs; -} - -struct pblk_recov_alloc { - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list; - dma_addr_t dma_meta_list; -}; - -static void pblk_recov_complete(struct kref *ref) -{ - struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); - - complete(&pad_rq->wait); -} - -static void pblk_end_io_recov(struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_pad_rq *pad_rq = rqd->private; - struct pblk *pblk = pad_rq->pblk; - - pblk_up_chunk(pblk, ppa_list[0]); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); - kref_put(&pad_rq->ref, pblk_recov_complete); -} - -/* pad line using line bitmap. */ -static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, - int left_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - void *meta_list; - struct pblk_pad_rq *pad_rq; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *data; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - u64 w_ptr = line->cur_sec; - int left_line_ppas, rq_ppas; - int i, j; - int ret = 0; - - spin_lock(&line->lock); - left_line_ppas = line->left_msecs; - spin_unlock(&line->lock); - - pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); - if (!pad_rq) - return -ENOMEM; - - data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs)); - if (!data) { - ret = -ENOMEM; - goto free_rq; - } - - pad_rq->pblk = pblk; - init_completion(&pad_rq->wait); - kref_init(&pad_rq->ref); - -next_pad_rq: - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (rq_ppas < pblk->min_write_pgs) { - pblk_err(pblk, "corrupted pad line %d\n", line->id); - goto fail_complete; - } - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - ret = pblk_alloc_rqd_meta(pblk, rqd); - if (ret) { - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - rqd->bio = NULL; - rqd->opcode = NVM_OP_PWRITE; - rqd->is_seq = 1; - rqd->nr_ppas = rq_ppas; - rqd->end_io = pblk_end_io_recov; - rqd->private = pad_rq; - - ppa_list = nvm_rq_to_ppa_list(rqd); - meta_list = rqd->meta_list; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - w_ptr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { - struct ppa_addr dev_ppa; - struct pblk_sec_meta *meta; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - - pblk_map_invalidate(pblk, dev_ppa); - lba_list[w_ptr] = addr_empty; - meta = 
pblk_get_meta(pblk, meta_list, i); - meta->lba = addr_empty; - ppa_list[i] = dev_ppa; - } - } - - kref_get(&pad_rq->ref); - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - pblk_up_chunk(pblk, ppa_list[0]); - kref_put(&pad_rq->ref, pblk_recov_complete); - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - left_line_ppas -= rq_ppas; - left_ppas -= rq_ppas; - if (left_ppas && left_line_ppas) - goto next_pad_rq; - -fail_complete: - kref_put(&pad_rq->ref, pblk_recov_complete); - wait_for_completion(&pad_rq->wait); - - if (!pblk_line_is_full(line)) - pblk_err(pblk, "corrupted padded line: %d\n", line->id); - - vfree(data); -free_rq: - kfree(pad_rq); - return ret; -} - -static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int distance = geo->mw_cunits * geo->all_luns * geo->ws_opt; - - return (distance > line->left_msecs) ? line->left_msecs : distance; -} - -/* Return a chunk belonging to a line by stripe(write order) index */ -static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk, - struct pblk_line *line, - int index) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - struct ppa_addr ppa; - int pos; - - rlun = &pblk->luns[index]; - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - - return &line->chks[pos]; -} - -static int pblk_line_wps_are_unbalanced(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = lm->blk_per_line; - struct nvm_chk_meta *chunk; - u64 max_wp, min_wp; - int i; - - i = find_first_zero_bit(line->blk_bitmap, blk_in_line); - - /* If there is one or zero good chunks in the line, - * the write pointers can't be unbalanced. 
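The function this comment belongs to goes on to derive a tolerance window from the first good chunk's write pointer: every other good chunk must sit within max_write_pgs of it. The window arithmetic can be tried standalone with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned long long wps[] = { 96, 96, 64, 96 }; /* per-chunk wp */
	unsigned long long max_write_pgs = 16;
	unsigned long long max_wp = wps[0];	/* first good chunk */
	unsigned long long min_wp = max_wp > max_write_pgs ?
				    max_wp - max_write_pgs : 0;
	int i, unbalanced = 0;

	for (i = 1; i < 4; i++)
		if (wps[i] > max_wp || wps[i] < min_wp)
			unbalanced = 1;

	printf("unbalanced=%d\n", unbalanced); /* 1: chunk 2 lags at 64 */
	return 0;
}

With the window [80, 96], the chunk whose write pointer stalled at 64 flags the line, which is exactly the condition the recovery path warns about.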
- */ - if (i >= (blk_in_line - 1)) - return 0; - - chunk = pblk_get_stripe_chunk(pblk, line, i); - max_wp = chunk->wp; - if (max_wp > pblk->max_write_pgs) - min_wp = max_wp - pblk->max_write_pgs; - else - min_wp = 0; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - while (i < blk_in_line) { - chunk = pblk_get_stripe_chunk(pblk, line, i); - if (chunk->wp > max_wp || chunk->wp < min_wp) - return 1; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - } - - return 0; -} - -static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, - struct pblk_recov_alloc p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - __le64 *lba_list; - u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - bool padded = false; - int rq_ppas; - int i, j; - int ret; - u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; - - if (pblk_line_wps_are_unbalanced(pblk, line)) - pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); - - ppa_list = p.ppa_list; - meta_list = p.meta_list; - rqd = p.rqd; - data = p.data; - dma_ppa_list = p.dma_ppa_list; - dma_meta_list = p.dma_meta_list; - - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - -next_rq: - memset(rqd, 0, pblk_g_rq_size); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (!rq_ppas) - rq_ppas = pblk->min_write_pgs; - -retry_rq: - rqd->bio = NULL; - rqd->opcode = NVM_OP_PREAD; - rqd->meta_list = meta_list; - rqd->nr_ppas = rq_ppas; - rqd->ppa_list = ppa_list; - rqd->dma_ppa_list = dma_ppa_list; - rqd->dma_meta_list = dma_meta_list; - ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd->is_seq = 1; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - paddr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++) - ppa_list[i] = - addr_to_gen_ppa(pblk, paddr + j, line->id); - } - - ret = pblk_submit_io_sync(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - return ret; - } - - atomic_dec(&pblk->inflight_io); - - /* If a read fails, do a best effort by padding the line and retrying */ - if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { - int pad_distance, ret; - - if (padded) { - pblk_log_read_err(pblk, rqd); - return -EINTR; - } - - pad_distance = pblk_pad_distance(pblk, line); - ret = pblk_recov_pad_line(pblk, line, pad_distance); - if (ret) { - return ret; - } - - padded = true; - goto retry_rq; - } - - pblk_get_packed_meta(pblk, rqd); - - for (i = 0; i < rqd->nr_ppas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - lba_list[paddr++] = cpu_to_le64(lba); - - if (lba == ADDR_EMPTY || lba >= pblk->capacity) - continue; - - line->nr_valid_lbas++; - pblk_update_map(pblk, lba, ppa_list[i]); - } - - left_ppas -= rq_ppas; - if (left_ppas > 0) - goto next_rq; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(padded && !pblk_line_is_full(line)); -#endif - - return 0; -} - -/* Scan line for lbas on out of bound area */ -static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) -{ - struct 
nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *meta_list; - struct pblk_recov_alloc p; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - int ret = 0; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - - data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); - if (!data) { - ret = -ENOMEM; - goto free_meta_list; - } - - rqd = mempool_alloc(&pblk->r_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); - - p.ppa_list = ppa_list; - p.meta_list = meta_list; - p.rqd = rqd; - p.data = data; - p.dma_ppa_list = dma_ppa_list; - p.dma_meta_list = dma_meta_list; - - ret = pblk_recov_scan_oob(pblk, line, p); - if (ret) { - pblk_err(pblk, "could not recover L2P form OOB\n"); - goto out; - } - - if (pblk_line_is_full(line)) - pblk_line_recov_close(pblk, line); - -out: - mempool_free(rqd, &pblk->r_rq_pool); - kfree(data); -free_meta_list: - nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); - - return ret; -} - -/* Insert lines ordered by sequence number (seq_num) on list */ -static void pblk_recov_line_add_ordered(struct list_head *head, - struct pblk_line *line) -{ - struct pblk_line *t = NULL; - - list_for_each_entry(t, head, list) - if (t->seq_nr > line->seq_nr) - break; - - __list_add(&line->list, t->list.prev, &t->list); -} - -static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int emeta_secs; - u64 emeta_start; - struct ppa_addr ppa; - int pos; - - emeta_secs = lm->emeta_sec[0]; - emeta_start = lm->sec_per_line; - - while (emeta_secs) { - emeta_start--; - ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - if (!test_bit(pos, line->blk_bitmap)) - emeta_secs--; - } - - return emeta_start; -} - -static int pblk_recov_check_line_version(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct line_header *header = &emeta->header; - - if (header->version_major != EMETA_VERSION_MAJOR) { - pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", - header->version_major, EMETA_VERSION_MAJOR); - return 1; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (header->version_minor > EMETA_VERSION_MINOR) - pblk_info(pblk, "newer line minor version found: %d\n", - header->version_minor); -#endif - - return 0; -} - -static void pblk_recov_wa_counters(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct line_header *header = &emeta->header; - struct wa_counters *wa = emeta_to_wa(lm, emeta); - - /* WA counters were introduced in emeta version 0.2 */ - if (header->version_major > 0 || header->version_minor >= 2) { - u64 user = le64_to_cpu(wa->user); - u64 pad = le64_to_cpu(wa->pad); - u64 gc = le64_to_cpu(wa->gc); - - atomic64_set(&pblk->user_wa, user); - atomic64_set(&pblk->pad_wa, pad); - atomic64_set(&pblk->gc_wa, gc); - - pblk->user_rst_wa = user; - pblk->pad_rst_wa = pad; - pblk->gc_rst_wa = gc; - } -} - -static int pblk_line_was_written(struct pblk_line *line, - struct pblk *pblk) -{ - - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct ppa_addr bppa; - int smeta_blk; - - if (line->state == 
PBLK_LINESTATE_BAD) - return 0; - - smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (smeta_blk >= lm->blk_per_line) - return 0; - - bppa = pblk->luns[smeta_blk].bppa; - chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)]; - - if (chunk->state & NVM_CHK_ST_CLOSED || - (chunk->state & NVM_CHK_ST_OPEN - && chunk->wp >= lm->smeta_sec)) - return 1; - - return 0; -} - -static bool pblk_line_is_open(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int i; - - for (i = 0; i < lm->blk_per_line; i++) - if (line->chks[i].state & NVM_CHK_ST_OPEN) - return true; - - return false; -} - -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line, *tline, *data_line = NULL; - struct pblk_smeta *smeta; - struct pblk_emeta *emeta; - struct line_smeta *smeta_buf; - int found_lines = 0, recovered_lines = 0, open_lines = 0; - int is_next = 0; - int meta_line; - int i, valid_uuid = 0; - LIST_HEAD(recov_list); - - /* TODO: Implement FTL snapshot */ - - /* Scan recovery - takes place when FTL snapshot fails */ - spin_lock(&l_mg->free_lock); - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - set_bit(meta_line, &l_mg->meta_bitmap); - smeta = l_mg->sline_meta[meta_line]; - emeta = l_mg->eline_meta[meta_line]; - smeta_buf = (struct line_smeta *)smeta; - spin_unlock(&l_mg->free_lock); - - /* Order data lines using their sequence number */ - for (i = 0; i < l_mg->nr_lines; i++) { - u32 crc; - - line = &pblk->lines[i]; - - memset(smeta, 0, lm->smeta_len); - line->smeta = smeta; - line->lun_bitmap = ((void *)(smeta_buf)) + - sizeof(struct line_smeta); - - if (!pblk_line_was_written(line, pblk)) - continue; - - /* Lines that cannot be read are assumed as not written here */ - if (pblk_line_smeta_read(pblk, line)) - continue; - - crc = pblk_calc_smeta_crc(pblk, smeta_buf); - if (le32_to_cpu(smeta_buf->crc) != crc) - continue; - - if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) - continue; - - if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { - pblk_err(pblk, "found incompatible line version %u\n", - smeta_buf->header.version_major); - return ERR_PTR(-EINVAL); - } - - /* The first valid instance uuid is used for initialization */ - if (!valid_uuid) { - import_guid(&pblk->instance_uuid, smeta_buf->header.uuid); - valid_uuid = 1; - } - - if (!guid_equal(&pblk->instance_uuid, - (guid_t *)&smeta_buf->header.uuid)) { - pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", - i); - continue; - } - - /* Update line metadata */ - spin_lock(&line->lock); - line->id = le32_to_cpu(smeta_buf->header.id); - line->type = le16_to_cpu(smeta_buf->header.type); - line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); - spin_unlock(&line->lock); - - /* Update general metadata */ - spin_lock(&l_mg->free_lock); - if (line->seq_nr >= l_mg->d_seq_nr) - l_mg->d_seq_nr = line->seq_nr + 1; - l_mg->nr_free_lines--; - spin_unlock(&l_mg->free_lock); - - if (pblk_line_recov_alloc(pblk, line)) - goto out; - - pblk_recov_line_add_ordered(&recov_list, line); - found_lines++; - pblk_debug(pblk, "recovering data line %d, seq:%llu\n", - line->id, smeta_buf->seq_nr); - } - - if (!found_lines) { - guid_gen(&pblk->instance_uuid); - - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - goto out; - } - - /* Verify closed blocks and recover this portion of L2P table*/ - 
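The loop that follows tries progressively cheaper sources of truth per line: emeta first, for closed lines whose end metadata reads back and validates, falling back to a full out-of-band scan whenever the line is open, the emeta is unreadable, or its CRC/magic checks fail. A compressed sketch of that decision ladder, with hypothetical helpers standing in for the driver functions:

#include <stdbool.h>
#include <stdio.h>

struct line { int id; bool open; bool emeta_ok; };

static bool recover_from_emeta(struct line *l) { return l->emeta_ok; }
static void recover_from_oob(struct line *l)
{
	printf("line %d: OOB scan\n", l->id);
}

static void recover_line(struct line *l)
{
	if (!l->open && recover_from_emeta(l)) {
		printf("line %d: emeta\n", l->id);
		return;
	}
	/* open line, unreadable emeta, or failed validation: scan media */
	recover_from_oob(l);
}

int main(void)
{
	struct line a = { 1, false, true };	/* closed, good emeta */
	struct line b = { 2, true, false };	/* still open */

	recover_line(&a);
	recover_line(&b);
	return 0;
}

The real loop layers a version check and write-amplification counter recovery between the reads, but the fallback shape is the same.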
list_for_each_entry_safe(line, tline, &recov_list, list) { - recovered_lines++; - - line->emeta_ssec = pblk_line_emeta_start(pblk, line); - line->emeta = emeta; - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - if (pblk_line_is_open(pblk, line)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_line_emeta_read(pblk, line, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_emeta(pblk, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_line_version(pblk, line->emeta->buf)) - return ERR_PTR(-EINVAL); - - pblk_recov_wa_counters(pblk, line->emeta->buf); - - if (pblk_recov_l2p_from_emeta(pblk, line)) - pblk_recov_l2p_from_oob(pblk, line); - -next: - if (pblk_line_is_full(line)) { - struct list_head *move_list; - - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - spin_lock(&l_mg->gc_lock); - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - } else { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_OPEN; - spin_unlock(&line->lock); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - data_line = line; - line->meta_line = meta_line; - - open_lines++; - } - } - - if (!open_lines) { - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - } else { - spin_lock(&l_mg->free_lock); - l_mg->data_line = data_line; - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; - } - spin_unlock(&l_mg->free_lock); - } - - if (is_next) - pblk_line_erase(pblk, l_mg->data_next); - -out: - if (found_lines != recovered_lines) - pblk_err(pblk, "failed to recover all found lines %d/%d\n", - found_lines, recovered_lines); - - return data_line; -} - -/* - * Pad current line - */ -int pblk_recov_pad(struct pblk *pblk) -{ - struct pblk_line *line; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int left_msecs; - int ret = 0; - - spin_lock(&l_mg->free_lock); - line = l_mg->data_line; - left_msecs = line->left_msecs; - spin_unlock(&l_mg->free_lock); - - ret = pblk_recov_pad_line(pblk, line, left_msecs); - if (ret) { - pblk_err(pblk, "tear down padding failed (%d)\n", ret); - return ret; - } - - pblk_line_close_meta(pblk, line); - return ret; -} diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c deleted file mode 100644 index a5f8bc2defbc..000000000000 --- a/drivers/lightnvm/pblk-rl.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
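For reference, pblk_recov_line_add_ordered() above is a plain linear insertion keyed on seq_nr. The same idea self-contained in userspace, with a hand-rolled singly-linked list standing in for list_head:

#include <stdio.h>

struct rline {
        unsigned long long seq_nr;
        struct rline *next;
};

/* Insert 'line' before the first element with a strictly larger
 * seq_nr, mirroring the '>' comparison in the kernel walk. */
static void line_add_ordered(struct rline **head, struct rline *line)
{
        struct rline **pp = head;

        while (*pp && (*pp)->seq_nr <= line->seq_nr)
                pp = &(*pp)->next;
        line->next = *pp;
        *pp = line;
}

int main(void)
{
        struct rline a = { 7, NULL }, b = { 3, NULL }, c = { 5, NULL };
        struct rline *head = NULL, *p;

        line_add_ordered(&head, &a);
        line_add_ordered(&head, &b);
        line_add_ordered(&head, &c);
        for (p = head; p; p = p->next)
                printf("%llu\n", p->seq_nr);    /* prints 3 5 7 */
        return 0;
}

Equal sequence numbers keep their insertion order, matching the strict greater-than break in the original.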
See the GNU - * General Public License for more details. - * - * pblk-rl.c - pblk's rate limiter for user I/O - * - */ - -#include "pblk.h" - -static void pblk_rl_kick_u_timer(struct pblk_rl *rl) -{ - mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); -} - -int pblk_rl_is_limit(struct pblk_rl *rl) -{ - int rb_space; - - rb_space = atomic_read(&rl->rb_space); - - return (rb_space == 0); -} - -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_user_cnt = atomic_read(&rl->rb_user_cnt); - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) - return NVM_IO_ERR; - - if (rb_user_cnt >= rl->rb_user_max) - return NVM_IO_REQUEUE; - - return NVM_IO_OK; -} - -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) -{ - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0)) - atomic_sub(nr_entries, &rl->rb_space); -} - -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); - int rb_user_active; - - /* If there is no user I/O let GC take over space on the write buffer */ - rb_user_active = READ_ONCE(rl->rb_user_active); - return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active)); -} - -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_user_cnt); - - /* Release user I/O state. Protect from GC */ - smp_store_release(&rl->rb_user_active, 1); - pblk_rl_kick_u_timer(rl); -} - -void pblk_rl_werr_line_in(struct pblk_rl *rl) -{ - atomic_inc(&rl->werr_lines); -} - -void pblk_rl_werr_line_out(struct pblk_rl *rl) -{ - atomic_dec(&rl->werr_lines); -} - -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_gc_cnt); -} - -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) -{ - atomic_sub(nr_user, &rl->rb_user_cnt); - atomic_sub(nr_gc, &rl->rb_gc_cnt); -} - -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_blocks); -} - -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_user_blocks); -} - -static void __pblk_rl_update_rates(struct pblk_rl *rl, - unsigned long free_blocks) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - int max = rl->rb_budget; - int werr_gc_needed = atomic_read(&rl->werr_lines); - - if (free_blocks >= rl->high) { - if (werr_gc_needed) { - /* Allocate a small budget for recovering - * lines with write errors - */ - rl->rb_gc_max = 1 << rl->rb_windows_pw; - rl->rb_user_max = max - rl->rb_gc_max; - rl->rb_state = PBLK_RL_WERR; - } else { - rl->rb_user_max = max; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_OFF; - } - } else if (free_blocks < rl->high) { - int shift = rl->high_pw - rl->rb_windows_pw; - int user_windows = free_blocks >> shift; - int user_max = user_windows << ilog2(NVM_MAX_VLBA); - - rl->rb_user_max = user_max; - rl->rb_gc_max = max - user_max; - - if (free_blocks <= rl->rsv_blocks) { - rl->rb_user_max = 0; - rl->rb_gc_max = max; - } - - /* In the worst case, we will need to GC lines in the low list - * (high valid sector count). 
If there are lines to GC on high - * or mid lists, these will be prioritized - */ - rl->rb_state = PBLK_RL_LOW; - } - - if (rl->rb_state != PBLK_RL_OFF) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); -} - -void pblk_rl_update_rates(struct pblk_rl *rl) -{ - __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); -} - -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_add(blk_in_line, &rl->free_blocks); - free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_sub(blk_in_line, &rl->free_blocks); - - if (used) - free_blocks = atomic_sub_return(blk_in_line, - &rl->free_user_blocks); - else - free_blocks = atomic_read(&rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -int pblk_rl_high_thrs(struct pblk_rl *rl) -{ - return rl->high; -} - -int pblk_rl_max_io(struct pblk_rl *rl) -{ - return rl->rb_max_io; -} - -static void pblk_rl_u_timer(struct timer_list *t) -{ - struct pblk_rl *rl = from_timer(rl, t, u_timer); - - /* Release user I/O state. Protect from GC */ - smp_store_release(&rl->rb_user_active, 0); -} - -void pblk_rl_free(struct pblk_rl *rl) -{ - del_timer(&rl->u_timer); -} - -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - int sec_meta, blk_meta; - unsigned int rb_windows; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; - rl->high_pw = get_count_order(rl->high); - - rl->rsv_blocks = pblk_get_min_chks(pblk); - - /* This will always be a power-of-2 */ - rb_windows = budget / NVM_MAX_VLBA; - rl->rb_windows_pw = get_count_order(rb_windows); - - /* To start with, all buffer is available to user I/O writers */ - rl->rb_budget = budget; - rl->rb_user_max = budget; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_HIGH; - - /* Maximize I/O size and ensure that back threshold is respected */ - if (threshold) - rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold; - else - rl->rb_max_io = budget - pblk->min_write_pgs_data - 1; - - atomic_set(&rl->rb_user_cnt, 0); - atomic_set(&rl->rb_gc_cnt, 0); - atomic_set(&rl->rb_space, -1); - atomic_set(&rl->werr_lines, 0); - - timer_setup(&rl->u_timer, pblk_rl_u_timer, 0); - - rl->rb_user_active = 0; - rl->rb_gc_active = 0; -} diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c deleted file mode 100644 index 6387302b03f2..000000000000 --- a/drivers/lightnvm/pblk-sysfs.c +++ /dev/null @@ -1,728 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
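The budget split computed by pblk_rl_init() and __pblk_rl_update_rates() above reduces to a few shifts. A worked userspace example; the geometry numbers (budget 4096 entries, NVM_MAX_VLBA 64, high watermark 1024 blocks, 512 free user blocks) are assumptions for illustration only:

#include <stdio.h>

/* ilog2 analogue for the powers of two used here. */
static unsigned ilog2_u(unsigned v) { unsigned n = 0; while (v >>= 1) n++; return n; }
/* get_count_order analogue: smallest order with 2^order >= v. */
static unsigned count_order(unsigned v)
{
        unsigned n = ilog2_u(v);
        return (v & (v - 1)) ? n + 1 : n;
}

int main(void)
{
        unsigned budget = 4096;     /* write buffer entries           */
        unsigned max_vlba = 64;     /* NVM_MAX_VLBA                   */
        unsigned high = 1024;       /* rl->high free-block threshold  */
        unsigned free_blocks = 512; /* current free user blocks       */

        unsigned windows_pw = count_order(budget / max_vlba); /* rb_windows_pw */
        unsigned high_pw = count_order(high);                 /* rl->high_pw   */
        unsigned shift = high_pw - windows_pw;
        unsigned user_max = (free_blocks >> shift) << ilog2_u(max_vlba);

        printf("user budget %u of %u, gc budget %u\n",
               user_max, budget, budget - user_max);
        return 0;
}

With half the user blocks free, half the buffer budget (2048 of 4096 entries) goes to user I/O and the rest to GC; the split moves in window-sized steps as free_blocks changes.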
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-sysfs.c - pblk's sysfs - * - */ - -#include "pblk.h" - -static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - ssize_t sz = 0; - int i; - - for (i = 0; i < geo->all_luns; i++) { - int active = 1; - - rlun = &pblk->luns[i]; - if (!down_trylock(&rlun->wr_sem)) { - active = 0; - up(&rlun->wr_sem); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "pblk: pos:%d, ch:%d, lun:%d - %d\n", - i, - rlun->bppa.a.ch, - rlun->bppa.a.lun, - active); - } - - return sz; -} - -static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) -{ - int free_blocks, free_user_blocks, total_blocks; - int rb_user_max, rb_user_cnt; - int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; - - free_blocks = pblk_rl_nr_free_blks(&pblk->rl); - free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); - rb_user_max = pblk->rl.rb_user_max; - rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); - rb_gc_max = pblk->rl.rb_gc_max; - rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); - rb_budget = pblk->rl.rb_budget; - rb_state = pblk->rl.rb_state; - - total_blocks = pblk->rl.total_blocks; - - return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", - rb_user_cnt, - rb_user_max, - rb_gc_cnt, - rb_gc_max, - rb_state, - rb_budget, - pblk->rl.high, - free_blocks, - free_user_blocks, - total_blocks, - READ_ONCE(pblk->rl.rb_user_active)); -} - -static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page) -{ - int gc_enabled, gc_active; - - pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active); - return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n", - gc_enabled, gc_active); -} - -static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page) -{ - ssize_t sz; - - sz = snprintf(page, PAGE_SIZE, - "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n", - atomic_long_read(&pblk->read_failed), - atomic_long_read(&pblk->read_high_ecc), - atomic_long_read(&pblk->read_empty), - atomic_long_read(&pblk->read_failed_gc), - atomic_long_read(&pblk->write_failed), - atomic_long_read(&pblk->erase_failed)); - - return sz; -} - -static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page) -{ - return pblk_rb_sysfs(&pblk->rwb, page); -} - -static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - ssize_t sz = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - pblk->addrf_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - gppaf->blk_offset, gppaf->blk_len, - gppaf->pg_offset, gppaf->pg_len, 
- gppaf->lun_offset, gppaf->lun_len, - gppaf->ch_offset, gppaf->ch_len, - gppaf->pln_offset, gppaf->pln_len, - gppaf->sec_offset, gppaf->sec_len); - } else { - struct nvm_addrf *ppaf = &pblk->addrf; - struct nvm_addrf *gppaf = &geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n", - pblk->addrf_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->chk_offset, ppaf->chk_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n", - gppaf->ch_offset, gppaf->ch_len, - gppaf->lun_offset, gppaf->lun_len, - gppaf->chk_offset, gppaf->chk_len, - gppaf->sec_offset, gppaf->sec_len); - } - - return sz; -} - -static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - ssize_t sz = 0; - int nr_free_lines; - int cur_data, cur_log; - int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; - int d_line_cnt = 0, l_line_cnt = 0; - int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; - int gc_werr = 0; - - int bad = 0, cor = 0; - int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; - int map_weight = 0, meta_weight = 0; - - spin_lock(&l_mg->free_lock); - cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1; - cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1; - nr_free_lines = l_mg->nr_free_lines; - - list_for_each_entry(line, &l_mg->free_list, list) - free_line_cnt++; - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->close_lock); - list_for_each_entry(line, &l_mg->emeta_list, list) - emeta_line_cnt++; - spin_unlock(&l_mg->close_lock); - - spin_lock(&l_mg->gc_lock); - list_for_each_entry(line, &l_mg->gc_full_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_full++; - } - - list_for_each_entry(line, &l_mg->gc_high_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_high++; - } - - list_for_each_entry(line, &l_mg->gc_mid_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_mid++; - } - - list_for_each_entry(line, &l_mg->gc_low_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_low++; - } - - list_for_each_entry(line, &l_mg->gc_empty_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_empty++; - } - - list_for_each_entry(line, &l_mg->gc_werr_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_werr++; - } - - list_for_each_entry(line, &l_mg->bad_list, list) - bad++; - list_for_each_entry(line, &l_mg->corrupt_list, list) - cor++; - spin_unlock(&l_mg->gc_lock); - - spin_lock(&l_mg->free_lock); - if (l_mg->data_line) { - cur_sec = l_mg->data_line->cur_sec; - msecs = l_mg->data_line->left_msecs; - vsc = le32_to_cpu(*l_mg->data_line->vsc); - sec_in_line = l_mg->data_line->sec_in_line; - meta_weight = 
bitmap_weight(&l_mg->meta_bitmap, - PBLK_DATA_LINES); - - spin_lock(&l_mg->data_line->lock); - if (l_mg->data_line->map_bitmap) - map_weight = bitmap_weight(l_mg->data_line->map_bitmap, - lm->sec_per_line); - else - map_weight = 0; - spin_unlock(&l_mg->data_line->lock); - } - spin_unlock(&l_mg->free_lock); - - if (nr_free_lines != free_line_cnt) - pblk_err(pblk, "corrupted free line list:%d/%d\n", - nr_free_lines, free_line_cnt); - - sz = scnprintf(page, PAGE_SIZE - sz, - "line: nluns:%d, nblks:%d, nsecs:%d\n", - geo->all_luns, lm->blk_per_line, lm->sec_per_line); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", - cur_data, cur_log, - nr_free_lines, - emeta_line_cnt, meta_weight, - closed_line_cnt, - bad, cor, - d_line_cnt, l_line_cnt, - l_mg->nr_lines); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n", - gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr, - atomic_read(&pblk->gc.read_inflight_gc)); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", - cur_data, cur_sec, msecs, vsc, sec_in_line, - map_weight, lm->sec_per_line, - atomic_read(&pblk->inflight_io)); - - return sz; -} - -static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - ssize_t sz = 0; - - sz = scnprintf(page, PAGE_SIZE - sz, - "smeta - len:%d, secs:%d\n", - lm->smeta_len, lm->smeta_sec); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "emeta - len:%d, sec:%d, bb_start:%d\n", - lm->emeta_len[0], lm->emeta_sec[0], - lm->emeta_bb); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "bitmap lengths: sec:%d, blk:%d, lun:%d\n", - lm->sec_bitmap_len, - lm->blk_bitmap_len, - lm->lun_bitmap_len); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "blk_line:%d, sec_line:%d, sec_blk:%d\n", - lm->blk_per_line, - lm->sec_per_line, - geo->clba); - - return sz; -} - -static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); -} - -static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad, - char *page) -{ - int sz; - - sz = scnprintf(page, PAGE_SIZE, - "user:%lld gc:%lld pad:%lld WA:", - user, gc, pad); - - if (!user) { - sz += scnprintf(page + sz, PAGE_SIZE - sz, "NaN\n"); - } else { - u64 wa_int; - u32 wa_frac; - - wa_int = (user + gc + pad) * 100000; - wa_int = div64_u64(wa_int, user); - wa_int = div_u64_rem(wa_int, 100000, &wa_frac); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n", - wa_int, wa_frac); - } - - return sz; -} - -static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp(atomic64_read(&pblk->user_wa), - atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa), - page); -} - -static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp( - atomic64_read(&pblk->user_wa) - pblk->user_rst_wa, - atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa, - atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page); -} - -static long long bucket_percentage(unsigned long long bucket, - unsigned long long total) -{ - int p = bucket * 100; - - p = div_u64(p, total); - - return p; -} - -static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) -{ - int sz = 0; - unsigned long long total; - unsigned long long total_buckets = 
0; - int buckets = pblk->min_write_pgs - 1; - int i; - - total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst; - if (!total) { - for (i = 0; i < (buckets + 1); i++) - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "%d:0 ", i); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; - } - - for (i = 0; i < buckets; i++) - total_buckets += atomic64_read(&pblk->pad_dist[i]); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "0:%lld%% ", - bucket_percentage(total - total_buckets, total)); - - for (i = 0; i < buckets; i++) { - unsigned long long p; - - p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]), - total); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ", - i + 1, p); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, - "%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n", - atomic_long_read(&pblk->inflight_writes), - atomic_long_read(&pblk->inflight_reads), - atomic_long_read(&pblk->req_writes), - (u64)atomic64_read(&pblk->nr_flush), - atomic_long_read(&pblk->padded_writes), - atomic_long_read(&pblk->padded_wb), - atomic_long_read(&pblk->sub_writes), - atomic_long_read(&pblk->sync_writes), - atomic_long_read(&pblk->recov_writes), - atomic_long_read(&pblk->recov_gc_writes), - atomic_long_read(&pblk->recov_gc_reads), - atomic_long_read(&pblk->cache_reads), - atomic_long_read(&pblk->sync_reads)); -} -#endif - -static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, - size_t len) -{ - size_t c_len; - int force; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &force)) - return -EINVAL; - - pblk_gc_sysfs_force(pblk, force); - - return len; -} - -static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int sec_per_write; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &sec_per_write)) - return -EINVAL; - - if (!pblk_is_oob_meta_supported(pblk)) { - /* For packed metadata case it is - * not allowed to change sec_per_write. 
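The max_sec_per_write store handler accepts only a value inside [min_write_pgs, max_write_pgs] that is also a multiple of min_write_pgs, as the checks just below spell out. The same predicate in isolation (the 8/64 bounds here are assumed values, not a device constraint):

#include <stdio.h>

/* Mirrors the bounds-and-multiple check in the store path above. */
static int sec_per_write_valid(int v, int min_pgs, int max_pgs)
{
        return v >= min_pgs && v <= max_pgs && (v % min_pgs) == 0;
}

int main(void)
{
        int tests[] = { 4, 8, 10, 64, 128 };

        for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
                printf("%3d -> %s\n", tests[i],
                       sec_per_write_valid(tests[i], 8, 64) ? "ok" : "-EINVAL");
        return 0;
}

Note the separate, earlier rejection: when OOB metadata is unsupported (packed metadata), the value may not be changed at all.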
- */ - return -EINVAL; - } - - if (sec_per_write < pblk->min_write_pgs - || sec_per_write > pblk->max_write_pgs - || sec_per_write % pblk->min_write_pgs != 0) - return -EINVAL; - - pblk_set_sec_per_write(pblk, sec_per_write); - - return len; -} - -static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - pblk->user_rst_wa = atomic64_read(&pblk->user_wa); - pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa); - pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa); - - return len; -} - - -static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - int buckets = pblk->min_write_pgs - 1; - int i; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - for (i = 0; i < buckets; i++) - atomic64_set(&pblk->pad_dist[i], 0); - - pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush); - - return len; -} - -static struct attribute sys_write_luns = { - .name = "write_luns", - .mode = 0444, -}; - -static struct attribute sys_rate_limiter_attr = { - .name = "rate_limiter", - .mode = 0444, -}; - -static struct attribute sys_gc_state = { - .name = "gc_state", - .mode = 0444, -}; - -static struct attribute sys_errors_attr = { - .name = "errors", - .mode = 0444, -}; - -static struct attribute sys_rb_attr = { - .name = "write_buffer", - .mode = 0444, -}; - -static struct attribute sys_stats_ppaf_attr = { - .name = "ppa_format", - .mode = 0444, -}; - -static struct attribute sys_lines_attr = { - .name = "lines", - .mode = 0444, -}; - -static struct attribute sys_lines_info_attr = { - .name = "lines_info", - .mode = 0444, -}; - -static struct attribute sys_gc_force = { - .name = "gc_force", - .mode = 0200, -}; - -static struct attribute sys_max_sec_per_write = { - .name = "max_sec_per_write", - .mode = 0644, -}; - -static struct attribute sys_write_amp_mileage = { - .name = "write_amp_mileage", - .mode = 0444, -}; - -static struct attribute sys_write_amp_trip = { - .name = "write_amp_trip", - .mode = 0644, -}; - -static struct attribute sys_padding_dist = { - .name = "padding_dist", - .mode = 0644, -}; - -#ifdef CONFIG_NVM_PBLK_DEBUG -static struct attribute sys_stats_debug_attr = { - .name = "stats", - .mode = 0444, -}; -#endif - -static struct attribute *pblk_attrs[] = { - &sys_write_luns, - &sys_rate_limiter_attr, - &sys_errors_attr, - &sys_gc_state, - &sys_gc_force, - &sys_max_sec_per_write, - &sys_rb_attr, - &sys_stats_ppaf_attr, - &sys_lines_attr, - &sys_lines_info_attr, - &sys_write_amp_mileage, - &sys_write_amp_trip, - &sys_padding_dist, -#ifdef CONFIG_NVM_PBLK_DEBUG - &sys_stats_debug_attr, -#endif - NULL, -}; - -static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "rate_limiter") == 0) - return pblk_sysfs_rate_limiter(pblk, buf); - else if (strcmp(attr->name, "write_luns") == 0) - return pblk_sysfs_luns_show(pblk, buf); - else if (strcmp(attr->name, "gc_state") == 0) - return pblk_sysfs_gc_state_show(pblk, buf); - else if (strcmp(attr->name, "errors") == 0) - return pblk_sysfs_stats(pblk, buf); - else if (strcmp(attr->name, "write_buffer") == 
0) - return pblk_sysfs_write_buffer(pblk, buf); - else if (strcmp(attr->name, "ppa_format") == 0) - return pblk_sysfs_ppaf(pblk, buf); - else if (strcmp(attr->name, "lines") == 0) - return pblk_sysfs_lines(pblk, buf); - else if (strcmp(attr->name, "lines_info") == 0) - return pblk_sysfs_lines_info(pblk, buf); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_get_sec_per_write(pblk, buf); - else if (strcmp(attr->name, "write_amp_mileage") == 0) - return pblk_sysfs_get_write_amp_mileage(pblk, buf); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_get_write_amp_trip(pblk, buf); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_get_padding_dist(pblk, buf); -#ifdef CONFIG_NVM_PBLK_DEBUG - else if (strcmp(attr->name, "stats") == 0) - return pblk_sysfs_stats_debug(pblk, buf); -#endif - return 0; -} - -static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "gc_force") == 0) - return pblk_sysfs_gc_force(pblk, buf, len); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_set_sec_per_write(pblk, buf, len); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_set_write_amp_trip(pblk, buf, len); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_set_padding_dist(pblk, buf, len); - return 0; -} - -static const struct sysfs_ops pblk_sysfs_ops = { - .show = pblk_sysfs_show, - .store = pblk_sysfs_store, -}; - -static struct kobj_type pblk_ktype = { - .sysfs_ops = &pblk_sysfs_ops, - .default_attrs = pblk_attrs, -}; - -int pblk_sysfs_init(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - struct device *parent_dev = disk_to_dev(pblk->disk); - int ret; - - ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, - kobject_get(&parent_dev->kobj), - "%s", "pblk"); - if (ret) { - pblk_err(pblk, "could not register\n"); - return ret; - } - - kobject_uevent(&pblk->kobj, KOBJ_ADD); - return 0; -} - -void pblk_sysfs_exit(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - - kobject_uevent(&pblk->kobj, KOBJ_REMOVE); - kobject_del(&pblk->kobj); - kobject_put(&pblk->kobj); -} diff --git a/drivers/lightnvm/pblk-trace.h b/drivers/lightnvm/pblk-trace.h deleted file mode 100644 index 47b67c6bff7a..000000000000 --- a/drivers/lightnvm/pblk-trace.h +++ /dev/null @@ -1,145 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM pblk - -#if !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PBLK_H - -#include - -struct ppa_addr; - -#define show_chunk_flags(state) __print_flags(state, "", \ - { NVM_CHK_ST_FREE, "FREE", }, \ - { NVM_CHK_ST_CLOSED, "CLOSED", }, \ - { NVM_CHK_ST_OPEN, "OPEN", }, \ - { NVM_CHK_ST_OFFLINE, "OFFLINE", }) - -#define show_line_state(state) __print_symbolic(state, \ - { PBLK_LINESTATE_NEW, "NEW", }, \ - { PBLK_LINESTATE_FREE, "FREE", }, \ - { PBLK_LINESTATE_OPEN, "OPEN", }, \ - { PBLK_LINESTATE_CLOSED, "CLOSED", }, \ - { PBLK_LINESTATE_GC, "GC", }, \ - { PBLK_LINESTATE_BAD, "BAD", }, \ - { PBLK_LINESTATE_CORRUPT, "CORRUPT" }) - - -#define show_pblk_state(state) __print_symbolic(state, \ - { PBLK_STATE_RUNNING, "RUNNING", }, \ - { PBLK_STATE_STOPPING, "STOPPING", }, \ - { PBLK_STATE_RECOVERING, "RECOVERING", }, \ - { PBLK_STATE_STOPPED, "STOPPED" }) - -#define show_chunk_erase_state(state) __print_symbolic(state, \ - { 
PBLK_CHUNK_RESET_START, "START", }, \ - { PBLK_CHUNK_RESET_DONE, "OK", }, \ - { PBLK_CHUNK_RESET_FAILED, "FAILED" }) - - -TRACE_EVENT(pblk_chunk_reset, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_erase_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_chunk_state, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_flags((int)__entry->state)) - -); - -TRACE_EVENT(pblk_line_state, - - TP_PROTO(const char *name, int line, int state), - - TP_ARGS(name, line, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, line) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->line = line; - __entry->state = state; - ), - - TP_printk("dev=%s line=%d state=%s", __get_str(name), - (int)__entry->line, - show_line_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_state, - - TP_PROTO(const char *name, int state), - - TP_ARGS(name, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->state = state; - ), - - TP_printk("dev=%s state=%s", __get_str(name), - show_pblk_state((int)__entry->state)) - -); - -#endif /* !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) */ - -/* This part must be outside protection */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../drivers/lightnvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE pblk-trace -#include diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c deleted file mode 100644 index b9a2aeba95ab..000000000000 --- a/drivers/lightnvm/pblk-write.c +++ /dev/null @@ -1,665 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
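show_line_state() and the other __print_symbolic() tables above are evaluated at trace output time; at bottom each is a value-to-name lookup. A plain C rendering of the line-state table (the values 9..15 match the PBLK_LINESTATE_* enum in pblk.h, removed later in this patch):

#include <stdio.h>

static const char *line_state_name(int state)
{
        switch (state) {
        case 9:  return "NEW";     /* PBLK_LINESTATE_NEW */
        case 10: return "FREE";    /* PBLK_LINESTATE_FREE */
        case 11: return "OPEN";    /* PBLK_LINESTATE_OPEN */
        case 12: return "CLOSED";  /* PBLK_LINESTATE_CLOSED */
        case 13: return "GC";      /* PBLK_LINESTATE_GC */
        case 14: return "BAD";     /* PBLK_LINESTATE_BAD */
        case 15: return "CORRUPT"; /* PBLK_LINESTATE_CORRUPT */
        default: return "UNKNOWN";
        }
}

int main(void)
{
        printf("state 12 = %s\n", line_state_name(12)); /* CLOSED */
        return 0;
}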
- * - * pblk-write.c - pblk's write path from write buffer to media - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct bio *original_bio; - struct pblk_rb *rwb = &pblk->rwb; - unsigned long ret; - int i; - - for (i = 0; i < c_ctx->nr_valid; i++) { - struct pblk_w_ctx *w_ctx; - int pos = c_ctx->sentry + i; - int flags; - - w_ctx = pblk_rb_w_ctx(rwb, pos); - flags = READ_ONCE(w_ctx->flags); - - if (flags & PBLK_FLUSH_ENTRY) { - flags &= ~PBLK_FLUSH_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_dec(&rwb->inflight_flush_point); -#endif - } - - while ((original_bio = bio_list_pop(&w_ctx->bios))) - bio_endio(original_bio); - } - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); -#endif - - ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return ret; -} - -static unsigned long pblk_end_queued_w_bio(struct pblk *pblk, - struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - list_del(&c_ctx->list); - return pblk_end_w_bio(pblk, rqd, c_ctx); -} - -static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *c, *r; - unsigned long flags; - unsigned long pos; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); -#endif - pblk_up_rq(pblk, c_ctx->lun_bitmap); - - pos = pblk_rb_sync_init(&pblk->rwb, &flags); - if (pos == c_ctx->sentry) { - pos = pblk_end_w_bio(pblk, rqd, c_ctx); - -retry: - list_for_each_entry_safe(c, r, &pblk->compl_list, list) { - rqd = nvm_rq_from_c_ctx(c); - if (c->sentry == pos) { - pos = pblk_end_queued_w_bio(pblk, rqd, c); - goto retry; - } - } - } else { - WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd); - list_add_tail(&c_ctx->list, &pblk->compl_list); - } - pblk_rb_sync_end(&pblk->rwb, &flags); -} - -/* Map remaining sectors in chunk, starting from ppa */ -static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa, - int rqd_ppas) -{ - struct pblk_line *line; - struct ppa_addr map_ppa = *ppa; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - __le64 *lba_list; - u64 paddr; - int done = 0; - int n = 0; - - line = pblk_ppa_to_line(pblk, *ppa); - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - - spin_lock(&line->lock); - - while (!done) { - paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); - - if (!test_and_set_bit(paddr, line->map_bitmap)) - line->left_msecs--; - - if (n < rqd_ppas && lba_list[paddr] != addr_empty) - line->nr_valid_lbas--; - - lba_list[paddr] = addr_empty; - - if (!test_and_set_bit(paddr, line->invalid_bitmap)) - le32_add_cpu(line->vsc, -1); - - done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); - - n++; - } - - line->w_err_gc->has_write_err = 1; - spin_unlock(&line->lock); -} - -static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, - unsigned int nr_entries) -{ - struct pblk_rb *rb = &pblk->rwb; - struct pblk_rb_entry *entry; - struct pblk_line *line; - struct pblk_w_ctx *w_ctx; - struct ppa_addr ppa_l2p; - int flags; - unsigned int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_entries; i++) { - entry = &rb->entries[pblk_rb_ptr_wrap(rb, sentry, i)]; - w_ctx = &entry->w_ctx; - - /* Check if the lba has been 
overwritten */ - if (w_ctx->lba != ADDR_EMPTY) { - ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); - if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) - w_ctx->lba = ADDR_EMPTY; - } - - /* Mark up the entry as submittable again */ - flags = READ_ONCE(w_ctx->flags); - flags |= PBLK_WRITTEN_DATA; - /* Release flags on write context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - - /* Decrease the reference count to the line as we will - * re-map these entries - */ - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - } - spin_unlock(&pblk->trans_lock); -} - -static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *r_ctx; - - r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); - if (!r_ctx) - return; - - r_ctx->lun_bitmap = NULL; - r_ctx->sentry = c_ctx->sentry; - r_ctx->nr_valid = c_ctx->nr_valid; - r_ctx->nr_padded = c_ctx->nr_padded; - - spin_lock(&pblk->resubmit_lock); - list_add_tail(&r_ctx->list, &pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); -#endif -} - -static void pblk_submit_rec(struct work_struct *work) -{ - struct pblk_rec_ctx *recovery = - container_of(work, struct pblk_rec_ctx, ws_rec); - struct pblk *pblk = recovery->pblk; - struct nvm_rq *rqd = recovery->rqd; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - pblk_log_write_err(pblk, rqd); - - pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas); - pblk_queue_resubmit(pblk, c_ctx); - - pblk_up_rq(pblk, c_ctx->lun_bitmap); - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - mempool_free(recovery, &pblk->rec_pool); - - atomic_dec(&pblk->inflight_io); - pblk_write_kick(pblk); -} - - -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_rec_ctx *recovery; - - recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pblk_err(pblk, "could not allocate recovery work\n"); - return; - } - - recovery->pblk = pblk; - recovery->rqd = rqd; - - INIT_WORK(&recovery->ws_rec, pblk_submit_rec); - queue_work(pblk->close_wq, &recovery->ws_rec); -} - -static void pblk_end_io_write(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - - if (rqd->error) { - pblk_end_w_fail(pblk, rqd); - return; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif - } - - pblk_complete_write(pblk, rqd, c_ctx); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_write_meta(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); - struct pblk_line *line = m_ctx->private; - struct pblk_emeta *emeta = line->emeta; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int sync; - - pblk_up_chunk(pblk, ppa_list[0]); - - if (rqd->error) { - pblk_log_write_err(pblk, rqd); - pblk_err(pblk, "metadata I/O failed. 
Line %d\n", line->id); - line->w_err_gc->has_write_err = 1; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); - } - - sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); - if (sync == emeta->nr_entries) - pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, - GFP_ATOMIC, pblk->close_wq); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); -} - -static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int nr_secs, nvm_end_io_fn(*end_io)) -{ - /* Setup write request */ - rqd->opcode = NVM_OP_PWRITE; - rqd->nr_ppas = nr_secs; - rqd->is_seq = 1; - rqd->private = pblk; - rqd->end_io = end_io; - - return pblk_alloc_rqd_meta(pblk, rqd); -} - -static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr *erase_ppa) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - unsigned int valid = c_ctx->nr_valid; - unsigned int padded = c_ctx->nr_padded; - unsigned int nr_secs = valid + padded; - unsigned long *lun_bitmap; - int ret; - - lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) - return -ENOMEM; - c_ctx->lun_bitmap = lun_bitmap; - - ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); - if (ret) { - kfree(lun_bitmap); - return ret; - } - - if (likely(!e_line || !atomic_read(&e_line->left_eblks))) - ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, 0); - else - ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, erase_ppa); - - return ret; -} - -static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, - unsigned int secs_to_flush) -{ - int secs_to_sync; - - secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if ((!secs_to_sync && secs_to_flush) - || (secs_to_sync < 0) - || (secs_to_sync > secs_avail && !secs_to_flush)) { - pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", - secs_avail, secs_to_sync, secs_to_flush); - } -#endif - - return secs_to_sync; -} - -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = meta_line->emeta; - struct ppa_addr *ppa_list; - struct pblk_g_ctx *m_ctx; - struct nvm_rq *rqd; - void *data; - u64 paddr; - int rq_ppas = pblk->min_write_pgs; - int id = meta_line->id; - int rq_len; - int i, j; - int ret; - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - m_ctx = nvm_rq_to_pdu(rqd); - m_ctx->private = meta_line; - - rq_len = rq_ppas * geo->csecs; - data = ((void *)emeta->buf) + emeta->mem; - - ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); - if (ret) - goto fail_free_rqd; - - ppa_list = nvm_rq_to_ppa_list(rqd); - for (i = 0; i < rqd->nr_ppas; ) { - spin_lock(&meta_line->lock); - paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); - spin_unlock(&meta_line->lock); - for (j = 0; j < rq_ppas; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); - } - - spin_lock(&l_mg->close_lock); - emeta->mem += rq_len; - if (emeta->mem >= lm->emeta_len[0]) - list_del(&meta_line->list); - spin_unlock(&l_mg->close_lock); - - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto 
fail_rollback; - } - - return NVM_IO_OK; - -fail_rollback: - pblk_up_chunk(pblk, ppa_list[0]); - spin_lock(&l_mg->close_lock); - pblk_dealloc_page(pblk, meta_line, rq_ppas); - list_add(&meta_line->list, &meta_line->list); - spin_unlock(&l_mg->close_lock); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - return ret; -} - -static inline bool pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct nvm_rq *data_rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); - struct pblk_line *data_line = pblk_line_get_data(pblk); - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int pos_opt; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result in metadata and data I/Os colliding. In - * this case, modify the distance to not be optimal, but move the - * optimal in the right direction. - */ - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - pos_opt = pblk_ppa_to_pos(geo, ppa_opt); - - if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || - test_bit(pos_opt, data_line->blk_bitmap)) - return true; - - if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) - data_line->meta_distance--; - - return false; -} - -static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, - struct nvm_rq *data_rqd) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *meta_line; - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (meta_line->emeta->mem >= lm->emeta_len[0]) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - spin_unlock(&l_mg->close_lock); - - if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) - return NULL; - - return meta_line; -} - -static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr erase_ppa; - struct pblk_line *meta_line; - int err; - - pblk_ppa_set_empty(&erase_ppa); - - /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); - if (err) { - pblk_err(pblk, "could not setup write request: %d\n", err); - return NVM_IO_ERR; - } - - meta_line = pblk_should_submit_meta_io(pblk, rqd); - - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - pblk_err(pblk, "data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - - if (!pblk_ppa_empty(erase_ppa)) { - /* Submit erase for next data line */ - if (pblk_blk_erase_async(pblk, erase_ppa)) { - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int bit; - - atomic_inc(&e_line->left_eblks); - bit = pblk_ppa_to_pos(geo, erase_ppa); - WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); - } - } - - if (meta_line) { - /* Submit metadata write for previous data line */ - err = pblk_submit_meta_io(pblk, meta_line); - if (err) { - pblk_err(pblk, "metadata I/O submission failed: %d", - err); - return NVM_IO_ERR; - } - } - - return NVM_IO_OK; -} - -static void 
pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, - c_ctx->nr_padded); -} - -static int pblk_submit_write(struct pblk *pblk, int *secs_left) -{ - struct bio *bio; - struct nvm_rq *rqd; - unsigned int secs_avail, secs_to_sync, secs_to_com; - unsigned int secs_to_flush, packed_meta_pgs; - unsigned long pos; - unsigned int resubmit; - - *secs_left = 0; - - spin_lock(&pblk->resubmit_lock); - resubmit = !list_empty(&pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - - /* Resubmit failed writes first */ - if (resubmit) { - struct pblk_c_ctx *r_ctx; - - spin_lock(&pblk->resubmit_lock); - r_ctx = list_first_entry(&pblk->resubmit_list, - struct pblk_c_ctx, list); - list_del(&r_ctx->list); - spin_unlock(&pblk->resubmit_lock); - - secs_avail = r_ctx->nr_valid; - pos = r_ctx->sentry; - - pblk_prepare_resubmit(pblk, pos, secs_avail); - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_avail); - - kfree(r_ctx); - } else { - /* If there are no sectors in the cache, - * flushes (bios without data) will be cleared on - * the cache threads - */ - secs_avail = pblk_rb_read_count(&pblk->rwb); - if (!secs_avail) - return 0; - - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data) - return 0; - - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_to_flush); - if (secs_to_sync > pblk->max_write_pgs) { - pblk_err(pblk, "bad buffer sync calculation\n"); - return 0; - } - - secs_to_com = (secs_to_sync > secs_avail) ? - secs_avail : secs_to_sync; - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - } - - packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data); - bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); - rqd->bio = bio; - - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, - secs_avail)) { - pblk_err(pblk, "corrupted write bio\n"); - goto fail_put_bio; - } - - if (pblk_submit_io_set(pblk, rqd)) - goto fail_free_bio; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(secs_to_sync, &pblk->sub_writes); -#endif - - *secs_left = 1; - return 0; - -fail_free_bio: - pblk_free_write_rqd(pblk, rqd); -fail_put_bio: - bio_put(bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return -EINTR; -} - -int pblk_write_ts(void *data) -{ - struct pblk *pblk = data; - int secs_left; - int write_failure = 0; - - while (!kthread_should_stop()) { - if (!write_failure) { - write_failure = pblk_submit_write(pblk, &secs_left); - - if (secs_left) - continue; - } - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h deleted file mode 100644 index 86ffa875bfe1..000000000000 --- a/drivers/lightnvm/pblk.h +++ /dev/null @@ -1,1358 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.h) - * Copyright (C) 2016 CNEX Labs - * Initial release: Matias Bjorling - * Write buffering: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
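pblk_submit_write() above gates admission on buffer occupancy: without a flush point it waits until at least a minimal write unit is buffered, then commits min(secs_to_sync, secs_avail). A toy model of that gate, with made-up numbers (min_pgs stands in for min_write_pgs_data):

#include <stdio.h>

/* Returns 1 and sets *out when a write should be issued now. */
static int pick_commit(unsigned avail, unsigned flush, unsigned min_pgs,
                       unsigned sync_target, unsigned *out)
{
        if (!avail)
                return 0;               /* nothing buffered */
        if (!flush && avail < min_pgs)
                return 0;               /* wait for a full write unit */
        *out = sync_target < avail ? sync_target : avail;
        return 1;
}

int main(void)
{
        unsigned n;

        if (pick_commit(6, 0, 8, 64, &n))
                printf("commit %u\n", n);
        else
                printf("wait\n");       /* 6 < 8 and no flush: wait */
        if (pick_commit(6, 1, 8, 64, &n))
                printf("commit %u\n", n); /* a flush point forces out 6 */
        return 0;
}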
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a Physical Block-device target for Open-channel SSDs. - * - */ - -#ifndef PBLK_H_ -#define PBLK_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* Run only GC if less than 1/X blocks are free */ -#define GC_LIMIT_INVERSE 5 -#define GC_TIME_MSECS 1000 - -#define PBLK_SECTOR (512) -#define PBLK_EXPOSED_PAGE_SIZE (4096) - -#define PBLK_NR_CLOSE_JOBS (4) - -#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) - -/* Max 512 LUNs per device */ -#define PBLK_MAX_LUNS_BITMAP (4) - -#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) - -/* Static pool sizes */ -#define PBLK_GEN_WS_POOL_SIZE (2) - -#define PBLK_DEFAULT_OP (11) - -enum { - PBLK_READ = READ, - PBLK_WRITE = WRITE,/* Write from write buffer */ - PBLK_WRITE_INT, /* Internal write - no write buffer */ - PBLK_READ_RECOV, /* Recovery read - errors allowed */ - PBLK_ERASE, -}; - -enum { - /* IO Types */ - PBLK_IOTYPE_USER = 1 << 0, - PBLK_IOTYPE_GC = 1 << 1, - - /* Write buffer flags */ - PBLK_FLUSH_ENTRY = 1 << 2, - PBLK_WRITTEN_DATA = 1 << 3, - PBLK_SUBMITTED_ENTRY = 1 << 4, - PBLK_WRITABLE_ENTRY = 1 << 5, -}; - -enum { - PBLK_BLK_ST_OPEN = 0x1, - PBLK_BLK_ST_CLOSED = 0x2, -}; - -enum { - PBLK_CHUNK_RESET_START, - PBLK_CHUNK_RESET_DONE, - PBLK_CHUNK_RESET_FAILED, -}; - -struct pblk_sec_meta { - u64 reserved; - __le64 lba; -}; - -/* The number of GC lists and the rate-limiter states go together. This way the - * rate-limiter can dictate how much GC is needed based on resource utilization. - */ -#define PBLK_GC_NR_LISTS 4 - -enum { - PBLK_RL_OFF = 0, - PBLK_RL_WERR = 1, - PBLK_RL_HIGH = 2, - PBLK_RL_MID = 3, - PBLK_RL_LOW = 4 -}; - -#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) - -/* write buffer completion context */ -struct pblk_c_ctx { - struct list_head list; /* Head for out-of-order completion */ - - unsigned long *lun_bitmap; /* Luns used on current request */ - unsigned int sentry; - unsigned int nr_valid; - unsigned int nr_padded; -}; - -/* read context */ -struct pblk_g_ctx { - void *private; - unsigned long start_time; - u64 lba; -}; - -/* Pad context */ -struct pblk_pad_rq { - struct pblk *pblk; - struct completion wait; - struct kref ref; -}; - -/* Recovery context */ -struct pblk_rec_ctx { - struct pblk *pblk; - struct nvm_rq *rqd; - struct work_struct ws_rec; -}; - -/* Write context */ -struct pblk_w_ctx { - struct bio_list bios; /* Original bios - used for completion - * in REQ_FUA, REQ_FLUSH case - */ - u64 lba; /* Logic addr. associated with entry */ - struct ppa_addr ppa; /* Physic addr. 
associated with entry */ - int flags; /* Write context flags */ -}; - -struct pblk_rb_entry { - struct ppa_addr cacheline; /* Cacheline for this entry */ - void *data; /* Pointer to data on this entry */ - struct pblk_w_ctx w_ctx; /* Context for this entry */ - struct list_head index; /* List head to enable indexes */ -}; - -#define EMPTY_ENTRY (~0U) - -struct pblk_rb_pages { - struct page *pages; - int order; - struct list_head list; -}; - -struct pblk_rb { - struct pblk_rb_entry *entries; /* Ring buffer entries */ - unsigned int mem; /* Write offset - points to next - * writable entry in memory - */ - unsigned int subm; /* Read offset - points to last entry - * that has been submitted to the media - * to be persisted - */ - unsigned int sync; /* Synced - backpointer that signals - * the last submitted entry that has - * been successfully persisted to media - */ - unsigned int flush_point; /* Sync point - last entry that must be - * flushed to the media. Used with - * REQ_FLUSH and REQ_FUA - */ - unsigned int l2p_update; /* l2p update point - next entry for - * which l2p mapping will be updated to - * contain a device ppa address (instead - * of a cacheline - */ - unsigned int nr_entries; /* Number of entries in write buffer - - * must be a power of two - */ - unsigned int seg_size; /* Size of the data segments being - * stored on each entry. Typically this - * will be 4KB - */ - - unsigned int back_thres; /* Threshold that shall be maintained by - * the backpointer in order to respect - * geo->mw_cunits on a per chunk basis - */ - - struct list_head pages; /* List of data pages */ - - spinlock_t w_lock; /* Write lock */ - spinlock_t s_lock; /* Sync lock */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ -#endif -}; - -#define PBLK_RECOVERY_SECTORS 16 - -struct pblk_lun { - struct ppa_addr bppa; - struct semaphore wr_sem; -}; - -struct pblk_gc_rq { - struct pblk_line *line; - void *data; - u64 paddr_list[NVM_MAX_VLBA]; - u64 lba_list[NVM_MAX_VLBA]; - int nr_secs; - int secs_to_gc; - struct list_head list; -}; - -struct pblk_gc { - /* These states are not protected by a lock since (i) they are in the - * fast path, and (ii) they are not critical. - */ - int gc_active; - int gc_enabled; - int gc_forced; - - struct task_struct *gc_ts; - struct task_struct *gc_writer_ts; - struct task_struct *gc_reader_ts; - - struct workqueue_struct *gc_line_reader_wq; - struct workqueue_struct *gc_reader_wq; - - struct timer_list gc_timer; - - struct semaphore gc_sem; - atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ - atomic_t pipeline_gc; /* Number of lines in the GC pipeline - - * started reads to finished writes - */ - int w_entries; - - struct list_head w_list; - struct list_head r_list; - - spinlock_t lock; - spinlock_t w_lock; - spinlock_t r_lock; -}; - -struct pblk_rl { - unsigned int high; /* Upper threshold for rate limiter (free run - - * user I/O rate limiter - */ - unsigned int high_pw; /* High rounded up as a power of 2 */ - -#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ -#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ - - int rb_windows_pw; /* Number of rate windows in the write buffer - * given as a power-of-2. This guarantees that - * when user I/O is being rate limited, there - * will be reserved enough space for the GC to - * place its payload. A window is of - * pblk->max_write_pgs size, which in NVMe is - * 64, i.e., 256kb. 
- */ - int rb_budget; /* Total number of entries available for I/O */ - int rb_user_max; /* Max buffer entries available for user I/O */ - int rb_gc_max; /* Max buffer entries available for GC I/O */ - int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ - int rb_state; /* Rate-limiter current state */ - int rb_max_io; /* Maximum size for an I/O giving the config */ - - atomic_t rb_user_cnt; /* User I/O buffer counter */ - atomic_t rb_gc_cnt; /* GC I/O buffer counter */ - atomic_t rb_space; /* Space limit in case of reaching capacity */ - - int rsv_blocks; /* Reserved blocks for GC */ - - int rb_user_active; - int rb_gc_active; - - atomic_t werr_lines; /* Number of write error lines that needs gc */ - - struct timer_list u_timer; - - unsigned long total_blocks; - - atomic_t free_blocks; /* Total number of free blocks (+ OP) */ - atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ -}; - -#define PBLK_LINE_EMPTY (~0U) - -enum { - /* Line Types */ - PBLK_LINETYPE_FREE = 0, - PBLK_LINETYPE_LOG = 1, - PBLK_LINETYPE_DATA = 2, - - /* Line state */ - PBLK_LINESTATE_NEW = 9, - PBLK_LINESTATE_FREE = 10, - PBLK_LINESTATE_OPEN = 11, - PBLK_LINESTATE_CLOSED = 12, - PBLK_LINESTATE_GC = 13, - PBLK_LINESTATE_BAD = 14, - PBLK_LINESTATE_CORRUPT = 15, - - /* GC group */ - PBLK_LINEGC_NONE = 20, - PBLK_LINEGC_EMPTY = 21, - PBLK_LINEGC_LOW = 22, - PBLK_LINEGC_MID = 23, - PBLK_LINEGC_HIGH = 24, - PBLK_LINEGC_FULL = 25, - PBLK_LINEGC_WERR = 26 -}; - -#define PBLK_MAGIC 0x70626c6b /*pblk*/ - -/* emeta/smeta persistent storage format versions: - * Changes in major version requires offline migration. - * Changes in minor version are handled automatically during - * recovery. - */ - -#define SMETA_VERSION_MAJOR (0) -#define SMETA_VERSION_MINOR (1) - -#define EMETA_VERSION_MAJOR (0) -#define EMETA_VERSION_MINOR (2) - -struct line_header { - __le32 crc; - __le32 identifier; /* pblk identifier */ - __u8 uuid[16]; /* instance uuid */ - __le16 type; /* line type */ - __u8 version_major; /* version major */ - __u8 version_minor; /* version minor */ - __le32 id; /* line id for current line */ -}; - -struct line_smeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - /* Previous line metadata */ - __le32 prev_id; /* Line id for previous line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - __le32 rsvd[2]; - - __le64 lun_bitmap[]; -}; - - -/* - * Metadata layout in media: - * First sector: - * 1. struct line_emeta - * 2. bad block bitmap (u64 * window_wr_lun) - * 3. write amplification counters - * Mid sectors (start at lbas_sector): - * 3. nr_lbas (u64) forming lba list - * Last sectors (start at vsc_sector): - * 4. 
u32 valid sector count (vsc) for all lines (~0U: free line) - */ -struct line_emeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - - /* Previous line metadata */ - __le32 prev_id; /* Line id for prev line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - /* Bookkeeping for recovery */ - __le32 next_id; /* Line id for next line */ - __le64 nr_lbas; /* Number of lbas mapped in line */ - __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ - __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ -}; - - -/* Write amplification counters stored on media */ -struct wa_counters { - __le64 user; /* Number of user written sectors */ - __le64 gc; /* Number of sectors written by GC*/ - __le64 pad; /* Number of padded sectors */ -}; - -struct pblk_emeta { - struct line_emeta *buf; /* emeta buffer in media format */ - int mem; /* Write offset - points to next - * writable entry in memory - */ - atomic_t sync; /* Synced - backpointer that signals the - * last entry that has been successfully - * persisted to media - */ - unsigned int nr_entries; /* Number of emeta entries */ -}; - -struct pblk_smeta { - struct line_smeta *buf; /* smeta buffer in persistent format */ -}; - -struct pblk_w_err_gc { - int has_write_err; - int has_gc_err; - __le64 *lba_list; -}; - -struct pblk_line { - struct pblk *pblk; - unsigned int id; /* Line number corresponds to the - * block line - */ - unsigned int seq_nr; /* Unique line sequence number */ - - int state; /* PBLK_LINESTATE_X */ - int type; /* PBLK_LINETYPE_X */ - int gc_group; /* PBLK_LINEGC_X */ - struct list_head list; /* Free, GC lists */ - - unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ - - struct nvm_chk_meta *chks; /* Chunks forming line */ - - struct pblk_smeta *smeta; /* Start metadata */ - struct pblk_emeta *emeta; /* End medatada */ - - int meta_line; /* Metadata line id */ - int meta_distance; /* Distance between data and metadata */ - - u64 emeta_ssec; /* Sector where emeta starts */ - - unsigned int sec_in_line; /* Number of usable secs in line */ - - atomic_t blk_in_line; /* Number of good blocks in line */ - unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */ - unsigned long *erase_bitmap; /* Bitmap for erased blocks */ - - unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */ - unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */ - - atomic_t left_eblks; /* Blocks left for erasing */ - atomic_t left_seblks; /* Blocks left for sync erasing */ - - int left_msecs; /* Sectors left for mapping */ - unsigned int cur_sec; /* Sector map pointer */ - unsigned int nr_valid_lbas; /* Number of valid lbas in line */ - - __le32 *vsc; /* Valid sector count in line */ - - struct kref ref; /* Write buffer L2P references */ - atomic_t sec_to_update; /* Outstanding L2P updates to ppa */ - - struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ - - spinlock_t lock; /* Necessary for invalid_bitmap only */ -}; - -#define PBLK_DATA_LINES 4 - -enum { - PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ - PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ - PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ -}; - -struct pblk_line_mgmt { - int nr_lines; /* Total number of full lines */ - int nr_free_lines; /* Number of full lines in free list */ - - /* Free lists - use free_lock */ - 
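- /* A line cycles: free_list -> open data/log line -> closed -> one of
- * the gc_* lists below -> erased -> free_list again; which GC list it
- * lands on is decided by pblk_line_gc_list(). */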
struct list_head free_list; /* Full lines ready to use */ - struct list_head corrupt_list; /* Full lines corrupted */ - struct list_head bad_list; /* Full lines bad */ - - /* GC lists - use gc_lock */ - struct list_head *gc_lists[PBLK_GC_NR_LISTS]; - struct list_head gc_high_list; /* Full lines ready to GC, high isc */ - struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ - struct list_head gc_low_list; /* Full lines ready to GC, low isc */ - - struct list_head gc_werr_list; /* Write err recovery list */ - - struct list_head gc_full_list; /* Full lines ready to GC, no valid */ - struct list_head gc_empty_list; /* Full lines close, all valid */ - - struct pblk_line *log_line; /* Current FTL log line */ - struct pblk_line *data_line; /* Current data line */ - struct pblk_line *log_next; /* Next FTL log line */ - struct pblk_line *data_next; /* Next data line */ - - struct list_head emeta_list; /* Lines queued to schedule emeta */ - - __le32 *vsc_list; /* Valid sector counts for all lines */ - - /* Pre-allocated metadata for data lines */ - struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; - struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; - unsigned long meta_bitmap; - - /* Cache and mempool for map/invalid bitmaps */ - struct kmem_cache *bitmap_cache; - mempool_t *bitmap_pool; - - /* Helpers for fast bitmap calculations */ - unsigned long *bb_template; - unsigned long *bb_aux; - - unsigned long d_seq_nr; /* Data line unique sequence number */ - unsigned long l_seq_nr; /* Log line unique sequence number */ - - spinlock_t free_lock; - spinlock_t close_lock; - spinlock_t gc_lock; -}; - -struct pblk_line_meta { - unsigned int smeta_len; /* Total length for smeta */ - unsigned int smeta_sec; /* Sectors needed for smeta */ - - unsigned int emeta_len[4]; /* Lengths for emeta: - * [0]: Total - * [1]: struct line_emeta + - * bb_bitmap + struct wa_counters - * [2]: L2P portion - * [3]: vsc - */ - unsigned int emeta_sec[4]; /* Sectors needed for emeta. Same layout - * as emeta_len - */ - - unsigned int emeta_bb; /* Boundary for bb that affects emeta */ - - unsigned int vsc_list_len; /* Length for vsc list */ - unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ - unsigned int blk_bitmap_len; /* Length for block bitmap in line */ - unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ - - unsigned int blk_per_line; /* Number of blocks in a full line */ - unsigned int sec_per_line; /* Number of sectors in a line */ - unsigned int dsec_per_line; /* Number of data sectors in a line */ - unsigned int min_blk_line; /* Min. 
number of good blocks in line */ - - unsigned int mid_thrs; /* Threshold for GC mid list */ - unsigned int high_thrs; /* Threshold for GC high list */ - - unsigned int meta_distance; /* Distance between data and metadata */ -}; - -enum { - PBLK_STATE_RUNNING = 0, - PBLK_STATE_STOPPING = 1, - PBLK_STATE_RECOVERING = 2, - PBLK_STATE_STOPPED = 3, -}; - -/* Internal format to support not power-of-2 device formats */ -struct pblk_addrf { - /* gen to dev */ - int sec_stripe; - int ch_stripe; - int lun_stripe; - - /* dev to gen */ - int sec_lun_stripe; - int sec_ws_stripe; -}; - -struct pblk { - struct nvm_tgt_dev *dev; - struct gendisk *disk; - - struct kobject kobj; - - struct pblk_lun *luns; - - struct pblk_line *lines; /* Line array */ - struct pblk_line_mgmt l_mg; /* Line management */ - struct pblk_line_meta lm; /* Line metadata */ - - struct nvm_addrf addrf; /* Aligned address format */ - struct pblk_addrf uaddrf; /* Unaligned address format */ - int addrf_len; - - struct pblk_rb rwb; - - int state; /* pblk line state */ - - int min_write_pgs; /* Minimum amount of pages required by controller */ - int min_write_pgs_data; /* Minimum amount of payload pages */ - int max_write_pgs; /* Maximum amount of pages supported by controller */ - int oob_meta_size; /* Size of OOB sector metadata */ - - sector_t capacity; /* Device capacity when bad blocks are subtracted */ - - int op; /* Percentage of device used for over-provisioning */ - int op_blks; /* Number of blocks used for over-provisioning */ - - /* pblk provisioning values. Used by rate limiter */ - struct pblk_rl rl; - - int sec_per_write; - - guid_t instance_uuid; - - /* Persistent write amplification counters, 4kb sector I/Os */ - atomic64_t user_wa; /* Sectors written by user */ - atomic64_t gc_wa; /* Sectors written by GC */ - atomic64_t pad_wa; /* Padded sectors written */ - - /* Reset values for delta write amplification measurements */ - u64 user_rst_wa; - u64 gc_rst_wa; - u64 pad_rst_wa; - - /* Counters used for calculating padding distribution */ - atomic64_t *pad_dist; /* Padding distribution buckets */ - u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ - atomic64_t nr_flush; /* Number of flush/fua I/O */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Non-persistent debug counters, 4kb sector I/Os */ - atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ - atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ - atomic_long_t padded_wb; /* Sectors padded in write buffer */ - atomic_long_t req_writes; /* Sectors stored on write buffer */ - atomic_long_t sub_writes; /* Sectors submitted from buffer */ - atomic_long_t sync_writes; /* Sectors synced to media */ - atomic_long_t inflight_reads; /* Inflight sector read requests */ - atomic_long_t cache_reads; /* Read requests that hit the cache */ - atomic_long_t sync_reads; /* Completed sector read requests */ - atomic_long_t recov_writes; /* Sectors submitted from recovery */ - atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ - atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */ -#endif - - spinlock_t lock; - - atomic_long_t read_failed; - atomic_long_t read_empty; - atomic_long_t read_high_ecc; - atomic_long_t read_failed_gc; - atomic_long_t write_failed; - atomic_long_t erase_failed; - - atomic_t inflight_io; /* General inflight I/O counter */ - - struct task_struct *writer_ts; - - /* Simple translation map of logical addresses to physical addresses. 
- * The logical addresses is known by the host system, while the physical - * addresses are used when writing to the disk block device. - */ - unsigned char *trans_map; - spinlock_t trans_lock; - - struct list_head compl_list; - - spinlock_t resubmit_lock; /* Resubmit list lock */ - struct list_head resubmit_list; /* Resubmit list for failed writes*/ - - mempool_t page_bio_pool; - mempool_t gen_ws_pool; - mempool_t rec_pool; - mempool_t r_rq_pool; - mempool_t w_rq_pool; - mempool_t e_rq_pool; - - struct workqueue_struct *close_wq; - struct workqueue_struct *bb_wq; - struct workqueue_struct *r_end_wq; - - struct timer_list wtimer; - - struct pblk_gc gc; -}; - -struct pblk_line_ws { - struct pblk *pblk; - struct pblk_line *line; - void *priv; - struct work_struct ws; -}; - -#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) -#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) - -#define pblk_err(pblk, fmt, ...) \ - pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_info(pblk, fmt, ...) \ - pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_warn(pblk, fmt, ...) \ - pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_debug(pblk, fmt, ...) \ - pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) - -/* - * pblk ring buffer operations - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_sz); -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos); -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos); -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int pos); -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int pos); -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); -void pblk_rb_flush(struct pblk_rb *rb); - -void pblk_rb_sync_l2p(struct pblk_rb *rb); -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count); -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa); -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries); -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); - -unsigned int pblk_rb_read_count(struct pblk_rb *rb); -unsigned int pblk_rb_sync_count(struct pblk_rb *rb); -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); - -int pblk_rb_tear_down_check(struct pblk_rb *rb); -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos); -void pblk_rb_free(struct pblk_rb *rb); -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); - -/* - * pblk core - */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); -int 
pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx); -void pblk_discard(struct pblk *pblk, struct bio *bio); -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk); -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *lp, - struct ppa_addr ppa); -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd); -struct pblk_line *pblk_line_get(struct pblk *pblk); -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -struct pblk_line *pblk_line_replace_data(struct pblk *pblk); -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa); -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); -struct pblk_line *pblk_line_get_data(struct pblk *pblk); -struct pblk_line *pblk_line_get_erase(struct pblk *pblk); -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); -int pblk_line_is_full(struct pblk_line *line); -void pblk_line_free(struct pblk_line *line); -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_ws(struct work_struct *work); -void pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_flush(struct pblk *pblk); -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq); -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line); -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf); -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); -void pblk_line_put(struct kref *ref); -void pblk_line_put_wq(struct kref *ref); -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta); -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap); -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap); -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages); -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages); -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr); -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); -void 
pblk_update_map_cache(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa); -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa, struct ppa_addr entry_line); -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line, u64 paddr); -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs); -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache); -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd); - -/* - * pblk user I/O write path - */ -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); - -/* - * pblk map - */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa); -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off); - -/* - * pblk write thread - */ -int pblk_write_ts(void *data); -void pblk_write_timer_fn(struct timer_list *t); -void pblk_write_should_kick(struct pblk *pblk); -void pblk_write_kick(struct pblk *pblk); - -/* - * pblk read path - */ -extern struct bio_set pblk_bio_set; -void pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); -/* - * pblk recovery - */ -struct pblk_line *pblk_recov_l2p(struct pblk *pblk); -int pblk_recov_pad(struct pblk *pblk); -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); - -/* - * pblk gc - */ -#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ -#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ - -int pblk_gc_init(struct pblk *pblk); -void pblk_gc_exit(struct pblk *pblk, bool graceful); -void pblk_gc_should_start(struct pblk *pblk); -void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_free_full_lines(struct pblk *pblk); -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active); -int pblk_gc_sysfs_force(struct pblk *pblk, int force); -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line); - -/* - * pblk rate limiter - */ -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold); -void pblk_rl_free(struct pblk_rl *rl); -void pblk_rl_update_rates(struct pblk_rl *rl); -int pblk_rl_high_thrs(struct pblk_rl *rl); -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_max_io(struct pblk_rl *rl); -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used); -int pblk_rl_is_limit(struct pblk_rl *rl); - -void pblk_rl_werr_line_in(struct pblk_rl *rl); 
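-/* pblk_rl_werr_line_in()/_out() track rl->werr_lines; while it is non-zero
- * the rate limiter runs in the PBLK_RL_WERR state so GC gets the budget it
- * needs to rewrite lines that hit write errors.
- */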
-void pblk_rl_werr_line_out(struct pblk_rl *rl); - -/* - * pblk sysfs - */ -int pblk_sysfs_init(struct gendisk *tdisk); -void pblk_sysfs_exit(struct gendisk *tdisk); - -static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) -{ - return c_ctx - sizeof(struct nvm_rq); -} - -static inline void *emeta_to_bb(struct line_emeta *emeta) -{ - return emeta->bb_bitmap; -} - -static inline void *emeta_to_wa(struct pblk_line_meta *lm, - struct line_emeta *emeta) -{ - return emeta->bb_bitmap + lm->blk_bitmap_len; -} - -static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta) -{ - return ((void *)emeta + pblk->lm.emeta_len[1]); -} - -static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) -{ - return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); -} - -static inline int pblk_line_vsc(struct pblk_line *line) -{ - return le32_to_cpu(*line->vsc); -} - -static inline int pblk_ppa_to_line_id(struct ppa_addr p) -{ - return p.a.blk; -} - -static inline struct pblk_line *pblk_ppa_to_line(struct pblk *pblk, - struct ppa_addr p) -{ - return &pblk->lines[pblk_ppa_to_line_id(p)]; -} - -static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) -{ - return p.a.lun * geo->num_ch + p.a.ch; -} - -static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, - u64 line_id) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - ppa.ppa = 0; - ppa.g.blk = line_id; - ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset; - ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset; - ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset; - ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset; - ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - int secs, chnls, luns; - - ppa.ppa = 0; - - ppa.m.chk = line_id; - - paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs); - ppa.m.sec = secs; - - paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls); - ppa.m.grp = chnls; - - paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns); - ppa.m.pu = luns; - - ppa.m.sec += uaddrf->sec_stripe * paddr; - } - - return ppa; -} - -static inline struct nvm_chk_meta *pblk_dev_ppa_to_chunk(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *line = pblk_ppa_to_line(pblk, p); - int pos = pblk_ppa_to_pos(geo, p); - - return &line->chks[pos]; -} - -static inline u64 pblk_dev_ppa_to_chunk_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return dev_to_chunk_addr(dev->parent, &pblk->addrf, p); -} - -static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - u64 paddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - paddr = (u64)p.g.ch << ppaf->ch_offset; - paddr |= (u64)p.g.lun << ppaf->lun_offset; - paddr |= (u64)p.g.pg << ppaf->pg_offset; - paddr |= (u64)p.g.pl << ppaf->pln_offset; - paddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - u64 secs = p.m.sec; - int sec_stripe; - - paddr = (u64)p.m.grp * uaddrf->sec_stripe; - paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe; - - secs = div_u64_rem(secs, 
uaddrf->sec_stripe, &sec_stripe); - paddr += secs * uaddrf->sec_ws_stripe; - paddr += sec_stripe; - } - - return paddr; -} - -static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa32_to_ppa64(dev->parent, &pblk->addrf, ppa32); -} - -static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa64_to_ppa32(dev->parent, &pblk->addrf, ppa64); -} - -static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, - sector_t lba) -{ - struct ppa_addr ppa; - - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); - } else { - struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; - - ppa = map[lba]; - } - - return ppa; -} - -static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa) -{ - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); - } else { - u64 *map = (u64 *)pblk->trans_map; - - map[lba] = ppa.ppa; - } -} - -static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) -{ - return (lppa.ppa == rppa.ppa); -} - -static inline int pblk_addr_in_cache(struct ppa_addr ppa) -{ - return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); -} - -static inline int pblk_addr_to_cacheline(struct ppa_addr ppa) -{ - return ppa.c.line; -} - -static inline struct ppa_addr pblk_cacheline_to_addr(int addr) -{ - struct ppa_addr p; - - p.c.line = addr; - p.c.is_cached = 1; - - return p; -} - -static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, - struct line_header *header) -{ - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_smeta_crc(struct pblk *pblk, - struct line_smeta *smeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)smeta + - sizeof(struct line_header) + sizeof(crc), - lm->smeta_len - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)emeta + - sizeof(struct line_header) + sizeof(crc), - lm->emeta_len[0] - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) -{ - return !(nr_secs % pblk->min_write_pgs); -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, - char *msg, int error) -{ - struct nvm_geo *geo = &pblk->dev->geo; - - if (p->c.is_cached) { - pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", - msg, error, (u64)p->c.line); - } else if (geo->version == NVM_OCSSD_SPEC_12) { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", - msg, error, - p->g.ch, p->g.lun, p->g.blk, - p->g.pg, p->g.pl, p->g.sec); - } else { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", - msg, error, - p->m.grp, p->m.pu, p->m.chk, p->m.sec); - } -} - -static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq 
*rqd, - int error) -{ - int bit = -1; - - if (rqd->nr_ppas == 1) { - print_ppa(pblk, &rqd->ppa_addr, "rqd", error); - return; - } - - while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, - bit + 1)) < rqd->nr_ppas) { - print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); - } - - pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); -} - -static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_geo *geo = &tgt_dev->geo; - struct ppa_addr *ppa; - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa = &ppas[i]; - - if (geo->version == NVM_OCSSD_SPEC_12) { - if (!ppa->c.is_cached && - ppa->g.ch < geo->num_ch && - ppa->g.lun < geo->num_lun && - ppa->g.pl < geo->num_pln && - ppa->g.blk < geo->num_chk && - ppa->g.pg < geo->num_pg && - ppa->g.sec < geo->ws_min) - continue; - } else { - if (!ppa->c.is_cached && - ppa->m.grp < geo->num_ch && - ppa->m.pu < geo->num_lun && - ppa->m.chk < geo->num_chk && - ppa->m.sec < geo->clba) - continue; - } - - print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); - - return 1; - } - return 0; -} - -static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } - - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - line = pblk_ppa_to_line(pblk, ppa_list[i]); - - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pblk_err(pblk, "bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } - - return 0; -} -#endif - -static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - - if (paddr > lm->sec_per_line) - return 1; - - return 0; -} - -static inline unsigned int pblk_get_bi_idx(struct bio *bio) -{ - return bio->bi_iter.bi_idx; -} - -static inline sector_t pblk_get_lba(struct bio *bio) -{ - return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; -} - -static inline unsigned int pblk_get_secs(struct bio *bio) -{ - return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; -} - -static inline char *pblk_disk_name(struct pblk *pblk) -{ - struct gendisk *disk = pblk->disk; - - return disk->disk_name; -} - -static inline unsigned int pblk_get_min_chks(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - /* In a worst-case scenario every line will have OP invalid sectors. 
- * We will then need a minimum of 1/OP lines to free up a single line
- */
-
- return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line;
-}
-
-static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk,
- void *meta, int index)
-{
- return meta +
- max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
- * index;
-}
-
-static inline int pblk_dma_meta_size(struct pblk *pblk)
-{
- return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
- * NVM_MAX_VLBA;
-}
-
-static inline int pblk_is_oob_meta_supported(struct pblk *pblk)
-{
- return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta);
-}
-#endif /* PBLK_H_ */
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index cbc509784b2e..dfaacd472e5d 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -12,7 +12,6 @@ obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
 nvme-core-y := core.o ioctl.o
 nvme-core-$(CONFIG_TRACING) += trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
-nvme-core-$(CONFIG_NVM) += lightnvm.o
 nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dfd9dec0c1f6..ce33014e3eb0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -587,9 +587,6 @@ static void nvme_free_ns(struct kref *kref)
 {
 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 
- if (ns->ndev)
- nvme_nvm_unregister(ns);
-
 put_disk(ns->disk);
 nvme_put_ns_head(ns->head);
 nvme_put_ctrl(ns->ctrl);
@@ -3218,9 +3215,6 @@ static const struct attribute_group nvme_ns_id_attr_group = {
 
 const struct attribute_group *nvme_ns_id_attr_groups[] = {
 &nvme_ns_id_attr_group,
-#ifdef CONFIG_NVM
- &nvme_nvm_attr_group,
-#endif
 NULL,
 };
 
@@ -3767,13 +3761,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 if (nvme_update_ns_info(ns, id))
 goto out_put_disk;
 
- if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
- if (nvme_nvm_register(ns, disk->disk_name, node)) {
- dev_warn(ctrl->device, "LightNVM init failure\n");
- goto out_put_disk;
- }
- }
-
 down_write(&ctrl->namespaces_rwsem);
 list_add_tail(&ns->list, &ctrl->namespaces);
 up_write(&ctrl->namespaces_rwsem);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 305ddd415e45..22314962842d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -342,9 +342,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
 case NVME_IOCTL_IO64_CMD:
 return nvme_user_cmd64(ns->ctrl, ns, argp);
 default:
- if (!ns->ndev)
- return -ENOTTY;
- return nvme_nvm_ioctl(ns, cmd, argp);
+ return -ENOTTY;
 }
 }
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
deleted file mode 100644
index e9d9ad47f70f..000000000000
--- a/drivers/nvme/host/lightnvm.c
+++ /dev/null
@@ -1,1274 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * nvme-lightnvm.c - LightNVM NVMe device
- *
- * Copyright (C) 2014-2015 IT University of Copenhagen
- * Initial release: Matias Bjorling
- */
-
-#include "nvme.h"
-
-#include <linux/nvme.h>
-#include <linux/bitops.h>
-#include <linux/lightnvm.h>
-#include <linux/vmalloc.h>
-#include <linux/sched/sysctl.h>
-#include <uapi/linux/lightnvm.h>
-
-enum nvme_nvm_admin_opcode {
- nvme_nvm_admin_identity = 0xe2,
- nvme_nvm_admin_get_bb_tbl = 0xf2,
- nvme_nvm_admin_set_bb_tbl = 0xf1,
-};
-
-enum nvme_nvm_log_page {
- NVME_NVM_LOG_REPORT_CHUNK = 0xca,
-};
-
-struct nvme_nvm_ph_rw {
- __u8 opcode;
- __u8 flags;
- __u16 command_id;
- __le32 nsid;
- __u64 rsvd2;
- __le64 metadata;
- __le64 prp1;
- __le64 prp2;
- __le64
spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_erase_blk { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_identity { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __u32 rsvd11[6]; -}; - -struct nvme_nvm_getbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __u32 rsvd4[4]; -}; - -struct nvme_nvm_setbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 nlb; - __u8 value; - __u8 rsvd3; - __u32 rsvd4[3]; -}; - -struct nvme_nvm_command { - union { - struct nvme_common_command common; - struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_erase_blk erase; - struct nvme_nvm_identity identity; - struct nvme_nvm_getbbtbl get_bb; - struct nvme_nvm_setbbtbl set_bb; - }; -}; - -struct nvme_nvm_id12_grp { - __u8 mtype; - __u8 fmtype; - __le16 res16; - __u8 num_ch; - __u8 num_lun; - __u8 num_pln; - __u8 rsvd1; - __le16 num_chk; - __le16 num_pg; - __le16 fpg_sz; - __le16 csecs; - __le16 sos; - __le16 rsvd2; - __le32 trdt; - __le32 trdm; - __le32 tprt; - __le32 tprm; - __le32 tbet; - __le32 tbem; - __le32 mpos; - __le32 mccap; - __le16 cpar; - __u8 reserved[906]; -} __packed; - -struct nvme_nvm_id12_addrf { - __u8 ch_offset; - __u8 ch_len; - __u8 lun_offset; - __u8 lun_len; - __u8 pln_offset; - __u8 pln_len; - __u8 blk_offset; - __u8 blk_len; - __u8 pg_offset; - __u8 pg_len; - __u8 sec_offset; - __u8 sec_len; - __u8 res[4]; -} __packed; - -struct nvme_nvm_id12 { - __u8 ver_id; - __u8 vmnt; - __u8 cgrps; - __u8 res; - __le32 cap; - __le32 dom; - struct nvme_nvm_id12_addrf ppaf; - __u8 resv[228]; - struct nvme_nvm_id12_grp grp; - __u8 resv2[2880]; -} __packed; - -struct nvme_nvm_bb_tbl { - __u8 tblid[4]; - __le16 verid; - __le16 revid; - __le32 rvsd1; - __le32 tblks; - __le32 tfact; - __le32 tgrown; - __le32 tdresv; - __le32 thresv; - __le32 rsvd2[8]; - __u8 blk[]; -}; - -struct nvme_nvm_id20_addrf { - __u8 grp_len; - __u8 pu_len; - __u8 chk_len; - __u8 lba_len; - __u8 resv[4]; -}; - -struct nvme_nvm_id20 { - __u8 mjr; - __u8 mnr; - __u8 resv[6]; - - struct nvme_nvm_id20_addrf lbaf; - - __le32 mccap; - __u8 resv2[12]; - - __u8 wit; - __u8 resv3[31]; - - /* Geometry */ - __le16 num_grp; - __le16 num_pu; - __le32 num_chk; - __le32 clba; - __u8 resv4[52]; - - /* Write data requirements */ - __le32 ws_min; - __le32 ws_opt; - __le32 mw_cunits; - __le32 maxoc; - __le32 maxocpu; - __u8 resv5[44]; - - /* Performance related metrics */ - __le32 trdt; - __le32 trdm; - __le32 twrt; - __le32 twrm; - __le32 tcrst; - __le32 tcrsm; - __u8 resv6[40]; - - /* Reserved area */ - __u8 resv7[2816]; - - /* Vendor specific */ - __u8 vs[1024]; -}; - -struct nvme_nvm_chk_meta { - __u8 state; - __u8 type; - __u8 wi; - __u8 rsvd[5]; - __le64 slba; - __le64 cnlb; - __le64 wp; -}; - -/* - * Check we didn't inadvertently grow the command struct - */ -static inline void _nvme_nvm_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - 
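- /* The 64-byte checks above mirror the fixed NVMe submission queue entry
- * size: each vendor-specific OCSSD command must still fit in a standard
- * struct nvme_command. */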
BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != - sizeof(struct nvm_chk_meta)); -} - -static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst, - struct nvme_nvm_id12_addrf *src) -{ - dst->ch_len = src->ch_len; - dst->lun_len = src->lun_len; - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->ch_offset = src->ch_offset; - dst->lun_offset = src->lun_offset; - dst->blk_offset = src->blk_offset; - dst->pg_offset = src->pg_offset; - dst->pln_offset = src->pln_offset; - dst->sec_offset = src->sec_offset; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id, - struct nvm_geo *geo) -{ - struct nvme_nvm_id12_grp *src; - int sec_per_pg, sec_per_pl, pg_per_blk; - - if (id->cgrps != 1) - return -EINVAL; - - src = &id->grp; - - if (src->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - /* 1.2 spec. only reports a single version id - unfold */ - geo->major_ver_id = id->ver_id; - geo->minor_ver_id = 2; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_12; - - geo->num_ch = src->num_ch; - geo->num_lun = src->num_lun; - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le16_to_cpu(src->num_chk); - - geo->csecs = le16_to_cpu(src->csecs); - geo->sos = le16_to_cpu(src->sos); - - pg_per_blk = le16_to_cpu(src->num_pg); - sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs; - sec_per_pl = sec_per_pg * src->num_pln; - geo->clba = sec_per_pl * pg_per_blk; - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = sec_per_pg; - geo->ws_opt = sec_per_pg; - geo->mw_cunits = geo->ws_opt << 3; /* default to MLC safe values */ - - /* Do not impose values for maximum number of open blocks as it is - * unspecified in 1.2. Users of 1.2 must be aware of this and eventually - * specify these values through a quirk if restrictions apply. 
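- * Until a quirk exists, default to every chunk being open: maxoc spans
- * all luns and maxocpu a single lun, effectively imposing no limit.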
- */ - geo->maxoc = geo->all_luns * geo->num_chk; - geo->maxocpu = geo->num_chk; - - geo->mccap = le32_to_cpu(src->mccap); - - geo->trdt = le32_to_cpu(src->trdt); - geo->trdm = le32_to_cpu(src->trdm); - geo->tprt = le32_to_cpu(src->tprt); - geo->tprm = le32_to_cpu(src->tprm); - geo->tbet = le32_to_cpu(src->tbet); - geo->tbem = le32_to_cpu(src->tbem); - - /* 1.2 compatibility */ - geo->vmnt = id->vmnt; - geo->cap = le32_to_cpu(id->cap); - geo->dom = le32_to_cpu(id->dom); - - geo->mtype = src->mtype; - geo->fmtype = src->fmtype; - - geo->cpar = le16_to_cpu(src->cpar); - geo->mpos = le32_to_cpu(src->mpos); - - geo->pln_mode = NVM_PLANE_SINGLE; - - if (geo->mpos & 0x020202) { - geo->pln_mode = NVM_PLANE_DOUBLE; - geo->ws_opt <<= 1; - } else if (geo->mpos & 0x040404) { - geo->pln_mode = NVM_PLANE_QUAD; - geo->ws_opt <<= 2; - } - - geo->num_pln = src->num_pln; - geo->num_pg = le16_to_cpu(src->num_pg); - geo->fpg_sz = le16_to_cpu(src->fpg_sz); - - nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf); - - return 0; -} - -static void nvme_nvm_set_addr_20(struct nvm_addrf *dst, - struct nvme_nvm_id20_addrf *src) -{ - dst->ch_len = src->grp_len; - dst->lun_len = src->pu_len; - dst->chk_len = src->chk_len; - dst->sec_len = src->lba_len; - - dst->sec_offset = 0; - dst->chk_offset = dst->sec_len; - dst->lun_offset = dst->chk_offset + dst->chk_len; - dst->ch_offset = dst->lun_offset + dst->lun_len; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, - struct nvm_geo *geo) -{ - geo->major_ver_id = id->mjr; - geo->minor_ver_id = id->mnr; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_20; - - geo->num_ch = le16_to_cpu(id->num_grp); - geo->num_lun = le16_to_cpu(id->num_pu); - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le32_to_cpu(id->num_chk); - geo->clba = le32_to_cpu(id->clba); - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = le32_to_cpu(id->ws_min); - geo->ws_opt = le32_to_cpu(id->ws_opt); - geo->mw_cunits = le32_to_cpu(id->mw_cunits); - geo->maxoc = le32_to_cpu(id->maxoc); - geo->maxocpu = le32_to_cpu(id->maxocpu); - - geo->trdt = le32_to_cpu(id->trdt); - geo->trdm = le32_to_cpu(id->trdm); - geo->tprt = le32_to_cpu(id->twrt); - geo->tprm = le32_to_cpu(id->twrm); - geo->tbet = le32_to_cpu(id->tcrst); - geo->tbem = le32_to_cpu(id->tcrsm); - - nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf); - - return 0; -} - -static int nvme_nvm_identity(struct nvm_dev *nvmdev) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_id12 *id; - struct nvme_nvm_command c = {}; - int ret; - - c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->head->ns_id); - - id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL); - if (!id) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - id, sizeof(struct nvme_nvm_id12)); - if (ret) { - ret = -EIO; - goto out; - } - - /* - * The 1.2 and 2.0 specifications share the first byte in their geometry - * command to make it possible to know what version a device implements. 
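- * ver_id is 1 for OCSSD 1.2 and 2 for OCSSD 2.0; any other value is
- * rejected below.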
- */ - switch (id->ver_id) { - case 1: - ret = nvme_nvm_setup_12(id, &nvmdev->geo); - break; - case 2: - ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id, - &nvmdev->geo); - break; - default: - dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n", - id->ver_id); - ret = -EINVAL; - } - -out: - kfree(id); - return ret; -} - -static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - u8 *blks) -{ - struct request_queue *q = nvmdev->q; - struct nvm_geo *geo = &nvmdev->geo; - struct nvme_ns *ns = q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_command c = {}; - struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->num_chk * geo->num_pln; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; - int ret = 0; - - c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.get_bb.spba = cpu_to_le64(ppa.ppa); - - bb_tbl = kzalloc(tblsz, GFP_KERNEL); - if (!bb_tbl) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, - bb_tbl, tblsz); - if (ret) { - dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || - bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(ctrl->device, "bbt format mismatch\n"); - ret = -EINVAL; - goto out; - } - - if (le16_to_cpu(bb_tbl->verid) != 1) { - ret = -EINVAL; - dev_err(ctrl->device, "bbt version not supported\n"); - goto out; - } - - if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { - ret = -EINVAL; - dev_err(ctrl->device, - "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_blks); - goto out; - } - - memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln); -out: - kfree(bb_tbl); - return ret; -} - -static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - int ret = 0; - - c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.set_bb.spba = cpu_to_le64(ppas->ppa); - c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); - c.set_bb.value = type; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - NULL, 0); - if (ret) - dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", - ret); - return ret; -} - -/* - * Expect the lba in device format - */ -static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, - sector_t slba, int nchks, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &ndev->geo; - struct nvme_ns *ns = ndev->q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_chk_meta *dev_meta, *dev_meta_off; - struct ppa_addr ppa; - size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); - size_t log_pos, offset, len; - int i, max_len; - int ret = 0; - - /* - * limit requests to maximum 256K to avoid issuing arbitrary large - * requests when the device does not specific a maximum transfer size. 
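- * (ctrl->max_hw_sectors is in 512-byte units, hence the << 9 conversion
- * to bytes below.)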
- */ - max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); - - dev_meta = kmalloc(max_len, GFP_KERNEL); - if (!dev_meta) - return -ENOMEM; - - /* Normalize lba address space to obtain log offset */ - ppa.ppa = slba; - ppa = dev_to_generic_addr(ndev, ppa); - - log_pos = ppa.m.chk; - log_pos += ppa.m.pu * geo->num_chk; - log_pos += ppa.m.grp * geo->num_lun * geo->num_chk; - - offset = log_pos * sizeof(struct nvme_nvm_chk_meta); - - while (left) { - len = min_t(unsigned int, left, max_len); - - memset(dev_meta, 0, max_len); - dev_meta_off = dev_meta; - - ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, - dev_meta, len, offset); - if (ret) { - dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); - break; - } - - for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) { - meta->state = dev_meta_off->state; - meta->type = dev_meta_off->type; - meta->wi = dev_meta_off->wi; - meta->slba = le64_to_cpu(dev_meta_off->slba); - meta->cnlb = le64_to_cpu(dev_meta_off->cnlb); - meta->wp = le64_to_cpu(dev_meta_off->wp); - - meta++; - dev_meta_off++; - } - - offset += len; - left -= len; - } - - kfree(dev_meta); - - return ret; -} - -static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, - struct nvme_nvm_command *c) -{ - c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); - c->ph_rw.control = cpu_to_le16(rqd->flags); - c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); -} - -static void nvme_nvm_end_io(struct request *rq, blk_status_t status) -{ - struct nvm_rq *rqd = rq->end_io_data; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - nvm_end_io(rqd); - - kfree(nvme_req(rq)->cmd); - blk_mq_free_request(rq); -} - -static struct request *nvme_nvm_alloc_request(struct request_queue *q, - struct nvm_rq *rqd, - struct nvme_nvm_command *cmd) -{ - struct nvme_ns *ns = q->queuedata; - struct request *rq; - - nvme_nvm_rqtocmd(rqd, ns, cmd); - - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); - if (IS_ERR(rq)) - return rq; - - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - - if (rqd->bio) - blk_rq_append_bio(rq, rqd->bio); - else - rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - - return rq; -} - -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *q = dev->q; - struct nvme_nvm_command *cmd; - struct request *rq; - int ret; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; - - rq = nvme_nvm_alloc_request(q, rqd, cmd); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - goto err_free_cmd; - } - - if (buf) { - ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, - GFP_KERNEL); - if (ret) - goto err_free_cmd; - } - - rq->end_io_data = rqd; - - blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); - - return 0; - -err_free_cmd: - kfree(cmd); - return ret; -} - -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, - int size) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - - return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); -} - -static void nvme_nvm_destroy_dma_pool(void *pool) -{ - struct dma_pool *dma_pool = pool; - - dma_pool_destroy(dma_pool); -} - -static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, - gfp_t mem_flags, dma_addr_t *dma_handler) -{ - 
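- /* The pool was created by nvme_nvm_create_dma_pool() above; entries are
- * used for the PPA list and metadata attached to a single command. */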
return dma_pool_alloc(pool, mem_flags, dma_handler); -} - -static void nvme_nvm_dev_dma_free(void *pool, void *addr, - dma_addr_t dma_handler) -{ - dma_pool_free(pool, addr, dma_handler); -} - -static struct nvm_dev_ops nvme_nvm_dev_ops = { - .identity = nvme_nvm_identity, - - .get_bb_tbl = nvme_nvm_get_bb_tbl, - .set_bb_tbl = nvme_nvm_set_bb_tbl, - - .get_chk_meta = nvme_nvm_get_chk_meta, - - .submit_io = nvme_nvm_submit_io, - - .create_dma_pool = nvme_nvm_create_dma_pool, - .destroy_dma_pool = nvme_nvm_destroy_dma_pool, - .dev_dma_alloc = nvme_nvm_dev_dma_alloc, - .dev_dma_free = nvme_nvm_dev_dma_free, -}; - -static int nvme_nvm_submit_user_cmd(struct request_queue *q, - struct nvme_ns *ns, - struct nvme_nvm_command *vcmd, - void __user *ubuf, unsigned int bufflen, - void __user *meta_buf, unsigned int meta_len, - void __user *ppa_buf, unsigned int ppa_len, - u32 *result, u64 *status, unsigned int timeout) -{ - bool write = nvme_is_write((struct nvme_command *)vcmd); - struct nvm_dev *dev = ns->ndev; - struct request *rq; - struct bio *bio = NULL; - __le64 *ppa_list = NULL; - dma_addr_t ppa_dma; - __le64 *metadata = NULL; - dma_addr_t metadata_dma; - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); - if (IS_ERR(rq)) { - ret = -ENOMEM; - goto err_cmd; - } - - if (timeout) - rq->timeout = timeout; - - if (ppa_buf && ppa_len) { - ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); - if (!ppa_list) { - ret = -ENOMEM; - goto err_rq; - } - if (copy_from_user(ppa_list, (void __user *)ppa_buf, - sizeof(u64) * (ppa_len + 1))) { - ret = -EFAULT; - goto err_ppa; - } - vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); - } else { - vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); - } - - if (ubuf && bufflen) { - ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); - if (ret) - goto err_ppa; - bio = rq->bio; - - if (meta_buf && meta_len) { - metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, - &metadata_dma); - if (!metadata) { - ret = -ENOMEM; - goto err_map; - } - - if (write) { - if (copy_from_user(metadata, - (void __user *)meta_buf, - meta_len)) { - ret = -EFAULT; - goto err_meta; - } - } - vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); - } - - bio_set_dev(bio, ns->disk->part0); - } - - blk_execute_rq(NULL, rq, 0); - - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else if (nvme_req(rq)->status & 0x7ff) - ret = -EIO; - if (result) - *result = nvme_req(rq)->status & 0x7ff; - if (status) - *status = le64_to_cpu(nvme_req(rq)->result.u64); - - if (metadata && !ret && !write) { - if (copy_to_user(meta_buf, (void *)metadata, meta_len)) - ret = -EFAULT; - } -err_meta: - if (meta_buf && meta_len) - dma_pool_free(dev->dma_pool, metadata, metadata_dma); -err_map: - if (bio) - blk_rq_unmap_user(bio); -err_ppa: - if (ppa_buf && ppa_len) - dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); -err_rq: - blk_mq_free_request(rq); -err_cmd: - return ret; -} - -static int nvme_nvm_submit_vio(struct nvme_ns *ns, - struct nvm_user_vio __user *uvio) -{ - struct nvm_user_vio vio; - struct nvme_nvm_command c; - unsigned int length; - int ret; - - if (copy_from_user(&vio, uvio, sizeof(vio))) - return -EFAULT; - if (vio.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c.ph_rw.control = cpu_to_le16(vio.control); - c.ph_rw.length = cpu_to_le16(vio.nppas); - - length = (vio.nppas + 1) << ns->lba_shift; - - ret = 
nvme_nvm_submit_user_cmd(ns->queue, ns, &c, - (void __user *)(uintptr_t)vio.addr, length, - (void __user *)(uintptr_t)vio.metadata, - vio.metadata_len, - (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, - &vio.result, &vio.status, 0); - - if (ret && copy_to_user(uvio, &vio, sizeof(vio))) - return -EFAULT; - - return ret; -} - -static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, - struct nvm_passthru_vio __user *uvcmd) -{ - struct nvm_passthru_vio vcmd; - struct nvme_nvm_command c; - struct request_queue *q; - unsigned int timeout = 0; - int ret; - - if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) - return -EFAULT; - if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) - return -EACCES; - if (vcmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->head->ns_id); - c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); - /* cdw11-12 */ - c.ph_rw.length = cpu_to_le16(vcmd.nppas); - c.ph_rw.control = cpu_to_le16(vcmd.control); - c.common.cdw13 = cpu_to_le32(vcmd.cdw13); - c.common.cdw14 = cpu_to_le32(vcmd.cdw14); - c.common.cdw15 = cpu_to_le32(vcmd.cdw15); - - if (vcmd.timeout_ms) - timeout = msecs_to_jiffies(vcmd.timeout_ms); - - q = admin ? ns->ctrl->admin_q : ns->queue; - - ret = nvme_nvm_submit_user_cmd(q, ns, - (struct nvme_nvm_command *)&c, - (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, - (void __user *)(uintptr_t)vcmd.metadata, - vcmd.metadata_len, - (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, - &vcmd.result, &vcmd.status, timeout); - - if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) - return -EFAULT; - - return ret; -} - -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) -{ - switch (cmd) { - case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, argp); - case NVME_NVM_IOCTL_IO_VIO: - return nvme_nvm_user_vcmd(ns, 0, argp); - case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, argp); - default: - return -ENOTTY; - } -} - -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) -{ - struct request_queue *q = ns->queue; - struct nvm_dev *dev; - struct nvm_geo *geo; - - _nvme_nvm_check_size(); - - dev = nvm_alloc_dev(node); - if (!dev) - return -ENOMEM; - - /* Note that csecs and sos will be overridden if it is a 1.2 drive. 
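- * (nvme_nvm_setup_12() recomputes them from the 1.2 geometry page.)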
*/ - geo = &dev->geo; - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; - if (ns->features & NVME_NS_EXT_LBAS) - geo->ext = true; - else - geo->ext = false; - geo->mdts = ns->ctrl->max_hw_sectors; - - dev->q = q; - memcpy(dev->name, disk_name, DISK_NAME_LEN); - dev->ops = &nvme_nvm_dev_ops; - dev->private_data = ns; - ns->ndev = dev; - - return nvm_register(dev); -} - -void nvme_nvm_unregister(struct nvme_ns *ns) -{ - nvm_unregister(ns->ndev); -} - -static ssize_t nvm_dev_attr_show(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "version") == 0) { - if (geo->major_ver_id == 1) - return scnprintf(page, PAGE_SIZE, "%u\n", - geo->major_ver_id); - else - return scnprintf(page, PAGE_SIZE, "%u.%u\n", - geo->major_ver_id, - geo->minor_ver_id); - } else if (strcmp(attr->name, "capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap); - } else if (strcmp(attr->name, "read_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt); - } else if (strcmp(attr->name, "read_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm); - } else { - return scnprintf(page, - PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page) -{ - return scnprintf(page, PAGE_SIZE, - "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->sec_offset, ppaf->sec_len); -} - -static ssize_t nvm_dev_attr_show_12(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "vendor_opcode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt); - } else if (strcmp(attr->name, "device_mode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom); - /* kept for compatibility */ - } else if (strcmp(attr->name, "media_manager") == 0) { - return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); - } else if (strcmp(attr->name, "ppa_format") == 0) { - return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page); - } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype); - } else if (strcmp(attr->name, "flash_media_type") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype); - } else if (strcmp(attr->name, "num_channels") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "num_luns") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "num_planes") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln); - } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "num_pages") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg); - } else if (strcmp(attr->name, "page_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz); - } else if 
(strcmp(attr->name, "hw_sector_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs); - } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos); - } else if (strcmp(attr->name, "prog_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "prog_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "erase_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "erase_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else if (strcmp(attr->name, "multiplane_modes") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos); - } else if (strcmp(attr->name, "media_capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap); - } else if (strcmp(attr->name, "max_phys_secs") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_20(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "groups") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "punits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "chunks") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "clba") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba); - } else if (strcmp(attr->name, "ws_min") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min); - } else if (strcmp(attr->name, "ws_opt") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt); - } else if (strcmp(attr->name, "maxoc") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc); - } else if (strcmp(attr->name, "maxocpu") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu); - } else if (strcmp(attr->name, "mw_cunits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits); - } else if (strcmp(attr->name, "write_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "write_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "reset_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "reset_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -#define NVM_DEV_ATTR_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) -#define NVM_DEV_ATTR_12_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL) -#define NVM_DEV_ATTR_20_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL) - -/* general attributes */ -static NVM_DEV_ATTR_RO(version); -static NVM_DEV_ATTR_RO(capabilities); - -static NVM_DEV_ATTR_RO(read_typ); -static NVM_DEV_ATTR_RO(read_max); - -/* 1.2 values */ -static NVM_DEV_ATTR_12_RO(vendor_opcode); -static NVM_DEV_ATTR_12_RO(device_mode); -static 
NVM_DEV_ATTR_12_RO(ppa_format); -static NVM_DEV_ATTR_12_RO(media_manager); -static NVM_DEV_ATTR_12_RO(media_type); -static NVM_DEV_ATTR_12_RO(flash_media_type); -static NVM_DEV_ATTR_12_RO(num_channels); -static NVM_DEV_ATTR_12_RO(num_luns); -static NVM_DEV_ATTR_12_RO(num_planes); -static NVM_DEV_ATTR_12_RO(num_blocks); -static NVM_DEV_ATTR_12_RO(num_pages); -static NVM_DEV_ATTR_12_RO(page_size); -static NVM_DEV_ATTR_12_RO(hw_sector_size); -static NVM_DEV_ATTR_12_RO(oob_sector_size); -static NVM_DEV_ATTR_12_RO(prog_typ); -static NVM_DEV_ATTR_12_RO(prog_max); -static NVM_DEV_ATTR_12_RO(erase_typ); -static NVM_DEV_ATTR_12_RO(erase_max); -static NVM_DEV_ATTR_12_RO(multiplane_modes); -static NVM_DEV_ATTR_12_RO(media_capabilities); -static NVM_DEV_ATTR_12_RO(max_phys_secs); - -/* 2.0 values */ -static NVM_DEV_ATTR_20_RO(groups); -static NVM_DEV_ATTR_20_RO(punits); -static NVM_DEV_ATTR_20_RO(chunks); -static NVM_DEV_ATTR_20_RO(clba); -static NVM_DEV_ATTR_20_RO(ws_min); -static NVM_DEV_ATTR_20_RO(ws_opt); -static NVM_DEV_ATTR_20_RO(maxoc); -static NVM_DEV_ATTR_20_RO(maxocpu); -static NVM_DEV_ATTR_20_RO(mw_cunits); -static NVM_DEV_ATTR_20_RO(write_typ); -static NVM_DEV_ATTR_20_RO(write_max); -static NVM_DEV_ATTR_20_RO(reset_typ); -static NVM_DEV_ATTR_20_RO(reset_max); - -static struct attribute *nvm_dev_attrs[] = { - /* version agnostic attrs */ - &dev_attr_version.attr, - &dev_attr_capabilities.attr, - &dev_attr_read_typ.attr, - &dev_attr_read_max.attr, - - /* 1.2 attrs */ - &dev_attr_vendor_opcode.attr, - &dev_attr_device_mode.attr, - &dev_attr_media_manager.attr, - &dev_attr_ppa_format.attr, - &dev_attr_media_type.attr, - &dev_attr_flash_media_type.attr, - &dev_attr_num_channels.attr, - &dev_attr_num_luns.attr, - &dev_attr_num_planes.attr, - &dev_attr_num_blocks.attr, - &dev_attr_num_pages.attr, - &dev_attr_page_size.attr, - &dev_attr_hw_sector_size.attr, - &dev_attr_oob_sector_size.attr, - &dev_attr_prog_typ.attr, - &dev_attr_prog_max.attr, - &dev_attr_erase_typ.attr, - &dev_attr_erase_max.attr, - &dev_attr_multiplane_modes.attr, - &dev_attr_media_capabilities.attr, - &dev_attr_max_phys_secs.attr, - - /* 2.0 attrs */ - &dev_attr_groups.attr, - &dev_attr_punits.attr, - &dev_attr_chunks.attr, - &dev_attr_clba.attr, - &dev_attr_ws_min.attr, - &dev_attr_ws_opt.attr, - &dev_attr_maxoc.attr, - &dev_attr_maxocpu.attr, - &dev_attr_mw_cunits.attr, - - &dev_attr_write_typ.attr, - &dev_attr_write_max.attr, - &dev_attr_reset_typ.attr, - &dev_attr_reset_max.attr, - - NULL, -}; - -static umode_t nvm_dev_attrs_visible(struct kobject *kobj, - struct attribute *attr, int index) -{ - struct device *dev = kobj_to_dev(kobj); - struct gendisk *disk = dev_to_disk(dev); - struct nvme_ns *ns = disk->private_data; - struct nvm_dev *ndev = ns->ndev; - struct device_attribute *dev_attr = - container_of(attr, typeof(*dev_attr), attr); - - if (!ndev) - return 0; - - if (dev_attr->show == nvm_dev_attr_show) - return attr->mode; - - switch (ndev->geo.major_ver_id) { - case 1: - if (dev_attr->show == nvm_dev_attr_show_12) - return attr->mode; - break; - case 2: - if (dev_attr->show == nvm_dev_attr_show_20) - return attr->mode; - break; - } - - return 0; -} - -const struct attribute_group nvme_nvm_attr_group = { - .name = "lightnvm", - .attrs = nvm_dev_attrs, - .is_visible = nvm_dev_attrs_visible, -}; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5cd1fa3b8464..ab803f91ace1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -11,7 +11,6 @@ #include #include #include -#include 
#include #include #include @@ -92,11 +91,6 @@ enum nvme_quirks { */ NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), - /* - * Supports the LighNVM command set if indicated in vs[1]. - */ - NVME_QUIRK_LIGHTNVM = (1 << 6), - /* * Set MEDIUM priority on SQ creation */ @@ -823,26 +817,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif -#ifdef CONFIG_NVM -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); -void nvme_nvm_unregister(struct nvme_ns *ns); -extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); -#else -static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) -{ - return 0; -} - -static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; -static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) -{ - return -ENOTTY; -} -#endif /* CONFIG_NVM */ - static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 51852085239e..db7a9bee2014 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3243,12 +3243,6 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ -- cgit From 27453b45e62da8656739f7e1365ea9318e7b040e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:34 -0700 Subject: nvme-pci: limit maximum queue depth to 4095 We are going to use the upper 4 bits of the command_id for a generation counter, so enforce the new queue depth upper limit. As we enforce both the min and max queue depth, use param_set_uint_minmax instead of open coding it. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index db7a9bee2014..4880badb26d4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold, "Use SGLs when average request segment size is larger or equal to " "this size. 
Use 0 to disable SGLs."); +#define NVME_PCI_MIN_QUEUE_SIZE 2 +#define NVME_PCI_MAX_QUEUE_SIZE 4095 static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = { static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); -MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096"); static int io_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -157,14 +159,8 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int ret; - u32 n; - - ret = kstrtou32(val, 10, &n); - if (ret != 0 || n < 2) - return -EINVAL; - - return param_set_uint(val, kp); + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) -- cgit From 3b01a9d0caa8276d9ce314e09610f7fb70f49a00 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:35 -0700 Subject: nvme-tcp: don't check blk_mq_tag_to_rq when receiving pdu data We already validate it when receiving the c2hdata pdu header, and this is not changing, so the check here is redundant. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cb15ee5b249..d649b446da66 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -702,17 +702,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; - struct nvme_tcp_request *req; - struct request *rq; - - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); - if (!rq) { - dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); - return -ENOENT; - } - req = blk_mq_rq_to_pdu(rq); + struct request *rq = + blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { int recv_len, ret; -- cgit From e7006de6c23803799be000a5dcce4d916a36541a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:36 -0700 Subject: nvme: code command_id with a genctr for use-after-free validation We cannot detect a (perhaps buggy) controller that is sending us a completion for a request that was already completed (for example sending a completion twice); this phenomenon was seen in the wild a few times. So to protect against this, we use the upper 4 msbits of the nvme sqe command_id as a 4-bit generation counter and verify that it matches the existing request generation, which is incremented on every execution. The 16-bit command_id is now constructed as: | xxxx | xxxxxxxxxxxx | gen request tag This means that we are giving up some possible queue depth, as 12 bits allow for a maximum queue depth of 4095 instead of 65536; however, we never create such long queues anyway, so no real harm is done. 
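For illustration only (not part of the patch), a minimal stand-alone C sketch of this 4-bit/12-bit split; the names below are hypothetical, and the kernel's actual helpers appear in the diff that follows:

#include <stdint.h>

#define CID_GEN_MASK 0xfu	/* upper 4 bits: generation counter */
#define CID_TAG_MASK 0xfffu	/* lower 12 bits: request tag */

static inline uint16_t cid_encode(uint8_t gen, uint16_t tag)
{
	return (uint16_t)(((gen & CID_GEN_MASK) << 12) | (tag & CID_TAG_MASK));
}

static inline int cid_is_stale(uint16_t cid, uint8_t current_gen)
{
	/* a duplicated or late completion carries an old generation */
	return ((cid >> 12) & CID_GEN_MASK) != (current_gen & CID_GEN_MASK);
}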
Suggested-by: Keith Busch Signed-off-by: Sagi Grimberg Acked-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 47 +++++++++++++++++++++++++++++++++++++++++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/host/tcp.c | 26 ++++++++++++------------- drivers/nvme/target/loop.c | 4 ++-- 6 files changed, 66 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce33014e3eb0..b9a46c54f714 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1026,7 +1026,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) return BLK_STS_IOERR; } - cmd->common.command_id = req->tag; + nvme_req(req)->genctr++; + cmd->common.command_id = nvme_cid(req); trace_nvme_setup_cmd(req, cmd); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab803f91ace1..57d2ac00a6bd 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -152,6 +152,7 @@ enum nvme_quirks { struct nvme_request { struct nvme_command *cmd; union nvme_result result; + u8 genctr; u8 retries; u8 flags; u16 status; @@ -491,6 +492,49 @@ struct nvme_ctrl_ops { int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; +/* + * nvme command_id is constructed as such: + * | xxxx | xxxxxxxxxxxx | + * gen request tag + */ +#define nvme_genctr_mask(gen) (gen & 0xf) +#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12) +#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12) +#define nvme_tag_from_cid(cid) (cid & 0xfff) + +static inline u16 nvme_cid(struct request *rq) +{ + return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag; +} + +static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + u8 genctr = nvme_genctr_from_cid(command_id); + u16 tag = nvme_tag_from_cid(command_id); + struct request *rq; + + rq = blk_mq_tag_to_rq(tags, tag); + if (unlikely(!rq)) { + pr_err("could not locate request for tag %#x\n", + tag); + return NULL; + } + if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { + dev_err(nvme_req(rq)->ctrl->device, + "request %#x genctr mismatch (got %#x expected %#x)\n", + tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); + return NULL; + } + return rq; +} + +static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id)); +} + #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, const char *dev_name); @@ -588,7 +632,8 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) { - return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH; + return !qid && + nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH; } void nvme_complete_rq(struct request *req); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4880badb26d4..0471c2c7d64b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1010,7 +1010,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", diff --git 
a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7f6b3a991501..69ae67652f38 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1730,10 +1730,10 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, struct request *rq; struct nvme_rdma_request *req; - rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on QP %#x not found\n", + "got bad command_id %#x on QP %#x\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); return; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d649b446da66..0a97ba02f61e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -487,11 +487,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag 0x%x not found\n", - nvme_tcp_queue_id(queue), cqe->command_id); + "got bad cqe.command_id %#x on queue %d\n", + cqe->command_id, nvme_tcp_queue_id(queue)); nvme_tcp_error_recovery(&queue->ctrl->ctrl); return -EINVAL; } @@ -508,11 +508,11 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad c2hdata.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } @@ -606,7 +606,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); data->ttag = pdu->ttag; - data->command_id = rq->tag; + data->command_id = nvme_cid(rq); data->data_offset = cpu_to_le32(req->data_sent); data->data_length = cpu_to_le32(req->pdu_len); return 0; @@ -619,11 +619,11 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, struct request *rq; int ret; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad r2t.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } req = blk_mq_rq_to_pdu(rq); @@ -703,7 +703,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; struct request *rq = - blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { @@ -796,8 +796,8 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, } if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { - struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), - pdu->command_id); + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); queue->nr_cqe++; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3a17a7e26bbf..0285ccc7541f 100644 --- a/drivers/nvme/target/loop.c +++ 
b/drivers/nvme/target/loop.c @@ -107,10 +107,10 @@ static void nvme_loop_queue_response(struct nvmet_req *req) } else { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_loop_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on queue %d not found\n", + "got bad command_id %#x on queue %d\n", cqe->command_id, nvme_loop_queue_idx(queue)); return; } -- cgit From 0521905e859fd1a07949cb18efb20cdd4aab3b20 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 14 Jul 2021 14:02:37 -0700 Subject: nvme-pci: use attribute group for cmb sysfs Appending sysfs files to the controller kobject is a bit clunky and becomes a maintenance problem as more attributes are added. The attribute group infrastructure handles this better, so use that. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 72 +++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0471c2c7d64b..6658f58ef824 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -155,6 +155,8 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + + bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -1804,17 +1806,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev) return ret >= 0 ? 0 : ret; } -static ssize_t nvme_cmb_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", - ndev->cmbloc, ndev->cmbsz); -} -static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); - static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; @@ -1883,20 +1874,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); -} - -static inline void nvme_release_cmb(struct nvme_dev *dev) -{ - if (dev->cmb_size) { - sysfs_remove_file_from_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL); - dev->cmb_size = 0; - } } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2076,6 +2053,38 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmb); + +static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct nvme_ctrl *ctrl = + dev_get_drvdata(container_of(kobj, struct device, kobj)); + struct nvme_dev *dev = to_nvme_dev(ctrl); + + if (a == &dev_attr_cmb.attr && !dev->cmbsz) + return 0; + return a->mode; +} + +static struct attribute *nvme_pci_attrs[] = { + &dev_attr_cmb.attr, + NULL, +}; + +static const struct attribute_group nvme_pci_attr_group = { + .attrs = nvme_pci_attrs, + .is_visible = nvme_pci_attrs_are_visible, +}; + /* * nirqs is the number of 
interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2747,6 +2756,10 @@ static void nvme_reset_work(struct work_struct *work) goto out; } + if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group)) + dev->attrs_added = true; + nvme_start_ctrl(&dev->ctrl); return; @@ -2995,6 +3008,13 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } +static void nvme_remove_attrs(struct nvme_dev *dev) +{ + if (dev->attrs_added) + sysfs_remove_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group); +} + /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3016,7 +3036,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_release_cmb(dev); + nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); -- cgit From 1751e97aa940656b5de0e620f02cf193a275e014 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 16 Jul 2021 09:22:49 +0200 Subject: nvme-pci: cmb sysfs: one file, one value An attribute should only export one value, as recommended in Documentation/filesystems/sysfs.rst. Implement the CMB attributes this way. The old attribute will remain for backward compatibility. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6658f58ef824..909dadcdab09 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2063,6 +2063,24 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(cmb); +static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbloc); +} +static DEVICE_ATTR_RO(cmbloc); + +static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmbsz); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2070,13 +2088,19 @@ static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, dev_get_drvdata(container_of(kobj, struct device, kobj)); struct nvme_dev *dev = to_nvme_dev(ctrl); - if (a == &dev_attr_cmb.attr && !dev->cmbsz) - return 0; + if (a == &dev_attr_cmb.attr || + a == &dev_attr_cmbloc.attr || + a == &dev_attr_cmbsz.attr) { + if (!dev->cmbsz) + return 0; + } return a->mode; } static struct attribute *nvme_pci_attrs[] = { &dev_attr_cmb.attr, + &dev_attr_cmbloc.attr, + &dev_attr_cmbsz.attr, NULL, }; -- cgit From e23439e977ed2b247912c2b5c6945ef1bc380100 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Fri, 9 Jul 2021 10:32:47 +0800 Subject: nvme-fabrics: remove superfluous nvmf_host_put in nvmf_parse_options opts->host is NULL there; it is checked just before. So remove the nvmf_host_put call. It was introduced by commit 59a2f3f00fd7 ("nvme: fix potential memory leak in option parsing"). 
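To illustrate why the call was superfluous, here is a hedged sketch with hypothetical names (not the driver's actual helpers): a put only does work when a reference is actually held, and the pointer was just checked to be NULL.

#include <linux/kref.h>
#include <linux/slab.h>

struct example_host {
	struct kref ref;
};

static void example_host_release(struct kref *ref)
{
	kfree(container_of(ref, struct example_host, ref));
}

static void example_host_put(struct example_host *host)
{
	if (host)	/* a put on a NULL pointer is a no-op... */
		kref_put(&host->ref, example_host_release);
}

/* ...so calling it on a pointer known to be NULL can never drop anything. */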
Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a5469fd9d4c3..668c6bb7a567 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -719,7 +719,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -EINVAL; goto out; } - nvmf_host_put(opts->host); opts->host = nvmf_host_add(p); kfree(p); if (!opts->host) { -- cgit From a7b5e8d864b356fdacfea08d9042261c37bc918e Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Mon, 5 Jul 2021 11:15:28 +0800 Subject: nvme: add set feature tracing support An nvme connect command produces the following trace. Before: /sys/kernel/debug/tracing# cat trace | grep feature kworker/5:1H-98 [005] .... 3221.294844: nvme_setup_cmd: nvme0: qid=0, cmdid=25, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=07 00 00 00 07 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/4:1H-124 [004] .... 3222.009186: nvme_setup_cmd: nvme0: qid=0, cmdid=17, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=0b 00 00 00 00 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) After: /sys/kernel/debug/tracing# cat trace | grep feature kworker/0:1H-253 [000] .... 196.060509: nvme_setup_cmd: nvme0: qid=0, cmdid=29, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features fid=0x7, sv=0x0, cdw11=0x70007) kworker/0:1H-253 [000] .... 196.763947: nvme_setup_cmd: nvme0: qid=0, cmdid=29, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features fid=0xb, sv=0x0, cdw11=0x900) Use ',' to separate the different fields, like in nvmet_trace_admin_get_features. Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 6543015b6121..2a89c5aa0790 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -72,6 +72,20 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 *cdw10) { @@ -80,7 +94,7 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -201,6 +215,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvme_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: -- cgit From 8d84f9de69ca23f2637dc19d96f39228c8426e97 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Mon, 5 Jul 2021 11:15:29 +0800 Subject: nvmet: add set feature tracing support An nvme connect command produces the following trace 
from the target side. Before: kworker/0:1H-56 [000] .... 9012.155139: nvmet_req_init: nvmet1: qid=0, cmdid=16, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, cdw10=07 00 00 00 07 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/0:1H-56 [000] .... 9012.872272: nvmet_req_init: nvmet1: qid=0, cmdid=13, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, cdw10=0b 00 00 00 00 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) After: /sys/kernel/debug/tracing# cat trace | grep feature kworker/0:1H-56 [000] .... 203.493914: nvmet_req_init: nvmet1: qid=0, cmdid=29, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, fid=0x7, sv=0x0, cdw11=0x70007) kworker/0:1H-56 [000] .... 204.197079: nvmet_req_init: nvmet1: qid=0, cmdid=29, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, fid=0xb, sv=0x0, cdw11=0x900) Use ',' to separate the different fields, like in nvmet_trace_admin_get_features. Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/target/trace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 1373a3c67962..bff454d46255 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -27,7 +27,7 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -49,6 +49,20 @@ static const char *nvmet_trace_get_lba_status(struct trace_seq *p, return ret; } +static const char *nvmet_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -94,6 +108,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, switch (opcode) { case nvme_admin_identify: return nvmet_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvmet_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvmet_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: -- cgit From ad0e9a80ba0f20db0f86e23d1ad2979513a9a8ee Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Jul 2021 15:56:50 +0100 Subject: nvmet: remove redundant assignments of variable status There are two occurrences where the variable status is assigned a value that is never read and is then re-assigned a new value almost immediately afterwards on an error exit path. The assignments are redundant and can be removed. 
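A minimal illustration of the pattern (hypothetical names; lookup_namespace() and GENERIC_INTERNAL_ERROR are stand-ins): the precise error code returned by the lookup is never read because it is immediately overwritten, so dropping the overwrite both silences the warning and preserves the better status.

	status = lookup_namespace(req);		/* returns a precise error code... */
	if (status) {
		/* ...which is never read: clobbered right away ("Unused value") */
		status = GENERIC_INTERNAL_ERROR;
		goto done;
	}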
Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/zns.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 17f8b7a45f21..46bc30fe85d2 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -115,14 +115,11 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) } status = nvmet_req_find_ns(req); - if (status) { - status = NVME_SC_INTERNAL; + if (status) goto done; - } if (!bdev_is_zoned(req->ns->bdev)) { req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto done; } -- cgit From e5ad96f388b765fe6b52f64f37e910c0ba4f3de7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jul 2021 09:40:44 -0700 Subject: nvme-pci: disable hmb on idle suspend An idle suspend may or may not disable host memory access from devices placed in low power mode. Either way, it should always be safe to disable the host memory buffer prior to entering the low power mode, and this should also always be faster than a full device shutdown. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 909dadcdab09..5b23d5818f75 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3087,8 +3087,13 @@ static int nvme_resume(struct device *dev) if (ndev->last_ps == U32_MAX || nvme_set_power_state(ctrl, ndev->last_ps) != 0) - return nvme_try_sched_reset(&ndev->ctrl); + goto reset; + if (ctrl->hmpre && nvme_setup_host_mem(ndev)) + goto reset; + return 0; +reset: + return nvme_try_sched_reset(ctrl); } static int nvme_suspend(struct device *dev) @@ -3112,15 +3117,9 @@ * the PCI bus layer to put it into D3 in order to take the PCIe link * down, so as to allow the platform to achieve its minimum low-power * state (which may not be possible if the link is up). - * - * If a host memory buffer is enabled, shut down the device as the NVMe - * specification allows the device to access the host memory buffer in - * host DRAM from all power states, but hosts will fail access to DRAM - * during S3. */ if (pm_suspend_via_firmware() || !ctrl->npss || !pcie_aspm_enabled(pdev) || - ndev->nr_host_mem_descs || (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) return nvme_disable_prepare_reset(ndev, true); @@ -3131,6 +3130,17 @@ if (ctrl->state != NVME_CTRL_LIVE) goto unfreeze; + /* + * Host memory access may not be successful in a system suspend state, + * but the specification allows the controller to access memory in a + * non-operational power state. + */ + if (ndev->hmb) { + ret = nvme_set_host_mem(ndev, 0); + if (ret < 0) + goto unfreeze; + } + ret = nvme_get_power_state(ctrl, &ndev->last_ps); if (ret < 0) goto unfreeze; -- cgit From a5df5e79c43c84d9fb88f56b707c5ff52b27ccca Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jul 2021 09:40:43 -0700 Subject: nvme: allow user toggling hmb usage The NVMe host memory buffer may consume a non-negligible amount of memory. Controllers are required to function without the host memory buffer enabled, but with possibly degraded performance. 
Export a sysfs property to toggle this feature at per-device granularity so users may choose to reclaim memory at the expense of storage performance. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5b23d5818f75..b82492cd7503 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -137,6 +137,7 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; u32 last_ps; + bool hmb; mempool_t *iod_mempool; @@ -1896,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) dev_warn(dev->ctrl.device, "failed to set host mem (err %d, flags %#x).\n", ret, bits); - } + } else + dev->hmb = bits & NVME_HOST_MEM_ENABLE; + return ret; } @@ -2081,6 +2084,42 @@ static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(cmbsz); +static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->hmb); +} + +static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + bool new; + int ret; + + if (strtobool(buf, &new) < 0) + return -EINVAL; + + if (new == ndev->hmb) + return count; + + if (new) { + ret = nvme_setup_host_mem(ndev); + } else { + ret = nvme_set_host_mem(ndev, 0); + if (!ret) + nvme_free_host_mem(ndev); + } + + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(hmb); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2094,6 +2133,9 @@ static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, if (!dev->cmbsz) return 0; } + if (a == &dev_attr_hmb.attr && !ctrl->hmpre) + return 0; + return a->mode; } @@ -2101,6 +2143,7 @@ static struct attribute *nvme_pci_attrs[] = { &dev_attr_cmb.attr, &dev_attr_cmbloc.attr, &dev_attr_cmbsz.attr, + &dev_attr_hmb.attr, NULL, }; -- cgit From d48f92cd2739258a1292be56bbeadb5b6a57ea09 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 Aug 2021 08:41:43 -0700 Subject: nvme-tcp: pair send_mutex init with destroy Each mutex_init() should have a corresponding mutex_destroy(). 
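A hedged sketch of that pairing rule with a hypothetical structure: every mutex_init() in the setup path gets a matching mutex_destroy() in both the teardown and the error-unwind paths.

#include <linux/mutex.h>
#include <linux/slab.h>

struct example_queue {
	struct mutex send_lock;
};

static struct example_queue *example_queue_alloc(void)
{
	struct example_queue *q = kzalloc(sizeof(*q), GFP_KERNEL);

	if (q)
		mutex_init(&q->send_lock);
	return q;
}

static void example_queue_free(struct example_queue *q)
{
	mutex_destroy(&q->send_lock);	/* pairs with mutex_init() above */
	kfree(q);
}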
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0a97ba02f61e..95d4cf777d24 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1220,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) sock_release(queue->sock); kfree(queue->pdu); + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); } @@ -1525,6 +1526,7 @@ err_sock: sock_release(queue->sock); queue->sock = NULL; err_destroy_mutex: + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); return ret; } -- cgit From 664227fde63844d69e9ec9e90a8a7801e6ff072d Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Sat, 7 Aug 2021 11:50:23 +0800 Subject: nvme-tcp: don't update queue count when failing to set io queues We update ctrl->queue_count and schedule another reconnect when the io queue count is zero. But we will never try to create any io queues in the next reconnection, because ctrl->queue_count is already set to zero. We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after the queue count zero check to fix it. Signed-off-by: Ruozhu Li Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 95d4cf777d24..645025620154 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1763,13 +1763,13 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->queue_count = nr_io_queues + 1; dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); -- cgit From 85032874f80ba17bf187de1d14d9603bf3f582b8 Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Wed, 28 Jul 2021 17:41:20 +0800 Subject: nvme-rdma: don't update queue count when failing to set io queues We update ctrl->queue_count and schedule another reconnect when the io queue count is zero. But we will never try to create any io queues in the next reconnection, because ctrl->queue_count is already set to zero. We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after the queue count zero check to fix it. 
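The shape of both fixes, as a hedged sketch (set_io_queue_count() stands in for the transport's negotiation helper): the negotiated count is validated first and ctrl->queue_count is only published on success, so a failed attempt cannot pin it at an unusable value for the next reconnect.

	ret = set_io_queue_count(ctrl, &nr_io_queues);	/* illustrative helper */
	if (ret)
		return ret;

	if (nr_io_queues == 0) {
		dev_err(ctrl->device, "unable to set any I/O queues\n");
		return -ENOMEM;			/* queue_count left untouched */
	}

	ctrl->queue_count = nr_io_queues + 1;	/* admin queue + I/O queues */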
Signed-off-by: Ruozhu Li Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 69ae67652f38..a68704e39084 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -735,13 +735,13 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->ctrl.queue_count = nr_io_queues + 1; - if (ctrl->ctrl.queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->ctrl.device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->ctrl.queue_count = nr_io_queues + 1; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); -- cgit From e804d5abe2d74cfe23f5f83be580d1cdc9307111 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 09:20:14 +0300 Subject: nvmet: pass back cntlid on successful completion According to the NVMe specification, the response dword 0 value of the Connect command is based on the status code: return the cntlid for successful completion, and return the IPO and IATTR for connect invalid parameters. Fix the missing error information for a zero-sized queue, and return the cntlid also for I/O queue Connect commands. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 7d0f3523fdab..8ef564c3b32c 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -120,6 +120,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; goto err; } @@ -260,11 +261,11 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) } status = nvmet_install_queue(ctrl, req); - if (status) { - /* pass back cntlid that had the issue of installing queue */ - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (status) goto out_ctrl_put; - } + + /* pass back cntlid for successful completion */ + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); -- cgit From b71df12605cabab47d58bd926badaf4130280e4d Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 18:06:15 +0300 Subject: nvmet: avoid duplicate qid in connect cmd According to the NVMe specification, if the host sends a Connect command specifying a queue id which has already been created, a status value of NVME_SC_CMD_SEQ_ERROR is returned. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 1 + drivers/nvme/target/fabrics-cmd.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ac7210a3ea1c..66d05eecc2a9 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -802,6 +802,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) * controller teardown as a result of a keep-alive expiration. 
*/ ctrl->reset_tbkas = true; + sq->ctrl->sqs[sq->qid] = NULL; nvmet_ctrl_put(ctrl); sq->ctrl = NULL; /* allows reusing the queue later */ } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 8ef564c3b32c..6f14da5cd08f 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -111,12 +111,6 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) struct nvmet_ctrl *old; u16 ret; - old = cmpxchg(&req->sq->ctrl, NULL, ctrl); - if (old) { - pr_warn("queue already connected!\n"); - req->error_loc = offsetof(struct nvmf_connect_command, opcode); - return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; - } if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); @@ -125,6 +119,19 @@ goto err; } + if (ctrl->sqs[qid] != NULL) { + pr_warn("qid %u has already been created\n", qid); + req->error_loc = offsetof(struct nvmf_connect_command, qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); @@ -139,6 +146,7 @@ if (ret) { pr_err("failed to install queue %d cntlid %d ret %x\n", qid, ctrl->cntlid, ret); + ctrl->sqs[qid] = NULL; goto err; } } -- cgit From e19e9f47f341cafcaf41253723f083223a4652a5 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Thu, 5 Aug 2021 18:02:51 +0300 Subject: nvmet: check that host sqsize does not exceed ctrl MQES Check that the host sqsize does not exceed the Maximum Queue Entries Supported (MQES) value reported by the controller. Signed-off-by: Amit Engel Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 6f14da5cd08f..7d0454cee920 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,6 +109,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) u16 qid = le16_to_cpu(c->qid); u16 sqsize = le16_to_cpu(c->sqsize); struct nvmet_ctrl *old; + u16 mqes = NVME_CAP_MQES(ctrl->cap); u16 ret; if (!sqsize) { @@ -125,6 +126,14 @@ return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } + if (sqsize > mqes) { + pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", + sqsize, mqes, ctrl->cntlid); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); -- cgit From b1a811633f7321cf1ae2bb76a66805b7720e44c9 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 12 Aug 2021 12:15:01 +0300 Subject: block: nbd: add sanity check for first_minor Syzbot hit a WARNING in internal_create_group(). The problem was an overly large disk->first_minor. 
disk->first_minor is initialized with a value that comes from userspace, and there were no sanity checks on its correctness. It can cause duplicate creation of sysfs files/links, because disk->first_minor will be passed to MKDEV(), which truncates it to a byte. Since the maximum minor value is 0xff, check that first_minor is a valid minor number. NOTE: the root cause of the reported warning was wrong error handling in register_disk(), but we can avoid passing knowingly wrong values to the sysfs API, because sysfs error messages can confuse users. For example: the user passed 1048576 as the index, but sysfs complains about duplicate creation of /dev/block/43:0. It's not obvious how 1048576 becomes 0. The log and reproducer for the above example can be found on the syzkaller bug report page. Link: https://syzkaller.appspot.com/bug?id=03c2ae9146416edf811958d5fd7acfab75b143d1 Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices") Reported-by: syzbot+9937dc42271cd87d4b98@syzkaller.appspotmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Skripkin Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0fe82626bf70..379032a64a7c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1750,7 +1750,17 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) refcount_set(&nbd->refs, refs); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; + + /* Too big first_minor can cause duplicate creation of + * sysfs files/links, since first_minor will be truncated to + * byte in __device_add_disk(). + */ disk->first_minor = index << part_shift; + if (disk->first_minor > 0xff) { + err = -EINVAL; + goto out_free_idr; + } + disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; -- cgit From 0866200ed7fdfbfba0c033aad63ff407e5368570 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 May 2021 08:59:46 -0700 Subject: nvme: Have NVME_FABRICS select NVME_CORE instead of transport drivers Transport drivers need both the core and fabrics modules. Instead of selecting both, have the selection be transitive, such that NVME_FABRICS selects NVME_CORE and transport drivers select NVME_FABRICS. Suggested-by: Keith Busch Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/Kconfig | 4 +--- drivers/nvme/target/Kconfig | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index c3f3d77f1aac..dc0450ca23a3 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -33,12 +33,12 @@ config NVME_HWMON in the system. 
config NVME_FABRICS + select NVME_CORE tristate config NVME_RDMA tristate "NVM Express over Fabrics RDMA host driver" depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -55,7 +55,6 @@ config NVME_FC tristate "NVM Express over Fabrics FC host driver" depends on BLOCK depends on HAS_DMA - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -72,7 +71,6 @@ config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET depends on BLOCK - select NVME_CORE select NVME_FABRICS select CRYPTO select CRYPTO_CRC32C diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4be2ececbc45..973561c93888 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -31,7 +31,6 @@ config NVME_TARGET_PASSTHRU config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -65,7 +64,6 @@ config NVME_TARGET_FC config NVME_TARGET_FCLOOP tristate "NVMe over Fabrics FC Transport Loopback Test driver" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL depends on NVME_FC -- cgit From 77979058dfcf4818abf7dd84423a7d66dafd8487 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 16 Aug 2021 09:24:52 -0700 Subject: nvme: remove nvm_ndev from ns Now that the lightnvm driver is removed, we don't need a pointer to its now non-existent struct. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 57d2ac00a6bd..37c5ef5a3331 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -444,7 +444,6 @@ struct nvme_ns { u32 ana_grpid; #endif struct list_head siblings; - struct nvm_dev *ndev; struct kref kref; struct nvme_ns_head *head; -- cgit From 9891668e43c8e9f2d0d50088b151edefc2e560e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:45:29 +0200 Subject: nvme: remove the unused NVME_NS_* enum These values are unused now that the lightnvm support is gone. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/nvme.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 37c5ef5a3331..a2e1f298b217 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -47,11 +47,6 @@ extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; -enum { - NVME_NS_LBA = 0, - NVME_NS_LIGHTNVM = 1, -}; - /* * List of workarounds for devices that required behavior not specified in * the standard. -- cgit From 93f63bc41f699318807df202a175d564c26bda87 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Aug 2021 18:31:03 +0200 Subject: nbd: add missing locking to the nbd_dev_add error path idr_remove needs external synchronization. 
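A hedged sketch of the rule being applied, with generic names rather than nbd's: IDR modifications are not self-locking, so the error path must take the same mutex the allocation path held.

#include <linux/idr.h>
#include <linux/mutex.h>

static DEFINE_IDR(example_idr);
static DEFINE_MUTEX(example_idr_lock);	/* serializes all IDR updates */

static void example_remove(int id)
{
	mutex_lock(&example_idr_lock);
	idr_remove(&example_idr, id);	/* needs external synchronization */
	mutex_unlock(&example_idr_lock);
}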
Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Signed-off-by: Tetsuo Handa [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-2-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 379032a64a7c..0c1389da3066 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1770,7 +1770,9 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) return nbd; out_free_idr: + mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); + mutex_unlock(&nbd_index_mutex); out_free_tags: blk_mq_free_tag_set(&nbd->tag_set); out_free_nbd: -- cgit From 409e0ff10ead30a620ee48acb6d4545d9cb95359 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 18:31:04 +0200 Subject: nbd: reset NBD to NULL when restarting in nbd_genl_connect When nbd_genl_connect restarts to wait for a disconnecting device, nbd needs to be reset to NULL. Do that by factoring out a helper to find an unused device. Fixes: 6177b56c96ff ("nbd: refactor device search and allocation in nbd_genl_connect") Reported-by: Tetsuo Handa Reported-by: Hillf Danton Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-3-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0c1389da3066..938ca7f5a11f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1781,6 +1781,20 @@ out: return ERR_PTR(err); } +static struct nbd_device *nbd_find_unused(void) +{ + struct nbd_device *nbd; + int id; + + lockdep_assert_held(&nbd_index_mutex); + + idr_for_each_entry(&nbd_index_idr, nbd, id) + if (!refcount_read(&nbd->config_refs)) + return nbd; + + return NULL; +} + /* Netlink interface. */ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { [NBD_ATTR_INDEX] = { .type = NLA_U32 }, @@ -1828,7 +1842,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { DECLARE_COMPLETION_ONSTACK(destroy_complete); - struct nbd_device *nbd = NULL; + struct nbd_device *nbd; struct nbd_config *config; int index = -1; int ret; @@ -1849,20 +1863,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) } again: mutex_lock(&nbd_index_mutex); - if (index == -1) { - struct nbd_device *tmp; - int id; - - idr_for_each_entry(&nbd_index_idr, tmp, id) { - if (!refcount_read(&tmp->config_refs)) { - nbd = tmp; - break; - } - } - } else { + if (index == -1) + nbd = nbd_find_unused(); + else nbd = idr_find(&nbd_index_idr, index); - } - if (nbd) { if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { -- cgit From 75b7f62aa65d5c496391ec2c3db3561aaf81a403 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Aug 2021 18:31:05 +0200 Subject: nbd: prevent IDR lookups from finding partially initialized devices Previously nbd_index_mutex was held during the whole add/remove/lookup operations in order to guarantee that partially initialized devices are not reachable via idr_find() or idr_for_each(). 
But now that devices become reachable as soon as idr_alloc() succeeds, lookups need to skip devices that are still only partially initialized. Since all lookup paths already use refcount_inc_not_zero(&nbd->refs) to skip devices that are being destroyed, update nbd->refs from zero to non-zero only as the last step of device initialization, so that the same check also skips partially initialized devices. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Signed-off-by: Tetsuo Handa [hch: split from a larger patch, added comments] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-4-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 938ca7f5a11f..b1ed2360ef32 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1747,7 +1747,11 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); - refcount_set(&nbd->refs, refs); + /* + * Start out with zero references to keep other threads from using + * this device until it is fully initialized. + */ + refcount_set(&nbd->refs, 0); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; @@ -1766,6 +1770,11 @@ disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); add_disk(disk); + + /* + * Now publish the device. + */ + refcount_set(&nbd->refs, refs); nbd_total_devices++; return nbd; -- cgit From b190300decb352a0b865d7aa379e89b17d772a43 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Aug 2021 18:31:06 +0200 Subject: nbd: set nbd->index before releasing nbd_index_mutex Set nbd->index before releasing nbd_index_mutex, as populate_nbd_status() might access nbd->index as soon as nbd_index_mutex is released. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Signed-off-by: Tetsuo Handa [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-5-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b1ed2360ef32..6a832bf81647 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1720,10 +1720,10 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err >= 0) index = err; } + nbd->index = index; mutex_unlock(&nbd_index_mutex); if (err < 0) goto out_free_tags; - nbd->index = index; disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); if (IS_ERR(disk)) { -- cgit From 438cd318c8dfa5228ffd43af1b98d7cd7d92e1c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 18:31:07 +0200 Subject: nbd: only return usable devices from nbd_find_unused Devices marked as NBD_DESTROY_ON_DISCONNECT can and should be skipped, given that they won't survive the disconnect. So skip them, try to grab a reference directly, and just continue if the device is being torn down or created and thus has a zero refcount. 
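In outline, the resulting lifecycle is a publish-last pattern; a simplified sketch (illustrative names, not the exact nbd code) of what the diff below relies on:

    /* Creation: device is in the IDR but unpublished (refs == 0). */
    refcount_set(&dev->refs, 0);
    idr_alloc(&dev_idr, dev, 0, 0, GFP_KERNEL);
    /* ... finish initialization ... */
    refcount_set(&dev->refs, initial_refs);        /* publish */

    /* Lookup: one check covers both dying and half-created devices,
     * since both have a zero refcount. */
    if (!refcount_inc_not_zero(&dev->refs))
            continue;                              /* skip this device */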
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-6-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6a832bf81647..70dc9d80a173 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1790,16 +1790,20 @@ out: return ERR_PTR(err); } -static struct nbd_device *nbd_find_unused(void) +static struct nbd_device *nbd_find_get_unused(void) { struct nbd_device *nbd; int id; lockdep_assert_held(&nbd_index_mutex); - idr_for_each_entry(&nbd_index_idr, nbd, id) - if (!refcount_read(&nbd->config_refs)) + idr_for_each_entry(&nbd_index_idr, nbd, id) { + if (refcount_read(&nbd->config_refs) || + test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) + continue; + if (refcount_inc_not_zero(&nbd->refs)) return nbd; + } return NULL; } @@ -1873,10 +1877,10 @@ again: mutex_lock(&nbd_index_mutex); if (index == -1) - nbd = nbd_find_unused(); + nbd = nbd_find_get_unused(); else nbd = idr_find(&nbd_index_idr, index); - if (nbd) { + if (nbd && index != -1) { if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { nbd->destroy_complete = &destroy_complete; @@ -1889,8 +1893,6 @@ again: if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - if (index == -1) - goto again; pr_err("nbd: device at index %d is going down\n", index); return -EINVAL; -- cgit From 7ee656c3ac3d047b4cf1269f83ac9d6c0bba916b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 18:31:08 +0200 Subject: nbd: remove nbd->destroy_complete The nbd->destroy_complete pointer is not really needed. For creating a device without a specific index we now simply skip devices marked NBD_DESTROY_ON_DISCONNECT, as there is not much point in reusing them. For device creation with a specific index there is no real need to treat the case of a requested but not finished disconnect differently from any other device that is being shut down, i.e. we can just return an error, as a slightly different race window would exist anyway. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Reported-by: Tetsuo Handa Reported-by: syzbot+2c98885bcd769f56b6d6@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-7-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 52 ++++++++++++++-------------------------------------- 1 file changed, 14 insertions(+), 38 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 70dc9d80a173..44143068c91e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -120,7 +120,6 @@ struct nbd_device { struct task_struct *task_recv; struct task_struct *task_setup; - struct completion *destroy_complete; unsigned long flags; char *backend; @@ -235,19 +234,6 @@ static const struct device_attribute backend_attr = { .show = backend_show, }; -/* - * Place this in the last just before the nbd is freed to - * make sure that the disk and the related kobject are also - * totally removed to avoid duplicate creation of the same - * one. 
- */ -static void nbd_notify_destroy_completion(struct nbd_device *nbd) -{ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - nbd->destroy_complete) - complete(nbd->destroy_complete); -} - static void nbd_dev_remove(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; @@ -262,7 +248,6 @@ static void nbd_dev_remove(struct nbd_device *nbd) */ mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, nbd->index); - nbd_notify_destroy_completion(nbd); mutex_unlock(&nbd_index_mutex); kfree(nbd); @@ -1702,7 +1687,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); - nbd->destroy_complete = NULL; nbd->backend = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); @@ -1854,7 +1838,6 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { - DECLARE_COMPLETION_ONSTACK(destroy_complete); struct nbd_device *nbd; struct nbd_config *config; int index = -1; @@ -1876,31 +1859,24 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) } again: mutex_lock(&nbd_index_mutex); - if (index == -1) { nbd = nbd_find_get_unused(); - } else { nbd = idr_find(&nbd_index_idr, index); - if (nbd && index != -1) { - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { - nbd->destroy_complete = &destroy_complete; - mutex_unlock(&nbd_index_mutex); - - /* wait until the nbd device is completely destroyed */ - wait_for_completion(&destroy_complete); - goto again; - } - - if (!refcount_inc_not_zero(&nbd->refs)) { - mutex_unlock(&nbd_index_mutex); - pr_err("nbd: device at index %d is going down\n", - index); - return -EINVAL; + if (nbd) { + if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) || + !refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + pr_err("nbd: device at index %d is going down\n", + index); + return -EINVAL; + } } - mutex_unlock(&nbd_index_mutex); - } else { - mutex_unlock(&nbd_index_mutex); + } + mutex_unlock(&nbd_index_mutex); + if (!nbd) { nbd = nbd_dev_add(index, 2); if (IS_ERR(nbd)) { pr_err("nbd: failed to add new device\n"); -- cgit From 46d4703b1db4c86ab5acb2331b10df999f005e8e Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Wed, 18 Aug 2021 13:57:48 +0800 Subject: md/raid10: Remove unnecessary rcu_dereference in raid10_handle_discard We are seeing the following warning in raid10_handle_discard. [ 695.110751] ============================= [ 695.131439] WARNING: suspicious RCU usage [ 695.151389] 4.18.0-319.el8.x86_64+debug #1 Not tainted [ 695.174413] ----------------------------- [ 695.192603] drivers/md/raid10.c:1776 suspicious rcu_dereference_check() usage! [ 695.225107] other info that might help us debug this: [ 695.260940] rcu_scheduler_active = 2, debug_locks = 1 [ 695.290157] no locks held by mkfs.xfs/10186. The first loop of raid10_handle_discard() already determines which disks need to handle the discard request and increments the rdev reference count rdev->nr_pending for each of them. So conf->mirrors will not change until all bios come back from the underlying disks, and the second loop doesn't need to use rcu_dereference to get the rdev. 
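The pinning pattern the patch relies on, as a condensed sketch (error handling omitted; submit_io_to() is a hypothetical stand-in for the real bio submission, other names follow the md core): once nr_pending is raised inside the RCU read-side section, the rdev cannot be detached, so later plain accesses are safe.

    rcu_read_lock();
    rdev = rcu_dereference(conf->mirrors[disk].rdev);
    if (rdev)
            atomic_inc(&rdev->nr_pending);   /* pin the rdev */
    rcu_read_unlock();

    /* Later, outside any RCU section: plain access is fine while pinned. */
    if (rdev) {
            submit_io_to(rdev);              /* hypothetical helper */
            rdev_dec_pending(rdev, mddev);   /* unpin on completion */
    }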
Cc: stable@vger.kernel.org Fixes: d30588b2731f ('md/raid10: improve raid10 discard request') Signed-off-by: Xiao Ni Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 16977e8e075d..d5d92337e35e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1712,6 +1712,11 @@ retry_discard: } else r10_bio->master_bio = (struct bio *)first_r10bio; + /* + * First select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); @@ -1743,9 +1748,6 @@ retry_discard: for (disk = 0; disk < geo->raid_disks; disk++) { sector_t dev_start, dev_end; struct bio *mbio, *rbio = NULL; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); /* * Now start to calculate the start and end address for each disk. @@ -1775,9 +1777,12 @@ retry_discard: /* * It only handles discard bio which size is >= stripe size, so - * dev_end > dev_start all the time + * dev_end > dev_start all the time. + * It doesn't need to use rcu lock to get rdev here. We already + * add rdev->nr_pending in the first loop. */ if (r10_bio->devs[disk].bio) { + struct md_rdev *rdev = conf->mirrors[disk].rdev; mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); mbio->bi_end_io = raid10_end_discard_request; mbio->bi_private = r10_bio; @@ -1790,6 +1795,7 @@ retry_discard: bio_endio(mbio); } if (r10_bio->devs[disk].repl_bio) { + struct md_rdev *rrdev = conf->mirrors[disk].replacement; rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); rbio->bi_end_io = raid10_end_discard_request; rbio->bi_private = r10_bio; -- cgit From 6607cd319b6b91bff94e90f798a61c031650b514 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Aug 2021 09:16:54 +0800 Subject: raid1: ensure write behind bio has less than BIO_MAX_VECS sectors We can't split a write behind bio with more than BIO_MAX_VECS sectors, otherwise the call trace below was triggered because we could allocate an oversized write behind bio later. [ 8.097936] bvec_alloc+0x90/0xc0 [ 8.098934] bio_alloc_bioset+0x1b3/0x260 [ 8.099959] raid1_make_request+0x9ce/0xc50 [raid1] [ 8.100988] ? __bio_clone_fast+0xa8/0xe0 [ 8.102008] md_handle_request+0x158/0x1d0 [md_mod] [ 8.103050] md_submit_bio+0xcd/0x110 [md_mod] [ 8.104084] submit_bio_noacct+0x139/0x530 [ 8.105127] submit_bio+0x78/0x1d0 [ 8.106163] ext4_io_submit+0x48/0x60 [ext4] [ 8.107242] ext4_writepages+0x652/0x1170 [ext4] [ 8.108300] ? do_writepages+0x41/0x100 [ 8.109338] ? __ext4_mark_inode_dirty+0x240/0x240 [ext4] [ 8.110406] do_writepages+0x41/0x100 [ 8.111450] __filemap_fdatawrite_range+0xc5/0x100 [ 8.112513] file_write_and_wait_range+0x61/0xb0 [ 8.113564] ext4_sync_file+0x73/0x370 [ext4] [ 8.114607] __x64_sys_fsync+0x33/0x60 [ 8.115635] do_syscall_64+0x33/0x40 [ 8.116670] entry_SYSCALL_64_after_hwframe+0x44/0xae Thanks to Christoph for the comment [1]. 
[1] https://bugs.archlinux.org/task/70992 Cc: stable@vger.kernel.org # v5.12+ Reported-by: Jens Stutte Tested-by: Jens Stutte Reviewed-by: Christoph Hellwig Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 51f2547c2007..21b348408478 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1331,6 +1331,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; + bool write_behind = false; if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1383,6 +1384,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + + /* + * The write-behind io is only attempted on drives marked as + * write-mostly, which means we could allocate a write behind + * bio later. + */ + if (rdev && test_bit(WriteMostly, &rdev->flags)) + write_behind = true; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1456,6 +1466,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, goto retry_write; } + /* + * When using a bitmap, we may call alloc_behind_master_bio below. + * alloc_behind_master_bio allocates a copy of the data payload a page + * at a time and thus needs a new bio that can fit the whole payload + * of this bio in page-sized chunks. + */ + if (write_behind && bitmap) + max_sectors = min_t(int, max_sectors, + BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); -- cgit From c7e9d0020361f4308a70cdfd6d5335e273eb8717 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Sat, 7 Aug 2021 10:37:02 +0300 Subject: Revert "floppy: reintroduce O_NDELAY fix" The patch breaks userspace implementations (e.g. fdutils) and introduces regressions in behaviour. Previously, it was possible to O_NDELAY open a floppy device with no media inserted or with write protected media without an error. Some userspace tools use this particular behavior for probing. It's not the first time we have reverted this patch. The previous revert is commit f2791e7eadf4 (Revert "floppy: refactor open() flags handling"). This reverts commit 8a0c014cd20516ade9654fc13b51345ec58e7be8. 
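The userspace probing behavior this restores can be sketched as follows (a hypothetical example of the pattern tools like fdutils rely on, not their actual code); on Linux, O_NDELAY is the same flag as O_NONBLOCK:

    #include <fcntl.h>
    #include <unistd.h>

    /* Returns 0 if the drive node can be opened without media present. */
    int probe_floppy(const char *dev)    /* e.g. "/dev/fd0" */
    {
            int fd = open(dev, O_RDONLY | O_NONBLOCK);
            if (fd < 0)
                    return -1;   /* the reverted patch made this fail */
            close(fd);
            return 0;
    }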
Link: https://lore.kernel.org/linux-block/de10cb47-34d1-5a88-7751-225ca380f735@compro.net/ Reported-by: Mark Hounschell Cc: Jiri Kosina Cc: Wim Osterholt Cc: Kurt Garloff Cc: Signed-off-by: Denis Efremov --- drivers/block/floppy.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 87460e0e5c72..fef79ea52e3e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4029,23 +4029,23 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (fdc_state[FDC(drive)].rawcmd == 1) fdc_state[FDC(drive)].rawcmd = 2; - if (mode & (FMODE_READ|FMODE_WRITE)) { - drive_state[drive].last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); - if (bdev_check_media_change(bdev)) - floppy_revalidate(bdev->bd_disk); - if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) + if (!(mode & FMODE_NDELAY)) { + if (mode & (FMODE_READ|FMODE_WRITE)) { + drive_state[drive].last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, + &drive_state[drive].flags); + if (bdev_check_media_change(bdev)) + floppy_revalidate(bdev->bd_disk); + if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) + goto out; + } + res = -EROFS; + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) goto out; } - - res = -EROFS; - - if ((mode & FMODE_WRITE) && - !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) - goto out; - mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); return 0; -- cgit