diff options
Diffstat (limited to 'drivers/block/xen-blkfront.c')
| -rw-r--r-- | drivers/block/xen-blkfront.c | 984 |
1 files changed, 432 insertions, 552 deletions
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0ed4b200fa58..04fc6b552c04 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -42,11 +42,13 @@ #include <linux/cdrom.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/scatterlist.h> #include <linux/bitmap.h> #include <linux/list.h> #include <linux/workqueue.h> +#include <linux/sched/mm.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -79,6 +81,7 @@ enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, BLKIF_STATE_SUSPENDED, + BLKIF_STATE_ERROR, }; struct grant { @@ -88,6 +91,7 @@ struct grant { }; enum blk_req_status { + REQ_PROCESSING, REQ_WAITING, REQ_DONE, REQ_ERROR, @@ -148,12 +152,13 @@ static unsigned int xen_blkif_max_ring_order; module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); +static bool __read_mostly xen_blkif_trusted = true; +module_param_named(trusted, xen_blkif_trusted, bool, 0644); +MODULE_PARM_DESC(trusted, "Is the backend trusted"); + #define BLK_RING_SIZE(info) \ __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) -#define BLK_MAX_RING_SIZE \ - __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) - /* * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 * characters are enough. Define to 20 to keep consistent with backend. 
@@ -177,12 +182,12 @@ struct blkfront_ring_info { unsigned int evtchn, irq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head indirect_pages; struct list_head grants; unsigned int persistent_gnts_c; unsigned long shadow_free; struct blkfront_info *dev_info; + struct blk_shadow shadow[]; }; /* @@ -197,6 +202,7 @@ struct blkfront_info struct gendisk *gd; u16 sector_size; unsigned int physical_sector_size; + unsigned long vdisk_info; int vdevice; blkif_vdev_t handle; enum blkif_state connected; @@ -207,7 +213,11 @@ struct blkfront_info unsigned int feature_fua:1; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; + /* Connect-time cached feature_persistent parameter */ + unsigned int feature_persistent_parm:1; + /* Persistent grants feature negotiation result */ unsigned int feature_persistent:1; + unsigned int bounce:1; unsigned int discard_granularity; unsigned int discard_alignment; /* Number of 4KB segments handled */ @@ -216,6 +226,7 @@ struct blkfront_info struct blk_mq_tag_set tag_set; struct blkfront_ring_info *rinfo; unsigned int nr_rings; + unsigned int rinfo_size; /* Save uncomplete reqs and bios for migration. 
*/ struct list_head requests; struct bio_list bio_list; @@ -226,8 +237,6 @@ static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); -#define GRANT_INVALID_REF 0 - #define PARTS_PER_DISK 16 #define PARTS_PER_EXT_DISK 256 @@ -262,6 +271,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static void blkfront_gather_backend_features(struct blkfront_info *info); static int negotiate_mq(struct blkfront_info *info); +#define for_each_rinfo(info, ptr, idx) \ + for ((ptr) = (info)->rinfo, (idx) = 0; \ + (idx) < (info)->nr_rings; \ + (idx)++, (ptr) = (void *)(ptr) + (info)->rinfo_size) + +static inline struct blkfront_ring_info * +get_rinfo(const struct blkfront_info *info, unsigned int i) +{ + BUG_ON(i >= info->nr_rings); + return (void *)info->rinfo + i * info->rinfo_size; +} + static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { unsigned long free = rinfo->shadow_free; @@ -297,8 +318,8 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) if (!gnt_list_entry) goto out_of_memory; - if (info->feature_persistent) { - granted_page = alloc_page(GFP_NOIO); + if (info->bounce) { + granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); if (!granted_page) { kfree(gnt_list_entry); goto out_of_memory; @@ -306,7 +327,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) gnt_list_entry->page = granted_page; } - gnt_list_entry->gref = GRANT_INVALID_REF; + gnt_list_entry->gref = INVALID_GRANT_REF; list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -317,7 +338,7 @@ out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, &rinfo->grants, node) { list_del(&gnt_list_entry->node); - if (info->feature_persistent) + if (info->bounce) __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; @@ -335,7 +356,7 @@ static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) node); list_del(&gnt_list_entry->node); - if (gnt_list_entry->gref != 
GRANT_INVALID_REF) + if (gnt_list_entry->gref != INVALID_GRANT_REF) rinfo->persistent_gnts_c--; return gnt_list_entry; @@ -357,13 +378,13 @@ static struct grant *get_grant(grant_ref_t *gref_head, struct grant *gnt_list_entry = get_free_grant(rinfo); struct blkfront_info *info = rinfo->dev_info; - if (gnt_list_entry->gref != GRANT_INVALID_REF) + if (gnt_list_entry->gref != INVALID_GRANT_REF) return gnt_list_entry; /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (info->feature_persistent) + if (info->bounce) grant_foreign_access(gnt_list_entry, info); else { /* Grant access to the GFN passed by the caller */ @@ -381,13 +402,13 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct grant *gnt_list_entry = get_free_grant(rinfo); struct blkfront_info *info = rinfo->dev_info; - if (gnt_list_entry->gref != GRANT_INVALID_REF) + if (gnt_list_entry->gref != INVALID_GRANT_REF) return gnt_list_entry; /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); - if (!info->feature_persistent) { + if (!info->bounce) { struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ @@ -472,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg) schedule_work(&rinfo->work); } -static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg) { /* We don't have real geometry info, but let's at least return values consistent with the size of the device */ - sector_t nsect = get_capacity(bd->bd_disk); + sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; hg->heads = 0xff; @@ -488,37 +509,25 @@ static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) return 0; } -static int blkif_ioctl(struct block_device *bdev, fmode_t mode, +static int blkif_ioctl(struct 
block_device *bdev, blk_mode_t mode, unsigned command, unsigned long argument) { struct blkfront_info *info = bdev->bd_disk->private_data; int i; - dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n", - command, (long)argument); - switch (command) { case CDROMMULTISESSION: - dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n"); for (i = 0; i < sizeof(struct cdrom_multisession); i++) if (put_user(0, (char __user *)(argument + i))) return -EFAULT; return 0; - - case CDROM_GET_CAPABILITY: { - struct gendisk *gd = info->gd; - if (gd->flags & GENHD_FL_CD) - return 0; - return -EINVAL; - } - + case CDROM_GET_CAPABILITY: + if (!(info->vdisk_info & VDISK_CDROM)) + return -EINVAL; + return 0; default: - /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", - command);*/ - return -EINVAL; /* same return as native Linux */ + return -EINVAL; } - - return 0; } static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, @@ -532,10 +541,10 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, id = get_id_from_freelist(rinfo); rinfo->shadow[id].request = req; - rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].status = REQ_PROCESSING; rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; - (*ring_req)->u.rw.id = id; + rinfo->shadow[id].req.u.rw.id = id; return id; } @@ -543,11 +552,12 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) { struct blkfront_info *info = rinfo->dev_info; - struct blkif_request *ring_req; + struct blkif_request *ring_req, *final_ring_req; unsigned long id; /* Fill out a communications ring structure. 
*/ - id = blkif_ring_get_request(rinfo, req, &ring_req); + id = blkif_ring_get_request(rinfo, req, &final_ring_req); + ring_req = &rinfo->shadow[id].req; ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -558,8 +568,9 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf else ring_req->u.discard.flag = 0; - /* Keep a private copy so we can reissue requests when recovering. */ - rinfo->shadow[id].req = *ring_req; + /* Copy the request to the ring page. */ + *final_ring_req = *ring_req; + rinfo->shadow[id].status = REQ_WAITING; return 0; } @@ -571,7 +582,7 @@ struct setup_rw_req { struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; - /* Only used when persistent grant is used and it's a read request */ + /* Only used when persistent grant is used and it's a write request */ bool need_copy; unsigned int bvec_off; char *bvec_data; @@ -692,6 +703,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri { struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req, *extra_ring_req = NULL; + struct blkif_request *final_ring_req, *final_extra_ring_req = NULL; unsigned long id, extra_id = NO_ASSOCIATED_ID; bool require_extra_req = false; int i; @@ -699,7 +711,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri .grant_idx = 0, .segments = NULL, .rinfo = rinfo, - .need_copy = rq_data_dir(req) && info->feature_persistent, + .need_copy = rq_data_dir(req) && info->bounce, }; /* @@ -736,9 +748,10 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri } /* Fill out a communications ring structure. 
*/ - id = blkif_ring_get_request(rinfo, req, &ring_req); + id = blkif_ring_get_request(rinfo, req, &final_ring_req); + ring_req = &rinfo->shadow[id].req; - num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); + num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) @@ -767,13 +780,19 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri ring_req->u.rw.handle = info->handle; ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) { + if (req_op(req) == REQ_OP_FLUSH || + (req_op(req) == REQ_OP_WRITE && (req->cmd_flags & REQ_FUA))) { /* * Ideally we can do an unordered flush-to-disk. * In case the backend onlysupports barriers, use that. * A barrier request a superset of FUA, so we can * implement it the same way. (It's also a FLUSH+FUA, * since it is guaranteed ordered WRT previous writes.) + * + * Note that we can end up here with a FUA write and the + * flags cleared. This happens when the flag was + * run-time disabled after a failing I/O, and we'll + * simply submit it as a normal write. */ if (info->feature_flush && info->feature_fua) ring_req->operation = @@ -781,13 +800,13 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri else if (info->feature_flush) ring_req->operation = BLKIF_OP_FLUSH_DISKCACHE; - else - ring_req->operation = 0; } ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) { extra_id = blkif_ring_get_request(rinfo, req, - &extra_ring_req); + &final_extra_ring_req); + extra_ring_req = &rinfo->shadow[extra_id].req; + /* * Only the first request contains the scatter-gather * list. 
@@ -829,10 +848,13 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri if (setup.segments) kunmap_atomic(setup.segments); - /* Keep a private copy so we can reissue requests when recovering. */ - rinfo->shadow[id].req = *ring_req; - if (unlikely(require_extra_req)) - rinfo->shadow[extra_id].req = *extra_ring_req; + /* Copy request(s) to the ring page. */ + *final_ring_req = *ring_req; + rinfo->shadow[id].status = REQ_WAITING; + if (unlikely(require_extra_req)) { + *final_extra_ring_req = *extra_ring_req; + rinfo->shadow[extra_id].status = REQ_WAITING; + } if (new_persistent_gnts) gnttab_free_grant_references(setup.gref_head); @@ -868,16 +890,6 @@ static inline void flush_requests(struct blkfront_ring_info *rinfo) notify_remote_via_irq(rinfo->irq); } -static inline bool blkif_request_flush_invalid(struct request *req, - struct blkfront_info *info) -{ - return (blk_rq_is_passthrough(req) || - ((req_op(req) == REQ_OP_FLUSH) && - !info->feature_flush) || - ((req->cmd_flags & REQ_FUA) && - !info->feature_fua)); -} - static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { @@ -886,16 +898,25 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, struct blkfront_info *info = hctx->queue->queuedata; struct blkfront_ring_info *rinfo = NULL; - BUG_ON(info->nr_rings <= qid); - rinfo = &info->rinfo[qid]; + rinfo = get_rinfo(info, qid); blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); - if (RING_FULL(&rinfo->ring)) - goto out_busy; - if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) - goto out_err; + /* + * Check if the backend actually supports flushes. + * + * While the block layer won't send us flushes if we don't claim to + * support them, the Xen protocol allows the backend to revoke support + * at any time. That is of course a really bad idea and dangerous, but + * has been allowed for 10+ years. 
In that case we simply clear the + * flags, and directly return here for an empty flush and ignore the + * FUA flag later on. + */ + if (unlikely(req_op(qd->rq) == REQ_OP_FLUSH && !info->feature_flush)) + goto complete; + if (RING_FULL(&rinfo->ring)) + goto out_busy; if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; @@ -903,14 +924,14 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_OK; -out_err: - spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_STS_IOERR; - out_busy: blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_DEV_RESOURCE; +complete: + spin_unlock_irqrestore(&rinfo->ring_lock, flags); + blk_mq_end_request(qd->rq, BLK_STS_OK); + return BLK_STS_OK; } static void blkif_complete_rq(struct request *rq) @@ -923,80 +944,41 @@ static const struct blk_mq_ops blkfront_mq_ops = { .complete = blkif_complete_rq, }; -static void blkif_set_queue_limits(struct blkfront_info *info) +static void blkif_set_queue_limits(const struct blkfront_info *info, + struct queue_limits *lim) { - struct request_queue *rq = info->rq; - struct gendisk *gd = info->gd; unsigned int segments = info->max_indirect_segments ? 
: BLKIF_MAX_SEGMENTS_PER_REQUEST; - blk_queue_flag_set(QUEUE_FLAG_VIRT, rq); - if (info->feature_discard) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, rq); - blk_queue_max_discard_sectors(rq, get_capacity(gd)); - rq->limits.discard_granularity = info->discard_granularity; - rq->limits.discard_alignment = info->discard_alignment; + lim->max_hw_discard_sectors = UINT_MAX; + if (info->discard_granularity) + lim->discard_granularity = info->discard_granularity; + lim->discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - blk_queue_flag_set(QUEUE_FLAG_SECERASE, rq); + lim->max_secure_erase_sectors = UINT_MAX; + } + + if (info->feature_flush) { + lim->features |= BLK_FEAT_WRITE_CACHE; + if (info->feature_fua) + lim->features |= BLK_FEAT_FUA; } /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_logical_block_size(rq, info->sector_size); - blk_queue_physical_block_size(rq, info->physical_sector_size); - blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); + lim->logical_block_size = info->sector_size; + lim->physical_block_size = info->physical_sector_size; + lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512; /* Each segment in a request is up to an aligned page in size. */ - blk_queue_segment_boundary(rq, PAGE_SIZE - 1); - blk_queue_max_segment_size(rq, PAGE_SIZE); + lim->seg_boundary_mask = PAGE_SIZE - 1; + lim->max_segment_size = PAGE_SIZE; /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); + lim->max_segments = segments / GRANTS_PER_PSEG; /* Make sure buffer addresses are sector-aligned. 
*/ - blk_queue_dma_alignment(rq, 511); -} - -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, - unsigned int physical_sector_size) -{ - struct request_queue *rq; - struct blkfront_info *info = gd->private_data; - - memset(&info->tag_set, 0, sizeof(info->tag_set)); - info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = info->nr_rings; - if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { - /* - * When indirect descriptior is not supported, the I/O request - * will be split between multiple request in the ring. - * To avoid problems when sending the request, divide by - * 2 the depth of the queue. - */ - info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; - } else - info->tag_set.queue_depth = BLK_RING_SIZE(info); - info->tag_set.numa_node = NUMA_NO_NODE; - info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; - info->tag_set.cmd_size = sizeof(struct blkif_req); - info->tag_set.driver_data = info; - - if (blk_mq_alloc_tag_set(&info->tag_set)) - return -EINVAL; - rq = blk_mq_init_queue(&info->tag_set); - if (IS_ERR(rq)) { - blk_mq_free_tag_set(&info->tag_set); - return PTR_ERR(rq); - } - - rq->queuedata = info; - info->rq = gd->queue = rq; - info->gd = gd; - info->sector_size = sector_size; - info->physical_sector_size = physical_sector_size; - blkif_set_queue_limits(info); - - return 0; + lim->dma_alignment = 511; } static const char *flush_info(struct blkfront_info *info) @@ -1011,13 +993,12 @@ static const char *flush_info(struct blkfront_info *info) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_write_cache(info->rq, info->feature_flush ? true : false, - info->feature_fua ? true : false); - pr_info("blkfront: %s: %s %s %s %s %s\n", + pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? "enabled;" : "disabled;", "indirect descriptors:", - info->max_indirect_segments ? 
"enabled;" : "disabled;"); + info->max_indirect_segments ? "enabled;" : "disabled;", + "bounce buffer:", info->bounce ? "enabled" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1089,10 +1070,9 @@ static char *encode_disk_name(char *ptr, unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, - u16 vdisk_info, u16 sector_size, - unsigned int physical_sector_size) + struct blkfront_info *info) { + struct queue_limits lim = {}; struct gendisk *gd; int nr_minors = 1; int err; @@ -1113,8 +1093,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (!VDEV_IS_EXTENDED(info->vdevice)) { err = xen_translate_vdev(info->vdevice, &minor, &offset); if (err) - return err; - nr_parts = PARTS_PER_DISK; + return err; + nr_parts = PARTS_PER_DISK; } else { minor = BLKIF_MINOR_EXT(info->vdevice); nr_parts = PARTS_PER_EXT_DISK; @@ -1135,12 +1115,35 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = xlbd_reserve_minors(minor, nr_minors); if (err) - goto out; - err = -ENODEV; + return err; - gd = alloc_disk(nr_minors); - if (gd == NULL) - goto release; + memset(&info->tag_set, 0, sizeof(info->tag_set)); + info->tag_set.ops = &blkfront_mq_ops; + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptior is not supported, the I/O request + * will be split between multiple request in the ring. + * To avoid problems when sending the request, divide by + * 2 the depth of the queue. 
+ */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.numa_node = NUMA_NO_NODE; + info->tag_set.cmd_size = sizeof(struct blkif_req); + info->tag_set.driver_data = info; + + err = blk_mq_alloc_tag_set(&info->tag_set); + if (err) + goto out_release_minors; + + blkif_set_queue_limits(info, &lim); + gd = blk_mq_alloc_disk(&info->tag_set, &lim, info); + if (IS_ERR(gd)) { + err = PTR_ERR(gd); + goto out_free_tag_set; + } strcpy(gd->disk_name, DEV_NAME); ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); @@ -1153,68 +1156,30 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->major = XENVBD_MAJOR; gd->first_minor = minor; + gd->minors = nr_minors; gd->fops = &xlvbd_block_fops; gd->private_data = info; set_capacity(gd, capacity); - if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size)) { - del_gendisk(gd); - goto release; - } + info->rq = gd->queue; + info->gd = gd; xlvbd_flush(info); - if (vdisk_info & VDISK_READONLY) + if (info->vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); - - if (vdisk_info & VDISK_REMOVABLE) + if (info->vdisk_info & VDISK_REMOVABLE) gd->flags |= GENHD_FL_REMOVABLE; - if (vdisk_info & VDISK_CDROM) - gd->flags |= GENHD_FL_CD; - return 0; - release: +out_free_tag_set: + blk_mq_free_tag_set(&info->tag_set); +out_release_minors: xlbd_release_minors(minor, nr_minors); - out: return err; } -static void xlvbd_release_gendisk(struct blkfront_info *info) -{ - unsigned int minor, nr_minors, i; - - if (info->rq == NULL) - return; - - /* No more blkif_request(). */ - blk_mq_stop_hw_queues(info->rq); - - for (i = 0; i < info->nr_rings; i++) { - struct blkfront_ring_info *rinfo = &info->rinfo[i]; - - /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&rinfo->callback); - - /* Flush gnttab callback work. Must be done with no locks held. 
*/ - flush_work(&rinfo->work); - } - - del_gendisk(info->gd); - - minor = info->gd->first_minor; - nr_minors = info->gd->minors; - xlbd_release_minors(minor, nr_minors); - - blk_cleanup_queue(info->rq); - blk_mq_free_tag_set(&info->tag_set); - info->rq = NULL; - - put_disk(info->gd); - info->gd = NULL; -} - /* Already hold rinfo->ring_lock. */ static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { @@ -1252,7 +1217,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - BUG_ON(info->feature_persistent); + BUG_ON(info->bounce); list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); @@ -1264,12 +1229,12 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) list_for_each_entry_safe(persistent_gnt, n, &rinfo->grants, node) { list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { + if (persistent_gnt->gref != INVALID_GRANT_REF) { gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); + NULL); rinfo->persistent_gnts_c--; } - if (info->feature_persistent) + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1289,8 +1254,8 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { persistent_gnt = rinfo->shadow[i].grants_used[j]; - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - if (info->feature_persistent) + gnttab_end_foreign_access(persistent_gnt->gref, NULL); + if (info->bounce) __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1304,17 +1269,17 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) for (j = 0; j < INDIRECT_GREFS(segs); j++) { persistent_gnt = rinfo->shadow[i].indirect_grants[j]; - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + 
gnttab_end_foreign_access(persistent_gnt->gref, NULL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(rinfo->shadow[i].grants_used); + kvfree(rinfo->shadow[i].grants_used); rinfo->shadow[i].grants_used = NULL; - kfree(rinfo->shadow[i].indirect_grants); + kvfree(rinfo->shadow[i].indirect_grants); rinfo->shadow[i].indirect_grants = NULL; - kfree(rinfo->shadow[i].sg); + kvfree(rinfo->shadow[i].sg); rinfo->shadow[i].sg = NULL; } @@ -1325,14 +1290,8 @@ free_shadow: flush_work(&rinfo->work); /* Free resources associated with old device channel. */ - for (i = 0; i < info->nr_ring_pages; i++) { - if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); - rinfo->ring_ref[i] = GRANT_INVALID_REF; - } - } - free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); - rinfo->ring.sring = NULL; + xenbus_teardown_ring((void **)&rinfo->ring.sring, info->nr_ring_pages, + rinfo->ring_ref); if (rinfo->irq) unbind_from_irqhandler(rinfo->irq, rinfo); @@ -1342,6 +1301,7 @@ free_shadow: static void blkif_free(struct blkfront_info *info, int suspend) { unsigned int i; + struct blkfront_ring_info *rinfo; /* Prevent new requests being issued until we fix things up. */ info->connected = suspend ? @@ -1350,10 +1310,10 @@ static void blkif_free(struct blkfront_info *info, int suspend) if (info->rq) blk_mq_stop_hw_queues(info->rq); - for (i = 0; i < info->nr_rings; i++) - blkif_free_ring(&info->rinfo[i]); + for_each_rinfo(info, rinfo, i) + blkif_free_ring(rinfo); - kfree(info->rinfo); + kvfree(info->rinfo); info->rinfo = NULL; info->nr_rings = 0; } @@ -1393,7 +1353,6 @@ static enum blk_req_status blkif_rsp_to_req_status(int rsp) case BLKIF_RSP_EOPNOTSUPP: return REQ_EOPNOTSUPP; case BLKIF_RSP_ERROR: - /* Fallthrough. 
*/ default: return REQ_ERROR; } @@ -1405,8 +1364,8 @@ static enum blk_req_status blkif_rsp_to_req_status(int rsp) static int blkif_get_final_status(enum blk_req_status s1, enum blk_req_status s2) { - BUG_ON(s1 == REQ_WAITING); - BUG_ON(s2 == REQ_WAITING); + BUG_ON(s1 < REQ_DONE); + BUG_ON(s2 < REQ_DONE); if (s1 == REQ_ERROR || s2 == REQ_ERROR) return BLKIF_RSP_ERROR; @@ -1415,9 +1374,15 @@ static int blkif_get_final_status(enum blk_req_status s1, return BLKIF_RSP_OKAY; } -static bool blkif_completion(unsigned long *id, - struct blkfront_ring_info *rinfo, - struct blkif_response *bret) +/* + * Return values: + * 1 response processed. + * 0 missing further responses. + * -1 error while processing. + */ +static int blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, + struct blkif_response *bret) { int i = 0; struct scatterlist *sg; @@ -1439,8 +1404,8 @@ static bool blkif_completion(unsigned long *id, s->status = blkif_rsp_to_req_status(bret->status); /* Wait the second response if not yet here. */ - if (s2->status == REQ_WAITING) - return false; + if (s2->status < REQ_DONE) + return 0; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1473,7 +1438,7 @@ static bool blkif_completion(unsigned long *id, data.s = s; num_sg = s->num_sg; - if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { + if (bret->operation == BLKIF_OP_READ && info->bounce) { for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); @@ -1491,139 +1456,172 @@ static bool blkif_completion(unsigned long *id, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { - if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first. 
*/ - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->grants_used[i]->gref); + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + return -1; + } list_add(&s->grants_used[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { /* - * If the grant is not mapped by the backend we end the - * foreign access and add it to the tail of the list, - * so it will not be picked again unless we run out of - * persistent grants. + * If the grant is not mapped by the backend we add it + * to the tail of the list, so it will not be picked + * again unless we run out of persistent grants. */ - gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); - s->grants_used[i]->gref = GRANT_INVALID_REF; + s->grants_used[i]->gref = INVALID_GRANT_REF; list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { - if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->indirect_grants[i]->gref); + if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + return -1; + } list_add(&s->indirect_grants[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { struct page *indirect_page; - gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); /* * Add the used indirect page back to the list of * available pages for indirect grefs. 
*/ - if (!info->feature_persistent) { + if (!info->bounce) { indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &rinfo->indirect_pages); } - s->indirect_grants[i]->gref = GRANT_INVALID_REF; + s->indirect_grants[i]->gref = INVALID_GRANT_REF; list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } - return true; + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) { struct request *req; - struct blkif_response *bret; + struct blkif_response bret; RING_IDX i, rp; unsigned long flags; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; struct blkfront_info *info = rinfo->dev_info; + unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); return IRQ_HANDLED; + } spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = rinfo->ring.sring->rsp_prod; - rmb(); /* Ensure we see queued responses up to 'rp'. */ + rp = READ_ONCE(rinfo->ring.sring->rsp_prod); + virt_rmb(); /* Ensure we see queued responses up to 'rp'. */ + if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) { + pr_alert("%s: illegal number of responses %u\n", + info->gd->disk_name, rp - rinfo->ring.rsp_cons); + goto err; + } for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; + unsigned int op; + + eoiflag = 0; + + RING_COPY_RESPONSE(&rinfo->ring, i, &bret); + id = bret.id; - bret = RING_GET_RESPONSE(&rinfo->ring, i); - id = bret->id; /* * The backend has messed up and given us an id that we would * never have given to it (we stamp it up to BLK_RING_SIZE - * look in get_id_from_freelist. */ if (id >= BLK_RING_SIZE(info)) { - WARN(1, "%s: response to %s has incorrect id (%ld)\n", - info->gd->disk_name, op_name(bret->operation), id); - /* We can't safely get the 'struct request' as - * the id is busted. 
*/ - continue; + pr_alert("%s: response has incorrect id (%ld)\n", + info->gd->disk_name, id); + goto err; + } + if (rinfo->shadow[id].status != REQ_WAITING) { + pr_alert("%s: response references no pending request\n", + info->gd->disk_name); + goto err; } + + rinfo->shadow[id].status = REQ_PROCESSING; req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) { + op = rinfo->shadow[id].req.operation; + if (op == BLKIF_OP_INDIRECT) + op = rinfo->shadow[id].req.u.indirect.indirect_op; + if (bret.operation != op) { + pr_alert("%s: response has wrong operation (%u instead of %u)\n", + info->gd->disk_name, bret.operation, op); + goto err; + } + + if (bret.operation != BLKIF_OP_DISCARD) { + int ret; + /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, bret)) + ret = blkif_completion(&id, rinfo, &bret); + if (!ret) continue; + if (unlikely(ret < 0)) + goto err; } if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", - info->gd->disk_name, op_name(bret->operation), id); + info->gd->disk_name, op_name(bret.operation), id); continue; } - if (bret->status == BLKIF_RSP_OKAY) + if (bret.status == BLKIF_RSP_OKAY) blkif_req(req)->error = BLK_STS_OK; else blkif_req(req)->error = BLK_STS_IOERR; - switch (bret->operation) { + switch (bret.operation) { case BLKIF_OP_DISCARD: - if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; - printk(KERN_WARNING "blkfront: %s: %s op failed\n", - info->gd->disk_name, op_name(bret->operation)); + + pr_warn_ratelimited("blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, rq); - blk_queue_flag_clear(QUEUE_FLAG_SECERASE, rq); + blk_queue_disable_discard(rq); + 
blk_queue_disable_secure_erase(rq); } break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: - if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: %s op failed\n", - info->gd->disk_name, op_name(bret->operation)); + if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { + pr_warn_ratelimited("blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; } - if (unlikely(bret->status == BLKIF_RSP_ERROR && + if (unlikely(bret.status == BLKIF_RSP_ERROR && rinfo->shadow[id].req.u.rw.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", - info->gd->disk_name, op_name(bret->operation)); + pr_warn_ratelimited("blkfront: %s: empty %s op failed\n", + info->gd->disk_name, op_name(bret.operation)); blkif_req(req)->error = BLK_STS_NOTSUPP; } if (unlikely(blkif_req(req)->error)) { @@ -1631,21 +1629,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_OK; info->feature_fua = 0; info->feature_flush = 0; - xlvbd_flush(info); } - /* fall through */ + fallthrough; case BLKIF_OP_READ: case BLKIF_OP_WRITE: - if (unlikely(bret->status != BLKIF_RSP_OKAY)) - dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " - "request: %x\n", bret->status); + if (unlikely(bret.status != BLKIF_RSP_OKAY)) + dev_dbg_ratelimited(&info->xbdev->dev, + "Bad return from blkdev data request: %#x\n", + bret.status); break; default: BUG(); } - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); } rinfo->ring.rsp_cons = i; @@ -1662,6 +1661,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) spin_unlock_irqrestore(&rinfo->ring_lock, flags); + xen_irq_lateeoi(irq, eoiflag); + + return IRQ_HANDLED; + + err: + info->connected = BLKIF_STATE_ERROR; + + spin_unlock_irqrestore(&rinfo->ring_lock, flags); + + /* No EOI in order to avoid further interrupts. 
*/ + + pr_alert("%s disabled for further use\n", info->gd->disk_name); return IRQ_HANDLED; } @@ -1670,38 +1681,23 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; - int err, i; + int err; struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; - grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; - - for (i = 0; i < info->nr_ring_pages; i++) - rinfo->ring_ref[i] = GRANT_INVALID_REF; - - sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, - get_order(ring_size)); - if (!sring) { - xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); - return -ENOMEM; - } - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); - if (err < 0) { - free_pages((unsigned long)sring, get_order(ring_size)); - rinfo->ring.sring = NULL; + err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring, + info->nr_ring_pages, rinfo->ring_ref); + if (err) goto fail; - } - for (i = 0; i < info->nr_ring_pages; i++) - rinfo->ring_ref[i] = gref[i]; + + XEN_FRONT_RING_INIT(&rinfo->ring, sring, ring_size); err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, - "blkif", rinfo); + err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt, + 0, "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); @@ -1763,11 +1759,11 @@ abort_transaction: return err; } -static void free_info(struct blkfront_info *info) -{ - list_del(&info->info_list); - kfree(info); -} +/* Enable the persistent grants feature. */ +static bool feature_persistent = true; +module_param(feature_persistent, bool, 0644); +MODULE_PARM_DESC(feature_persistent, + "Enables the persistent grants feature"); /* Common code used when first setting up, and when resuming. 
*/ static int talk_to_blkback(struct xenbus_device *dev, @@ -1778,10 +1774,15 @@ static int talk_to_blkback(struct xenbus_device *dev, int err; unsigned int i, max_page_order; unsigned int ring_page_order; + struct blkfront_ring_info *rinfo; if (!info) return -ENODEV; + /* Check if backend is trusted. */ + info->bounce = !xen_blkif_trusted || + !xenbus_read_unsigned(dev->nodename, "trusted", 1); + max_page_order = xenbus_read_unsigned(info->xbdev->otherend, "max-ring-page-order", 0); ring_page_order = min(xen_blkif_max_ring_order, max_page_order); @@ -1791,9 +1792,7 @@ static int talk_to_blkback(struct xenbus_device *dev, if (err) goto destroy_blkring; - for (i = 0; i < info->nr_rings; i++) { - struct blkfront_ring_info *rinfo = &info->rinfo[i]; - + for_each_rinfo(info, rinfo, i) { /* Create shared ring, alloc event channel. */ err = setup_blkring(dev, rinfo); if (err) @@ -1818,7 +1817,7 @@ again: /* We already got the number of queues/rings in _probe */ if (info->nr_rings == 1) { - err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + err = write_per_ring_nodes(xbt, info->rinfo, dev->nodename); if (err) goto destroy_blkring; } else { @@ -1840,10 +1839,10 @@ again: goto abort_transaction; } - for (i = 0; i < info->nr_rings; i++) { + for_each_rinfo(info, rinfo, i) { memset(path, 0, pathsize); snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); - err = write_per_ring_nodes(xbt, &info->rinfo[i], path); + err = write_per_ring_nodes(xbt, rinfo, path); if (err) { kfree(path); goto destroy_blkring; @@ -1857,8 +1856,9 @@ again: message = "writing protocol"; goto abort_transaction; } - err = xenbus_printf(xbt, dev->nodename, - "feature-persistent", "%u", 1); + info->feature_persistent_parm = feature_persistent; + err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", + info->feature_persistent_parm); if (err) dev_warn(&dev->dev, "writing persistent grants feature to xenbus"); @@ -1871,9 +1871,8 @@ again: goto destroy_blkring; } - for (i = 0; 
i < info->nr_rings; i++) { + for_each_rinfo(info, rinfo, i) { unsigned int j; - struct blkfront_ring_info *rinfo = &info->rinfo[i]; for (j = 0; j < BLK_RING_SIZE(info); j++) rinfo->shadow[j].req.u.rw.id = j + 1; @@ -1889,13 +1888,6 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - - mutex_lock(&blkfront_mutex); - free_info(info); - mutex_unlock(&blkfront_mutex); - - dev_set_drvdata(&dev->dev, NULL); - return err; } @@ -1903,6 +1895,7 @@ static int negotiate_mq(struct blkfront_info *info) { unsigned int backend_max_queues; unsigned int i; + struct blkfront_ring_info *rinfo; BUG_ON(info->nr_rings); @@ -1914,19 +1907,16 @@ static int negotiate_mq(struct blkfront_info *info) if (!info->nr_rings) info->nr_rings = 1; - info->rinfo = kcalloc(info->nr_rings, - sizeof(struct blkfront_ring_info), - GFP_KERNEL); + info->rinfo_size = struct_size(info->rinfo, shadow, + BLK_RING_SIZE(info)); + info->rinfo = kvcalloc(info->nr_rings, info->rinfo_size, GFP_KERNEL); if (!info->rinfo) { xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure"); info->nr_rings = 0; return -ENOMEM; } - for (i = 0; i < info->nr_rings; i++) { - struct blkfront_ring_info *rinfo; - - rinfo = &info->rinfo[i]; + for_each_rinfo(info, rinfo, i) { INIT_LIST_HEAD(&rinfo->indirect_pages); INIT_LIST_HEAD(&rinfo->grants); rinfo->dev_info = info; @@ -1935,7 +1925,8 @@ static int negotiate_mq(struct blkfront_info *info) } return 0; } -/** + +/* * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffer for communication with the backend, and * inform the backend of the appropriate details for those. 
Switch to @@ -2014,21 +2005,21 @@ static int blkfront_probe(struct xenbus_device *dev, static int blkif_recover(struct blkfront_info *info) { + struct queue_limits lim; unsigned int r_index; struct request *req, *n; int rc; struct bio *bio; - unsigned int segs; + struct blkfront_ring_info *rinfo; + lim = queue_limits_start_update(info->rq); blkfront_gather_backend_features(info); - /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ - blkif_set_queue_limits(info); - segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; - blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG); - - for (r_index = 0; r_index < info->nr_rings; r_index++) { - struct blkfront_ring_info *rinfo = &info->rinfo[r_index]; + blkif_set_queue_limits(info, &lim); + rc = queue_limits_commit_update(info->rq, &lim); + if (rc) + return rc; + for_each_rinfo(info, rinfo, r_index) { rc = blkfront_setup_indirect(rinfo); if (rc) return rc; @@ -2038,10 +2029,7 @@ static int blkif_recover(struct blkfront_info *info) /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - for (r_index = 0; r_index < info->nr_rings; r_index++) { - struct blkfront_ring_info *rinfo; - - rinfo = &info->rinfo[r_index]; + for_each_rinfo(info, rinfo, r_index) { /* Kick any other new requests queued since we resumed */ kick_pending_request_queues(rinfo); } @@ -2049,7 +2037,9 @@ static int blkif_recover(struct blkfront_info *info) list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); - BUG_ON(req->nr_phys_segments > segs); + BUG_ON(req->nr_phys_segments > + (info->max_indirect_segments ? 
: + BLKIF_MAX_SEGMENTS_PER_REQUEST)); blk_mq_requeue_request(req, false); } blk_mq_start_stopped_hw_queues(info->rq, true); @@ -2063,7 +2053,7 @@ static int blkif_recover(struct blkfront_info *info) return 0; } -/** +/* * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our blkif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the @@ -2074,13 +2064,13 @@ static int blkfront_resume(struct xenbus_device *dev) struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; unsigned int i, j; + struct blkfront_ring_info *rinfo; dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); bio_list_init(&info->bio_list); INIT_LIST_HEAD(&info->requests); - for (i = 0; i < info->nr_rings; i++) { - struct blkfront_ring_info *rinfo = &info->rinfo[i]; + for_each_rinfo(info, rinfo, i) { struct bio_list merge_bio; struct blk_shadow *shadow = rinfo->shadow; @@ -2132,55 +2122,37 @@ static int blkfront_resume(struct xenbus_device *dev) static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; - struct block_device *bdev = NULL; - - mutex_lock(&info->mutex); + struct blkfront_ring_info *rinfo; + unsigned int i; - if (xbdev->state == XenbusStateClosing) { - mutex_unlock(&info->mutex); + if (xbdev->state == XenbusStateClosing) return; - } - - if (info->gd) - bdev = bdget_disk(info->gd, 0); - mutex_unlock(&info->mutex); - - if (!bdev) { - xenbus_frontend_closed(xbdev); - return; + /* No more blkif_request(). */ + if (info->rq && info->gd) { + blk_mq_stop_hw_queues(info->rq); + blk_mark_disk_dead(info->gd); } - mutex_lock(&bdev->bd_mutex); + for_each_rinfo(info, rinfo, i) { + /* No more gnttab callback work. 
*/ + gnttab_cancel_free_callback(&rinfo->callback); - if (bdev->bd_openers) { - xenbus_dev_error(xbdev, -EBUSY, - "Device in use; refusing to close"); - xenbus_switch_state(xbdev, XenbusStateClosing); - } else { - xlvbd_release_gendisk(info); - xenbus_frontend_closed(xbdev); + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); } - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); + xenbus_frontend_closed(xbdev); } static void blkfront_setup_discard(struct blkfront_info *info) { - int err; - unsigned int discard_granularity; - unsigned int discard_alignment; - info->feature_discard = 1; - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "discard-granularity", "%u", &discard_granularity, - "discard-alignment", "%u", &discard_alignment, - NULL); - if (!err) { - info->discard_granularity = discard_granularity; - info->discard_alignment = discard_alignment; - } + info->discard_granularity = xenbus_read_unsigned(info->xbdev->otherend, + "discard-granularity", + 0); + info->discard_alignment = xenbus_read_unsigned(info->xbdev->otherend, + "discard-alignment", 0); info->feature_secdiscard = !!xenbus_read_unsigned(info->xbdev->otherend, "discard-secure", 0); @@ -2188,10 +2160,12 @@ static void blkfront_setup_discard(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { - unsigned int psegs, grants; + unsigned int psegs, grants, memflags; int err, i; struct blkfront_info *info = rinfo->dev_info; + memflags = memalloc_noio_save(); + if (info->max_indirect_segments == 0) { if (!HAS_EXTRA_REQ) grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; @@ -2213,17 +2187,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) if (err) goto out_of_memory; - if (!info->feature_persistent && info->max_indirect_segments) { + if (!info->bounce && info->max_indirect_segments) { /* - * We are using indirect descriptors but not persistent - * grants, we need to allocate a set of pages that can be + 
* We are using indirect descriptors but don't have a bounce + * buffer, we need to allocate a set of pages that can be * used for mapping indirect grefs */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { - struct page *indirect_page = alloc_page(GFP_NOIO); + struct page *indirect_page = alloc_page(GFP_KERNEL | + __GFP_ZERO); if (!indirect_page) goto out_of_memory; list_add(&indirect_page->lru, &rinfo->indirect_pages); @@ -2232,17 +2207,17 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) for (i = 0; i < BLK_RING_SIZE(info); i++) { rinfo->shadow[i].grants_used = - kcalloc(grants, - sizeof(rinfo->shadow[i].grants_used[0]), - GFP_NOIO); - rinfo->shadow[i].sg = kcalloc(psegs, - sizeof(rinfo->shadow[i].sg[0]), - GFP_NOIO); + kvcalloc(grants, + sizeof(rinfo->shadow[i].grants_used[0]), + GFP_KERNEL); + rinfo->shadow[i].sg = kvcalloc(psegs, + sizeof(rinfo->shadow[i].sg[0]), + GFP_KERNEL); if (info->max_indirect_segments) rinfo->shadow[i].indirect_grants = - kcalloc(INDIRECT_GREFS(grants), - sizeof(rinfo->shadow[i].indirect_grants[0]), - GFP_NOIO); + kvcalloc(INDIRECT_GREFS(grants), + sizeof(rinfo->shadow[i].indirect_grants[0]), + GFP_KERNEL); if ((rinfo->shadow[i].grants_used == NULL) || (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && @@ -2251,16 +2226,17 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) sg_init_table(rinfo->shadow[i].sg, psegs); } + memalloc_noio_restore(memflags); return 0; out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(rinfo->shadow[i].grants_used); + kvfree(rinfo->shadow[i].grants_used); rinfo->shadow[i].grants_used = NULL; - kfree(rinfo->shadow[i].sg); + kvfree(rinfo->shadow[i].sg); rinfo->shadow[i].sg = NULL; - kfree(rinfo->shadow[i].indirect_grants); + kvfree(rinfo->shadow[i].indirect_grants); rinfo->shadow[i].indirect_grants = NULL; } if (!list_empty(&rinfo->indirect_pages)) { @@ -2270,6 
+2246,9 @@ out_of_memory: __free_page(indirect_page); } } + + memalloc_noio_restore(memflags); + return -ENOMEM; } @@ -2308,9 +2287,12 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0)) blkfront_setup_discard(info); - info->feature_persistent = - !!xenbus_read_unsigned(info->xbdev->otherend, - "feature-persistent", 0); + if (info->feature_persistent_parm) + info->feature_persistent = + !!xenbus_read_unsigned(info->xbdev->otherend, + "feature-persistent", 0); + if (info->feature_persistent) + info->bounce = true; indirect_segments = xenbus_read_unsigned(info->xbdev->otherend, "feature-max-indirect-segments", 0); @@ -2334,11 +2316,8 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; - unsigned long sector_size; - unsigned int physical_sector_size; - unsigned int binfo; - char *envp[] = { "RESIZE=1", NULL }; int err, i; + struct blkfront_ring_info *rinfo; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2352,10 +2331,7 @@ static void blkfront_connect(struct blkfront_info *info) return; printk(KERN_INFO "Setting capacity to %Lu\n", sectors); - set_capacity(info->gd, sectors); - revalidate_disk(info->gd); - kobject_uevent_env(&disk_to_dev(info->gd)->kobj, - KOBJ_CHANGE, envp); + set_capacity_and_notify(info->gd, sectors); return; case BLKIF_STATE_SUSPENDED: @@ -2377,8 +2353,8 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", &sectors, - "info", "%u", &binfo, - "sector-size", "%lu", &sector_size, + "info", "%u", &info->vdisk_info, + "sector-size", "%lu", &info->sector_size, NULL); if (err) { xenbus_dev_fatal(info->xbdev, err, @@ -2388,16 +2364,16 @@ static void blkfront_connect(struct blkfront_info *info) } /* - * physcial-sector-size is a newer field, so old backends may not + * 
physical-sector-size is a newer field, so old backends may not * provide this. Assume physical sector size to be the same as * sector_size in that case. */ - physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, + info->physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, "physical-sector-size", - sector_size); + info->sector_size); blkfront_gather_backend_features(info); - for (i = 0; i < info->nr_rings; i++) { - err = blkfront_setup_indirect(&info->rinfo[i]); + for_each_rinfo(info, rinfo, i) { + err = blkfront_setup_indirect(rinfo); if (err) { xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", info->xbdev->otherend); @@ -2406,8 +2382,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, - physical_sector_size); + err = xlvbd_alloc_gendisk(sectors, info); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); @@ -2418,10 +2393,16 @@ static void blkfront_connect(struct blkfront_info *info) /* Kick pending requests. */ info->connected = BLKIF_STATE_CONNECTED; - for (i = 0; i < info->nr_rings; i++) - kick_pending_request_queues(&info->rinfo[i]); + for_each_rinfo(info, rinfo, i) + kick_pending_request_queues(rinfo); - device_add_disk(&info->xbdev->dev, info->gd, NULL); + err = device_add_disk(&info->xbdev->dev, info->gd, NULL); + if (err) { + put_disk(info->gd); + blk_mq_free_tag_set(&info->tag_set); + info->rq = NULL; + goto fail; + } info->is_ready = 1; return; @@ -2431,7 +2412,7 @@ fail: return; } -/** +/* * Callback received when the backend's state changes. 
*/ static void blkback_changed(struct xenbus_device *dev, @@ -2447,6 +2428,7 @@ static void blkback_changed(struct xenbus_device *dev, break; if (talk_to_blkback(dev, info)) break; + break; case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: @@ -2478,68 +2460,34 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateClosed: if (dev->state == XenbusStateClosed) break; - /* fall through */ + fallthrough; case XenbusStateClosing: - if (info) - blkfront_closing(info); + blkfront_closing(info); break; } } -static int blkfront_remove(struct xenbus_device *xbdev) +static void blkfront_remove(struct xenbus_device *xbdev) { struct blkfront_info *info = dev_get_drvdata(&xbdev->dev); - struct block_device *bdev = NULL; - struct gendisk *disk; dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename); - if (!info) - return 0; - - blkif_free(info, 0); - - mutex_lock(&info->mutex); - - disk = info->gd; - if (disk) - bdev = bdget_disk(disk, 0); - - info->xbdev = NULL; - mutex_unlock(&info->mutex); - - if (!bdev) { - mutex_lock(&blkfront_mutex); - free_info(info); - mutex_unlock(&blkfront_mutex); - return 0; - } - - /* - * The xbdev was removed before we reached the Closed - * state. See if it's safe to remove the disk. If the bdev - * isn't closed yet, we let release take care of it. 
- */ - - mutex_lock(&bdev->bd_mutex); - info = disk->private_data; + if (info->gd) + del_gendisk(info->gd); - dev_warn(disk_to_dev(disk), - "%s was hot-unplugged, %d stale handles\n", - xbdev->nodename, bdev->bd_openers); + mutex_lock(&blkfront_mutex); + list_del(&info->info_list); + mutex_unlock(&blkfront_mutex); - if (info && !bdev->bd_openers) { - xlvbd_release_gendisk(info); - disk->private_data = NULL; - mutex_lock(&blkfront_mutex); - free_info(info); - mutex_unlock(&blkfront_mutex); + blkif_free(info, 0); + if (info->gd) { + xlbd_release_minors(info->gd->first_minor, info->gd->minors); + put_disk(info->gd); + blk_mq_free_tag_set(&info->tag_set); } - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - - return 0; + kfree(info); } static int blkfront_is_ready(struct xenbus_device *dev) @@ -2549,89 +2497,12 @@ static int blkfront_is_ready(struct xenbus_device *dev) return info->is_ready && info->xbdev; } -static int blkif_open(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - struct blkfront_info *info; - int err = 0; - - mutex_lock(&blkfront_mutex); - - info = disk->private_data; - if (!info) { - /* xbdev gone */ - err = -ERESTARTSYS; - goto out; - } - - mutex_lock(&info->mutex); - - if (!info->gd) - /* xbdev is closed */ - err = -ERESTARTSYS; - - mutex_unlock(&info->mutex); - -out: - mutex_unlock(&blkfront_mutex); - return err; -} - -static void blkif_release(struct gendisk *disk, fmode_t mode) -{ - struct blkfront_info *info = disk->private_data; - struct block_device *bdev; - struct xenbus_device *xbdev; - - mutex_lock(&blkfront_mutex); - - bdev = bdget_disk(disk, 0); - - if (!bdev) { - WARN(1, "Block device %s yanked out from us!\n", disk->disk_name); - goto out_mutex; - } - if (bdev->bd_openers) - goto out; - - /* - * Check if we have been instructed to close. We will have - * deferred this request, because the bdev was still open. 
- */ - - mutex_lock(&info->mutex); - xbdev = info->xbdev; - - if (xbdev && xbdev->state == XenbusStateClosing) { - /* pending switch to state closed */ - dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); - xlvbd_release_gendisk(info); - xenbus_frontend_closed(info->xbdev); - } - - mutex_unlock(&info->mutex); - - if (!xbdev) { - /* sudden device removal */ - dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); - xlvbd_release_gendisk(info); - disk->private_data = NULL; - free_info(info); - } - -out: - bdput(bdev); -out_mutex: - mutex_unlock(&blkfront_mutex); -} - static const struct block_device_operations xlvbd_block_fops = { .owner = THIS_MODULE, - .open = blkif_open, - .release = blkif_release, .getgeo = blkif_getgeo, .ioctl = blkif_ioctl, + .compat_ioctl = blkdev_compat_ptr_ioctl, }; @@ -2653,10 +2524,11 @@ static void purge_persistent_grants(struct blkfront_info *info) { unsigned int i; unsigned long flags; + struct blkfront_ring_info *rinfo; - for (i = 0; i < info->nr_rings; i++) { - struct blkfront_ring_info *rinfo = &info->rinfo[i]; + for_each_rinfo(info, rinfo, i) { struct grant *gnt_list_entry, *tmp; + LIST_HEAD(grants); spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2667,17 +2539,18 @@ static void purge_persistent_grants(struct blkfront_info *info) list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, node) { - if (gnt_list_entry->gref == GRANT_INVALID_REF || - gnttab_query_foreign_access(gnt_list_entry->gref)) + if (gnt_list_entry->gref == INVALID_GRANT_REF || + !gnttab_try_end_foreign_access(gnt_list_entry->gref)) continue; list_del(&gnt_list_entry->node); - gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; - gnt_list_entry->gref = GRANT_INVALID_REF; - list_add_tail(&gnt_list_entry->node, &rinfo->grants); + gnt_list_entry->gref = INVALID_GRANT_REF; + list_add_tail(&gnt_list_entry->node, &grants); } + list_splice_tail(&grants, &rinfo->grants); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } 
} @@ -2687,6 +2560,13 @@ static void blkfront_delay_work(struct work_struct *work) struct blkfront_info *info; bool need_schedule_work = false; + /* + * Note that when using bounce buffers but not persistent grants + * there's no need to run blkfront_delay_work because grants are + * revoked in blkif_completion or else an error is reported and the + * connection is closed. + */ + mutex_lock(&blkfront_mutex); list_for_each_entry(info, &info_list, info_list) { |
