From d15156138dad40205c9fdd9abe85c9e1479ae272 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Tue, 1 Jul 2014 10:48:05 -0600 Subject: block SG_IO: add SG_FLAG_Q_AT_HEAD flag After the SG_IO ioctl was copied into the block layer and later into the bsg driver, subtle differences emerged. One difference is the way injected commands are queued through the block layer (i.e. this is not SCSI device queueing nor SATA NCQ). Summarizing: - SG_IO on block layer device: blk_exec*(at_head=false) - sg device SG_IO: at_head=true - bsg device SG_IO: at_head=true Some time ago Boaz Harrosh introduced a sg v4 flag called BSG_FLAG_Q_AT_TAIL to override the bsg driver default. A recent patch titled: "sg: add SG_FLAG_Q_AT_TAIL flag" allowed the sg driver default to be overridden. This patch allows a SG_IO ioctl sent to a block layer device to have its default overridden. ChangeLog: - introduce SG_FLAG_Q_AT_HEAD flag in sg.h to cause commands that are injected via a block layer device SG_IO ioctl to set at_head=true - make comments clearer about queueing in sg.h since the header is used both by the sg device and block layer device implementations of the SG_IO ioctl. - introduce BSG_FLAG_Q_AT_HEAD in bsg.h for compatibility (it does nothing) and update comments. Signed-off-by: Douglas Gilbert Reviewed-by: Christoph Hellwig Reviewed-by: Mike Christie Signed-off-by: Jens Axboe --- include/scsi/sg.h | 3 +++ include/uapi/linux/bsg.h | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/scsi/sg.h b/include/scsi/sg.h index a9f3c6fc3f57..4734c15ab5d6 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -129,6 +129,9 @@ typedef struct sg_io_hdr #define SG_FLAG_MMAP_IO 4 /* request memory mapped IO */ #define SG_FLAG_NO_DXFER 0x10000 /* no transfer of kernel buffers to/from */ /* user space (debug indirect IO) */ +/* defaults:: for sg driver: Q_AT_HEAD; for block layer: Q_AT_TAIL */ +#define SG_FLAG_Q_AT_TAIL 0x10 +#define SG_FLAG_Q_AT_HEAD 0x20 /* following 'info' values are "or"-ed together */ #define SG_INFO_OK_MASK 0x1 diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h index 7a12e1c0f371..02986cf8b6f1 100644 --- a/include/uapi/linux/bsg.h +++ b/include/uapi/linux/bsg.h @@ -10,12 +10,13 @@ #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT 2 /* - * For flags member below - * sg.h sg_io_hdr also has bits defined for it's flags member. However - * none of these bits are implemented/used by bsg. The bits below are - * allocated to not conflict with sg.h ones anyway. + * For flag constants below: + * sg.h sg_io_hdr also has bits defined for it's flags member. These + * two flag values (0x10 and 0x20) have the same meaning in sg.h . For + * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the deafult. */ -#define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */ +#define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */ +#define BSG_FLAG_Q_AT_HEAD 0x20 struct sg_io_v4 { __s32 guard; /* [i] 'Q' to differentiate from v3 */ -- cgit From cb553215d5d277d4838d7d6b7722e964bcf5ca1f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 26 Jun 2014 17:41:47 +0800 Subject: include/uapi/linux/virtio_blk.h: introduce feature of VIRTIO_BLK_F_MQ Current virtio-blk spec only supports one virtual queue for transfering data between VM and host, and inside VM all kinds of operations on the virtual queue needs to hold one lock, so cause below problems: - bad scalability - bad throughput This patch requests to introduce feature of VIRTIO_BLK_F_MQ so that more than one virtual queues can be used to virtio-blk device, then above problems can be solved or eased. Signed-off-by: Ming Lei Acked-by: Michael S. Tsirkin Signed-off-by: Jens Axboe --- include/uapi/linux/virtio_blk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 6d8e61c48563..9ad67b267584 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -40,6 +40,7 @@ #define VIRTIO_BLK_F_WCE 9 /* Writeback mode enabled after reset */ #define VIRTIO_BLK_F_TOPOLOGY 10 /* Topology information is available */ #define VIRTIO_BLK_F_CONFIG_WCE 11 /* Writeback mode available in config */ +#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ #ifndef __KERNEL__ /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */ @@ -77,6 +78,10 @@ struct virtio_blk_config { /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ __u8 wce; + __u8 unused; + + /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */ + __u16 num_queues; } __attribute__((packed)); /* -- cgit From 8fe39aac0578cbb0abf27e1be70ff581e0c1d836 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 22 Nov 2013 13:22:13 +0100 Subject: drbd: device->ldev is not guaranteed on an D_ATTACHING disk Some parts of the code assumed that get_ldev_if_state(device, D_ATTACHING) is sufficient to access the ldev member of the device object. That was wrong. ldev may not be there or might be freed at any time if the device has a disk state of D_ATTACHING. bm_rw() Documented that drbd_bm_read() is only called from drbd_adm_attach. drbd_bm_write() is only called when a reference is held, and it is documented that a caller has to hold a reference before calling drbd_bm_write() drbd_bm_write_page() Use get_ldev() instead of get_ldev_if_state(device, D_ATTACHING) drbd_bmio_set_n_write() No longer use get_ldev_if_state(device, D_ATTACHING). All callers hold a reference to ldev now. drbd_bmio_clear_n_write() All callers where holding a reference of ldev anyways. Remove the misleading get_ldev_if_state(device, D_ATTACHING) drbd_reconsider_max_bio_size() Removed the get_ldev_if_state(device, D_ATTACHING). All callers now pass a struct drbd_backing_dev* when they have a proper reference, or a NULL pointer. Before this fix, the receiver could trigger a NULL pointer deref when in drbd_reconsider_max_bio_size() drbd_bump_write_ordering() Used get_ldev_if_state(device, D_ATTACHING) with the wrong assumption. Remove it, and allow the caller to pass in a struct drbd_backing_dev* when the caller knows that accessing this bdev is safe. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 3dbe9bd57a09..20ec8903b1e4 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -245,7 +245,7 @@ enum drbd_disk_state { D_DISKLESS, D_ATTACHING, /* In the process of reading the meta-data */ D_FAILED, /* Becomes D_DISKLESS as soon as we told it the peer */ - /* when >= D_FAILED it is legal to access mdev->bc */ + /* when >= D_FAILED it is legal to access mdev->ldev */ D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */ D_INCONSISTENT, D_OUTDATED, -- cgit From aaaba34576407857f6146ff6c330f06e63fb2bf2 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 18 Mar 2014 12:30:09 +0100 Subject: drbd: implement csums-after-crash-only Checksum based resync trades CPU cycles for network bandwidth, in situations where we expect much of the to-be-resynced blocks to be actually identical on both sides already. In a "network hickup" scenario, it won't help: all to-be-resynced blocks will typically be different. The use case is for the resync of *potentially* different blocks after crash recovery -- the crash recovery had marked larger areas (those covered by the activity log) as need-to-be-resynced, just in case. Most of those blocks will be identical. This option makes it possible to configure checksum based resync, but only actually use it for the first resync after primary crash. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd_genl.h | 3 +++ include/linux/drbd_limits.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 4193f5f2636c..71fc924c53fa 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -171,6 +171,9 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ + /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */ + /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */ + __flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF) ) GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 17e50bb00521..9d2df1d51414 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -214,6 +214,7 @@ #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 #define DRBD_ALWAYS_ASBP_DEF 0 #define DRBD_USE_RLE_DEF 1 +#define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0 #define DRBD_AL_STRIPES_MIN 1 #define DRBD_AL_STRIPES_MAX 1024 -- cgit From 5d0b17f1a29e8189d04aef447a3a53cfd05529b2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 18 Mar 2014 14:24:35 +0100 Subject: drbd: New net configuration option socket-check-timeout In setups involving a DRBD-proxy and connections that experience a lot of buffer-bloat it might be necessary to set ping-timeout to an unusual high value. By default DRBD uses the same value to wait if a newly established TCP-connection is stable. Since the DRBD-proxy is usually located in the same data center such a long wait time may hinder DRBD's connect process. In such setups socket-check-timeout should be set to at least to the round trip time between DRBD and DRBD-proxy. I.e. in most cases to 1. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd_genl.h | 1 + include/linux/drbd_limits.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 71fc924c53fa..7b131ed8f9c6 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -174,6 +174,7 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */ /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */ __flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF) + __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF) ) GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 9d2df1d51414..8ac8c5d9a3ad 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -225,4 +225,9 @@ #define DRBD_AL_STRIPE_SIZE_MAX 16777216 #define DRBD_AL_STRIPE_SIZE_DEF 32 #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */ + +#define DRBD_SOCKET_CHECK_TIMEO_MIN 0 +#define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX +#define DRBD_SOCKET_CHECK_TIMEO_DEF 0 +#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1' #endif -- cgit From bf0d6e4a1138e71cafdbbb99cde430eee50c4ff1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 6 May 2014 14:28:32 +0300 Subject: drbd: silence underflow warning in read_in_block() My static checker warns that "data_size" could be negative and underflow the limit check. The code looks suspicious but I don't know if it is a real bug. Signed-off-by: Dan Carpenter Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 20ec8903b1e4..debb70d40547 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -52,7 +52,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.3" +#define REL_VERSION "8.4.5" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit From 60ae81eee86dd7a520db8c1e3d702b49fc0418b5 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Thu, 22 May 2014 12:14:24 -0700 Subject: bcache: bcache_write tracepoint was crashing Signed-off-by: Kent Overstreet --- include/trace/events/bcache.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index c9c3c044b32f..6778e4135a8e 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -148,11 +148,13 @@ TRACE_EVENT(bcache_read, ); TRACE_EVENT(bcache_write, - TP_PROTO(struct bio *bio, bool writeback, bool bypass), - TP_ARGS(bio, writeback, bypass), + TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio, + bool writeback, bool bypass), + TP_ARGS(c, inode, bio, writeback, bypass), TP_STRUCT__entry( - __field(dev_t, dev ) + __array(char, uuid, 16 ) + __field(u64, inode ) __field(sector_t, sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) @@ -161,7 +163,8 @@ TRACE_EVENT(bcache_write, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + memcpy(__entry->uuid, c->sb.set_uuid, 16); + __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); @@ -169,8 +172,8 @@ TRACE_EVENT(bcache_write, __entry->bypass = bypass; ), - TP_printk("%d,%d %s %llu + %u hit %u bypass %u", - MAJOR(__entry->dev), MINOR(__entry->dev), + TP_printk("%pU inode %llu %s %llu + %u hit %u bypass %u", + __entry->uuid, __entry->inode, __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->writeback, __entry->bypass) ); -- cgit From 913dc33fb2720fb5f979011664294137ddd8b13b Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Fri, 23 May 2014 11:18:35 -0700 Subject: bcache: fix crash in bcache_btree_node_alloc_fail tracepoint 'b' was NULL. Change-Id: Icac0fd04afa2d23f213d96d51afd53374e6dd0c0 --- include/trace/events/bcache.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 6778e4135a8e..981acf74b14f 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -261,9 +261,9 @@ DEFINE_EVENT(btree_node, bcache_btree_node_alloc, TP_ARGS(b) ); -DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail, - TP_PROTO(struct btree *b), - TP_ARGS(b) +DEFINE_EVENT(cache_set, bcache_btree_node_alloc_fail, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); DEFINE_EVENT(btree_node, bcache_btree_node_free, -- cgit