Diffstat (limited to 'drivers')
-rw-r--r--  drivers/block/drbd/drbd_actlog.c    246
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c     13
-rw-r--r--  drivers/block/drbd/drbd_int.h       179
-rw-r--r--  drivers/block/drbd/drbd_main.c      244
-rw-r--r--  drivers/block/drbd/drbd_nl.c        129
-rw-r--r--  drivers/block/drbd/drbd_receiver.c    4
-rw-r--r--  drivers/block/drbd/drbd_req.c       166
-rw-r--r--  drivers/block/drbd/drbd_worker.c      5
8 files changed, 732 insertions, 254 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 92510f8ad013..6afe173d5c2b 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -104,7 +104,6 @@ struct update_al_work {
int err;
};
-static int al_write_transaction(struct drbd_conf *mdev);
void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
- if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
+ if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
+ /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+ ;
+ else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
+ /* Corresponding put_ldev in drbd_md_io_complete() */
dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
err = -ENODEV;
goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
BUG_ON(!bdev->md_bdev);
- dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+ dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
current->comm, current->pid, __func__,
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+ (void*)_RET_IP_ );
if (sector < drbd_md_first_sector(bdev) ||
sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+ /* we do all our meta data IO in aligned 4k blocks. */
+ err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
if (err) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
return err;
}
-static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
{
- struct lc_element *al_ext;
struct lc_element *tmp;
- int wake;
-
- spin_lock_irq(&mdev->al_lock);
tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
if (unlikely(tmp != NULL)) {
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
- if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
- wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
- spin_unlock_irq(&mdev->al_lock);
- if (wake)
- wake_up(&mdev->al_wait);
- return NULL;
- }
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+ return bm_ext;
+ }
+ return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
+{
+ struct lc_element *al_ext;
+ struct bm_extent *bm_ext;
+ int wake;
+
+ spin_lock_irq(&mdev->al_lock);
+ bm_ext = find_active_resync_extent(mdev, enr);
+ if (bm_ext) {
+ wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+ spin_unlock_irq(&mdev->al_lock);
+ if (wake)
+ wake_up(&mdev->al_wait);
+ return NULL;
}
- al_ext = lc_get(mdev->act_log, enr);
+ if (nonblock)
+ al_ext = lc_try_get(mdev->act_log, enr);
+ else
+ al_ext = lc_get(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
return al_ext;
}
-void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
+bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
{
/* for bios crossing activity log extent boundaries,
* we may need to activate two extents in one go */
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
- unsigned enr;
- bool locked = false;
+ D_ASSERT((unsigned)(last - first) <= 1);
+ D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+ /* FIXME figure out a fast path for bios crossing AL extent boundaries */
+ if (first != last)
+ return false;
+
+ return _al_get(mdev, first, true);
+}
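
[Editor's note] A minimal standalone sketch of the extent-index arithmetic in the fast path, assuming AL_EXTENT_SHIFT == 22 (one activity log extent covers 4 MiB, per the define in drbd_int.h below). A 4 KiB bio starting at sector 8190 straddles the extent boundary at sector 8192, so first != last and the fast path declines:

#include <stdio.h>

#define AL_EXTENT_SHIFT 22	/* one AL extent covers 4 MiB */

int main(void)
{
	unsigned long long sector = 8190;	/* 512-byte sectors */
	unsigned size = 4096;			/* bytes */
	unsigned first = sector >> (AL_EXTENT_SHIFT - 9);
	unsigned last = (sector + (size >> 9) - 1) >> (AL_EXTENT_SHIFT - 9);

	/* prints "first=0 last=1": the bio crosses an extent boundary,
	 * so drbd_al_begin_io_fastpath() would return false here */
	printf("first=%u last=%u\n", first, last);
	return 0;
}
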
+
+bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool need_transaction = false;
D_ASSERT(first <= last);
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
- for (enr = first; enr <= last; enr++)
- wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ wait_event(mdev->al_wait,
+ (al_ext = _al_get(mdev, enr, false)) != NULL);
+ if (al_ext->lc_number != enr)
+ need_transaction = true;
+ }
+ return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
+
+/* When called through generic_make_request(), we must delegate
+ * activity log I/O to the worker thread: a further request
+ * submitted via generic_make_request() within the same task
+ * would be queued on current->bio_list, and would only start
+ * after this function returns (see generic_make_request()).
+ *
+ * However, if we *are* the worker, we must not delegate to ourselves.
+ */
+
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
+{
+ bool locked = false;
+
+ BUG_ON(delegate && current == mdev->tconn->worker.task);
/* Serialize multiple transactions.
* This uses test_and_set_bit, memory barrier is implicit.
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
(locked = lc_try_lock_for_transaction(mdev->act_log)));
if (locked) {
- /* drbd_al_write_transaction(mdev,al_ext,enr);
- * recurses into generic_make_request(), which
- * disallows recursion, bios being serialized on the
- * current->bio_tail list now.
- * we have to delegate updates to the activity log
- * to the worker thread. */
-
/* Double check: it may have been committed by someone else,
* while we have been waiting for the lock. */
if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
rcu_read_unlock();
- if (write_al_updates) {
- al_write_transaction(mdev);
- mdev->al_writ_cnt++;
- }
-
+ if (write_al_updates)
+ al_write_transaction(mdev, delegate);
spin_lock_irq(&mdev->al_lock);
/* FIXME
if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
}
}
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
+{
+ BUG_ON(delegate && current == mdev->tconn->worker.task);
+
+ if (drbd_al_begin_io_prepare(mdev, i))
+ drbd_al_begin_io_commit(mdev, delegate);
+}
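
[Editor's note] The @delegate convention at the call sites touched by this patch: the receiver path (receive_Data(), which may sit below generic_make_request()) passes true, while the worker's own w_restart_disk_io() passes false:

	/* receiver context, may be below generic_make_request(): delegate */
	drbd_al_begin_io(mdev, &peer_req->i, true);

	/* worker context (w_restart_disk_io): write the transaction directly */
	drbd_al_begin_io(mdev, &req->i, false);
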
+
+int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+ struct lru_cache *al = mdev->act_log;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned nr_al_extents;
+ unsigned available_update_slots;
+ unsigned enr;
+
+ D_ASSERT(first <= last);
+
+ nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
+ available_update_slots = min(al->nr_elements - al->used,
+ al->max_pending_changes - al->pending_changes);
+
+ /* We want all necessary updates for a given request within the same transaction
+ * We could first check how many updates are *actually* needed,
+ * and use that instead of the worst-case nr_al_extents */
+ if (available_update_slots < nr_al_extents)
+ return -EWOULDBLOCK;
+
+ /* Is resync active in this area? */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *tmp;
+ tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+ return -EBUSY;
+ return -EWOULDBLOCK;
+ }
+ }
+ }
+
+ /* Checkout the refcounts.
+ * Given that we checked for available elements and update slots above,
+ * this has to be successful. */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ al_ext = lc_get_cumulative(mdev->act_log, enr);
+ if (!al_ext)
+ dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
+ }
+ return 0;
+}
+
void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
/* for bios crossing activity log extent boundaries,
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
+{
+ const unsigned int stripes = mdev->ldev->md.al_stripes;
+ const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
+
+ /* transaction number, modulo on-disk ring buffer wrap around */
+ unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
+
+ /* ... to aligned 4k on disk block */
+ t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+ /* ... to 512 byte sector in activity log */
+ t *= 8;
+
+ /* ... plus offset to the on disk position */
+ return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
+}
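
[Editor's note] A standalone sketch of the stripe mapping above, with hypothetical geometry (al_stripes = 2, al_stripe_size_4k = 8, hence al_size_4k = 16); consecutive transactions alternate between the two stripes:

#include <stdio.h>

int main(void)
{
	const unsigned stripes = 2, stripe_size_4k = 8;
	const unsigned al_size_4k = stripes * stripe_size_4k;
	unsigned tr;

	for (tr = 0; tr < 4; tr++) {
		unsigned t = tr % al_size_4k;	/* ring buffer wrap */
		t = ((t % stripes) * stripe_size_4k) + t / stripes;
		/* tr=0 -> block 0, tr=1 -> block 8, tr=2 -> block 1, tr=3 -> block 9 */
		printf("transaction %u -> 4k block %u -> sector offset %u\n",
		       tr, t, t * 8);
	}
	return 0;
}
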
+
static int
_al_write_transaction(struct drbd_conf *mdev)
{
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
mdev->al_tr_cycle = 0;
- sector = mdev->ldev->md.md_offset
- + mdev->ldev->md.al_offset
- + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
+ sector = al_tr_number_to_on_disk_sector(mdev);
crc = crc32c(0, buffer, 4096);
buffer->crc32c = cpu_to_be32(crc);
if (drbd_bm_write_hinted(mdev))
err = -EIO;
- /* drbd_chk_io_error done already */
- else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
- err = -EIO;
- drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
- } else {
- /* advance ringbuffer position and transaction counter */
- mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
- mdev->al_tr_number++;
+ else {
+ bool write_al_updates;
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+ if (write_al_updates) {
+ if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ err = -EIO;
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+ } else {
+ mdev->al_tr_number++;
+ mdev->al_writ_cnt++;
+ }
+ }
}
drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
/* Calls from worker context (see w_restart_disk_io()) need to write the
transaction directly. Others came through generic_make_request(),
those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_conf *mdev)
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
{
- struct update_al_work al_work;
-
- if (current == mdev->tconn->worker.task)
+ if (delegate) {
+ struct update_al_work al_work;
+ init_completion(&al_work.event);
+ al_work.w.cb = w_al_write_transaction;
+ al_work.w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+ wait_for_completion(&al_work.event);
+ return al_work.err;
+ } else
return _al_write_transaction(mdev);
-
- init_completion(&al_work.event);
- al_work.w.cb = w_al_write_transaction;
- al_work.w.mdev = mdev;
- drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
- wait_for_completion(&al_work.event);
-
- return al_work.err;
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8dc29502dc08..64fbb8385cdc 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
}
}
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+ u64 bitmap_sectors;
+ if (ldev->md.al_offset == 8)
+ bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+ else
+ bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+ return bitmap_sectors << (9 + 3);
+}
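
[Editor's note] A worked instance of the shift above: one 512-byte sector holds 512 * 8 bits, so shifting by (9 + 3) converts bitmap sectors to bits; with one bit per 4 KiB block (BM_BLOCK_SIZE), 64 bitmap sectors cover 1 GiB of storage:

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	u64 bitmap_sectors = 64;
	u64 bits = bitmap_sectors << (9 + 3);	/* 64 * 512 * 8 = 262144 bits */

	/* at one bit per 4 KiB block: 262144 * 4 KiB = 1 GiB covered */
	printf("%llu bits, covering %llu MiB\n", bits, bits * 4096 >> 20);
	return 0;
}
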
+
/*
* make sure the bitmap has enough room for the attached storage,
* if necessary, resize.
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
words = ALIGN(bits, 64) >> LN2_BPL;
if (get_ldev(mdev)) {
- u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
+ u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
put_ldev(mdev);
if (bits > bits_on_disk) {
dev_info(DEV, "bits = %lu\n", bits);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 6b51afa1aae1..f943aacfdad8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -753,13 +753,16 @@ struct drbd_md {
u32 flags;
u32 md_size_sect;
- s32 al_offset; /* signed relative sector offset to al area */
+ s32 al_offset; /* signed relative sector offset to activity log */
s32 bm_offset; /* signed relative sector offset to bitmap */
- /* u32 al_nr_extents; important for restoring the AL
- * is stored into ldev->dc.al_extents, which in turn
- * gets applied to act_log->nr_elements
- */
+ /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+ s32 meta_dev_idx;
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+ u32 al_size_4k; /* cached product of the above */
};
struct drbd_backing_dev {
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */
} send;
};
+struct submit_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+};
+
struct drbd_conf {
struct drbd_tconn *tconn;
int vnr; /* volume number within the connection */
@@ -1009,7 +1020,6 @@ struct drbd_conf {
struct lru_cache *act_log; /* activity log */
unsigned int al_tr_number;
int al_tr_cycle;
- int al_tr_pos; /* position of the next transaction in the journal */
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
@@ -1032,6 +1042,10 @@ struct drbd_conf {
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
unsigned int peer_max_bio_size;
unsigned int local_max_bio_size;
+
+ /* any requests that would block in drbd_make_request()
+ * are deferred to this single-threaded work queue */
+ struct submit_worker submit;
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
-extern void drbd_go_diskless(struct drbd_conf *mdev);
extern void drbd_ldev_destroy(struct drbd_conf *mdev);
/* Meta data layout
- We reserve a 128MB Block (4k aligned)
- * either at the end of the backing device
- * or on a separate meta data device. */
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ * end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ * The activity log consists of 4k transaction blocks,
+ * which are written in a ring-buffer, or striped ring-buffer like fashion,
+ * which are writtensize used to be fixed 32kB,
+ * but is about to become configurable.
+ */
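
[Editor's note] Worked offsets for the external layout above, assuming the classic fixed-size activity log (al_size_sect = MD_32kB_SECT = 64, as set up in drbd_md_set_sector_offsets() later in this patch):

	/* al_offset = MD_4kB_SECT      =  8  (right after the 4k superblock)
	 * bm_offset = MD_4kB_SECT + 64 = 72  (bitmap after the AL)
	 * bitmap sectors = md_size_sect - 72
	 * old indexed variant: md_size_sect = MD_128MB_SECT = 262144,
	 * leaving 262144 - 72 = 262072 sectors of bitmap */
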
-/* The following numbers are sectors */
-/* Allows up to about 3.8TB, so if you want more,
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
* you need to use the "flexible" meta data format. */
-#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
-#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
-#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
-#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
-
-/* we do all meta data IO in 4k blocks */
-#define MD_BLOCK_SHIFT 12
-#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
+#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
+#define MD_4kB_SECT 8
+#define MD_32kB_SECT 64
/* One activity log extent represents 4M of storage */
#define AL_EXTENT_SHIFT 22
@@ -1256,7 +1289,6 @@ struct bm_extent {
/* in one sector of the bitmap, we have this many activity_log extents. */
#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
-#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
@@ -1276,16 +1308,18 @@ struct bm_extent {
*/
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
-#define DRBD_MAX_SECTORS_BM \
- ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
-#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
-#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
-#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+ ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
#else
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
/* 16 TB in units of sectors */
#if BITS_PER_LONG == 32
/* adjust by one page worth of bitmap,
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn);
extern int proc_details;
/* drbd_req */
+extern void do_submit(struct work_struct *ws);
extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
-extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate);
+extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate);
extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
* BTW, for internal meta data, this happens to be the maximum capacity
* we could agree upon with our peer node.
*/
-static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
{
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi
}
}
-static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
-{
- int meta_dev_idx;
-
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- return _drbd_md_first_sector(meta_dev_idx, bdev);
-}
-
/**
* drbd_md_last_sector() - Return the last sector number of the meta data area
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
- int meta_dev_idx;
-
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- return bdev->md.md_offset + MD_AL_OFFSET - 1;
+ return bdev->md.md_offset + MD_4kB_SECT -1;
case DRBD_MD_INDEX_FLEX_EXT:
default:
- return bdev->md.md_offset + bdev->md.md_size_sect;
+ return bdev->md.md_offset + bdev->md.md_size_sect -1;
}
}
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
- int meta_dev_idx;
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- _drbd_md_first_sector(meta_dev_idx, bdev))
+ drbd_md_first_sector(bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
}
/**
- * drbd_md_ss__() - Return the sector number of our meta data super block
- * @mdev: DRBD device.
+ * drbd_md_ss() - Return the sector number of our meta data super block
* @bdev: Meta data block device.
*/
-static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
{
- int meta_dev_idx;
+ const int meta_dev_idx = bdev->md.meta_dev_idx;
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- switch (meta_dev_idx) {
- default: /* external, some index */
- return MD_RESERVED_SECT * meta_dev_idx;
- case DRBD_MD_INDEX_INTERNAL:
- /* with drbd08, internal meta data is always "flexible" */
- case DRBD_MD_INDEX_FLEX_INT:
- /* sizeof(struct md_on_disk_07) == 4k
- * position: last 4k aligned block of 4k size */
- if (!bdev->backing_bdev) {
- if (__ratelimit(&drbd_ratelimit_state)) {
- dev_err(DEV, "bdev->backing_bdev==NULL\n");
- dump_stack();
- }
- return 0;
- }
- return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
- - MD_AL_OFFSET;
- case DRBD_MD_INDEX_FLEX_EXT:
+ if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
return 0;
- }
+
+ /* Since drbd08, internal meta data is always "flexible".
+ * position: last 4k aligned block of 4k size */
+ if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+ return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+ /* external, some index; this is the old fixed size layout */
+ return MD_128MB_SECT * bdev->md.meta_dev_idx;
}
static inline void
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
if (mdev->state.disk == D_DISKLESS)
/* even internal references gone, safe to destroy */
drbd_ldev_destroy(mdev);
- if (mdev->state.disk == D_FAILED)
+ if (mdev->state.disk == D_FAILED) {
/* all application IO references gone. */
- drbd_go_diskless(mdev);
+ if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
+ }
wake_up(&mdev->misc_wait);
}
}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index e98da675f0c1..a150b59897a0 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -45,7 +45,7 @@
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
-
+#include <linux/workqueue.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
@@ -2300,6 +2300,7 @@ static void drbd_cleanup(void)
idr_for_each_entry(&minors, mdev, i) {
idr_remove(&minors, mdev_to_minor(mdev));
idr_remove(&mdev->tconn->volumes, mdev->vnr);
+ destroy_workqueue(mdev->submit.wq);
del_gendisk(mdev->vdisk);
/* synchronize_rcu(); No other threads running at this point */
kref_put(&mdev->kref, &drbd_minor_destroy);
@@ -2589,6 +2590,21 @@ void conn_destroy(struct kref *kref)
kfree(tconn);
}
+int init_submitter(struct drbd_conf *mdev)
+{
+ /* opencoded create_singlethread_workqueue(),
+ * to be able to say "drbd%d", ..., minor */
+ mdev->submit.wq = alloc_workqueue("drbd%u_submit",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor);
+ if (!mdev->submit.wq)
+ return -ENOMEM;
+
+ INIT_WORK(&mdev->submit.worker, do_submit);
+ spin_lock_init(&mdev->submit.lock);
+ INIT_LIST_HEAD(&mdev->submit.writes);
+ return 0;
+}
+
enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
{
struct drbd_conf *mdev;
@@ -2678,6 +2694,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
goto out_idr_remove_minor;
}
+ if (init_submitter(mdev)) {
+ err = ERR_NOMEM;
+ drbd_msg_put_info("unable to create submit workqueue");
+ goto out_idr_remove_vol;
+ }
+
add_disk(disk);
kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */
@@ -2688,6 +2710,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
return NO_ERROR;
+out_idr_remove_vol:
+ idr_remove(&tconn->volumes, vnr_got);
out_idr_remove_minor:
idr_remove(&minors, minor_got);
synchronize_rcu();
@@ -2834,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn)
rcu_read_unlock();
}
+/* aligned 4kByte */
struct meta_data_on_disk {
- u64 la_size; /* last agreed size. */
+ u64 la_size_sect; /* last agreed size. */
u64 uuid[UI_SIZE]; /* UUIDs. */
u64 device_uuid;
u64 reserved_u64_1;
@@ -2843,13 +2868,17 @@ struct meta_data_on_disk {
u32 magic;
u32 md_size_sect;
u32 al_offset; /* offset to this block */
- u32 al_nr_extents; /* important for restoring the AL */
+ u32 al_nr_extents; /* important for restoring the AL (userspace) */
/* `-- act_log->nr_elements <-- ldev->dc.al_extents */
u32 bm_offset; /* offset to the bitmap, from here */
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
u32 la_peer_max_bio_size; /* last peer max_bio_size */
- u32 reserved_u32[3];
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+
+ u8 reserved_u8[4096 - (7*8 + 10*4)];
} __packed;
/**
@@ -2862,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
sector_t sector;
int i;
+ /* Don't accidentally change the DRBD meta data layout. */
+ BUILD_BUG_ON(UI_SIZE != 4);
+ BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
del_timer(&mdev->md_sync_timer);
/* timer may be rearmed by drbd_md_mark_dirty() now. */
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
@@ -2876,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
if (!buffer)
goto out;
- memset(buffer, 0, 512);
+ memset(buffer, 0, sizeof(*buffer));
- buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+ buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
for (i = UI_CURRENT; i < UI_SIZE; i++)
buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
@@ -2893,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
- D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+ buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes);
+ buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k);
+
+ D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
@@ -2911,13 +2947,141 @@ out:
put_ldev(mdev);
}
+static int check_activity_log_stripe_size(struct drbd_conf *mdev,
+ struct meta_data_on_disk *on_disk,
+ struct drbd_md *in_core)
+{
+ u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+ u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+ u64 al_size_4k;
+
+ /* both not set: default to old fixed size activity log */
+ if (al_stripes == 0 && al_stripe_size_4k == 0) {
+ al_stripes = 1;
+ al_stripe_size_4k = MD_32kB_SECT/8;
+ }
+
+ /* some paranoia plausibility checks */
+
+ /* we need both values to be set */
+ if (al_stripes == 0 || al_stripe_size_4k == 0)
+ goto err;
+
+ al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+ /* Upper limit of activity log area, to avoid potential overflow
+ * problems in al_tr_number_to_on_disk_sector(). Right now, more
+ * than 72 * 4k blocks total only increase the amount of history,
+ * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
+ if (al_size_4k > (16 * 1024 * 1024/4))
+ goto err;
+
+ /* Lower limit: we need at least 8 transaction slots (32kB)
+ * to not break existing setups */
+ if (al_size_4k < MD_32kB_SECT/8)
+ goto err;
+
+ in_core->al_stripe_size_4k = al_stripe_size_4k;
+ in_core->al_stripes = al_stripes;
+ in_core->al_size_4k = al_size_4k;
+
+ return 0;
+err:
+ dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+ al_stripes, al_stripe_size_4k);
+ return -EINVAL;
+}
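
[Editor's note] The bounds above, worked through (a sketch; MD_32kB_SECT == 64 per the defines in drbd_int.h):

	/* defaults (both fields zero):
	 *	al_stripes = 1, al_stripe_size_4k = 64/8 = 8,
	 *	al_size_4k = 8 blocks = 32 kB, the legacy activity log
	 * lower limit:	al_size_4k >= 8   (at least 8 transaction slots)
	 * upper limit:	al_size_4k <= 16*1024*1024/4 = 4194304 blocks,
	 *		and 4194304 * 4 kB = 16 GiB of activity log */
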
+
+static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+ sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+ struct drbd_md *in_core = &bdev->md;
+ s32 on_disk_al_sect;
+ s32 on_disk_bm_sect;
+
+ /* The on-disk size of the activity log, calculated from offsets, and
+ * the size of the activity log calculated from the stripe settings,
+ * should match.
+ * Though we could relax this a bit: it is ok, if the striped activity log
+ * fits in the available on-disk activity log size.
+ * Right now, that would break how resize is implemented.
+ * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+ * of possible unused padding space in the on disk layout. */
+ if (in_core->al_offset < 0) {
+ if (in_core->bm_offset > in_core->al_offset)
+ goto err;
+ on_disk_al_sect = -in_core->al_offset;
+ on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+ } else {
+ if (in_core->al_offset != MD_4kB_SECT)
+ goto err;
+ if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+ on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+ }
+
+ /* old fixed size meta data is exactly that: fixed. */
+ if (in_core->meta_dev_idx >= 0) {
+ if (in_core->md_size_sect != MD_128MB_SECT
+ || in_core->al_offset != MD_4kB_SECT
+ || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+ || in_core->al_stripes != 1
+ || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+ goto err;
+ }
+
+ if (capacity < in_core->md_size_sect)
+ goto err;
+ if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+ goto err;
+
+ /* should be aligned, and at least 32k */
+ if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+ goto err;
+
+ /* should fit (for now: exactly) into the available on-disk space;
+ * overflow prevention is in check_activity_log_stripe_size() above. */
+ if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ /* again, should be aligned */
+ if (in_core->bm_offset & 7)
+ goto err;
+
+ /* FIXME check for device grow with flex external meta data? */
+
+ /* can the available bitmap space cover the last agreed device size? */
+ if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+ goto err;
+
+ return 0;
+
+err:
+ dev_err(DEV, "meta data offsets don't make sense: idx=%d "
+ "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+ "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+ in_core->meta_dev_idx,
+ in_core->al_stripes, in_core->al_stripe_size_4k,
+ in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+ (unsigned long long)in_core->la_size_sect,
+ (unsigned long long)capacity);
+
+ return -EINVAL;
+}
+
+
/**
* drbd_md_read() - Reads in the meta data super block
* @mdev: DRBD device.
* @bdev: Device from which the meta data should be read in.
*
- * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
* something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @mdev->ldev.
*/
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
@@ -2925,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
u32 magic, flags;
int i, rv = NO_ERROR;
- if (!get_ldev_if_state(mdev, D_ATTACHING))
- return ERR_IO_MD_DISK;
+ if (mdev->state.disk != D_DISKLESS)
+ return ERR_DISK_CONFIGURED;
buffer = drbd_md_get_buffer(mdev);
if (!buffer)
- goto out;
+ return ERR_NOMEM;
+
+ /* First, figure out where our meta data superblock is located,
+ * and read it. */
+ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
/* NOTE: can't do normal error processing here as this is
@@ -2949,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
rv = ERR_MD_UNCLEAN;
goto err;
}
+
+ rv = ERR_MD_INVALID;
if (magic != DRBD_MD_MAGIC_08) {
if (magic == DRBD_MD_MAGIC_07)
dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
else
dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
- rv = ERR_MD_INVALID;
goto err;
}
- if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
- dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
- be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
- rv = ERR_MD_INVALID;
+
+ if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+ dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+ be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
goto err;
}
+
+
+ /* convert to in_core endian */
+ bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+ bdev->md.flags = be32_to_cpu(buffer->flags);
+ bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+ bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+ bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+ bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+ if (check_activity_log_stripe_size(mdev, buffer, &bdev->md))
+ goto err;
+ if (check_offsets_and_sizes(mdev, bdev))
+ goto err;
+
if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
- rv = ERR_MD_INVALID;
goto err;
}
if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
- rv = ERR_MD_INVALID;
goto err;
}
- if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
- dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
- be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
- rv = ERR_MD_INVALID;
- goto err;
- }
-
- bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
- for (i = UI_CURRENT; i < UI_SIZE; i++)
- bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
- bdev->md.flags = be32_to_cpu(buffer->flags);
- bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+ rv = NO_ERROR;
spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED) {
@@ -3000,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
err:
drbd_md_put_buffer(mdev);
- out:
- put_ldev(mdev);
return rv;
}
@@ -3252,13 +3425,6 @@ static int w_go_diskless(struct drbd_work *w, int unused)
return 0;
}
-void drbd_go_diskless(struct drbd_conf *mdev)
-{
- D_ASSERT(mdev->state.disk == D_FAILED);
- if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
- drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
-}
-
/**
* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
* @mdev: DRBD device.
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2af26fc95280..42fda4ae2f87 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -696,37 +696,52 @@ out:
return 0;
}
-/* initializes the md.*_offset members, so we are able to find
- * the on disk meta data */
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * Activity log size used to be fixed 32kB,
+ * but is about to become configurable.
+ */
static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
sector_t md_size_sect = 0;
- int meta_dev_idx;
+ unsigned int al_size_sect = bdev->md.al_size_4k * 8;
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
default:
/* v07 style fixed size indexed meta data */
- bdev->md.md_size_sect = MD_RESERVED_SECT;
- bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
- bdev->md.al_offset = MD_AL_OFFSET;
- bdev->md.bm_offset = MD_BM_OFFSET;
+ bdev->md.md_size_sect = MD_128MB_SECT;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
break;
case DRBD_MD_INDEX_FLEX_EXT:
/* just occupy the full device; unit: sectors */
bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
- bdev->md.md_offset = 0;
- bdev->md.al_offset = MD_AL_OFFSET;
- bdev->md.bm_offset = MD_BM_OFFSET;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
break;
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */
- bdev->md.al_offset = -MD_AL_SECTORS;
+ bdev->md.al_offset = -al_size_sect;
/* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
/* plus the "drbd meta data super block",
* and the activity log; */
- md_size_sect += MD_BM_OFFSET;
+ md_size_sect += MD_4kB_SECT + al_size_sect;
bdev->md.md_size_sect = md_size_sect;
/* bitmap offset is adjusted by 'super' block size */
- bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
+ bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
break;
}
- rcu_read_unlock();
}
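
[Editor's note] A rough worked sizing for the internal layout, as an illustration only (assuming a 1 TiB backing device, one bitmap bit per 4 KiB block, and the default 32 kB activity log; alignment padding ignored):

	/* bitmap bits    = 1 TiB / 4 KiB	= 2^28
	 * bitmap sectors = 2^28 / 8 / 512	= 65536
	 * md_size_sect  ~= 65536 + 64 (AL) + 8 (superblock)
	 * al_offset      = -64			(AL just before the superblock)
	 * bm_offset      = -md_size_sect + 8	(bitmap before the AL)
	 * the 4k superblock sits in the last 4k-aligned 4k block of the
	 * device, see drbd_md_ss() */
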
/* input size is expected to be in KB */
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
- sector_t la_size, u_size;
+ sector_t la_size_sect, u_size;
sector_t size;
char ppb[10];
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
prev_first_sect = drbd_md_first_sector(mdev->ldev);
prev_size = mdev->ldev->md.md_size_sect;
- la_size = mdev->ldev->md.la_size_sect;
+ la_size_sect = mdev->ldev->md.la_size_sect;
/* TODO: should only be some assert here, not (re)init... */
drbd_md_set_sector_offsets(mdev, mdev->ldev);
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
if (rv == dev_size_error)
goto out;
- la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+ la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
|| prev_size != mdev->ldev->md.md_size_sect;
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
drbd_md_mark_dirty(mdev);
}
- if (size > la_size)
+ if (size > la_size_sect)
rv = grew;
- if (size < la_size)
+ if (size < la_size_sect)
rv = shrunk;
out:
lc_unlock(mdev->act_log);
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
sector_t u_size, int assume_peer_has_space)
{
sector_t p_size = mdev->p_size; /* partner's disk size. */
- sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+ sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
sector_t m_size; /* my size */
sector_t size = 0;
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
if (p_size && m_size) {
size = min_t(sector_t, p_size, m_size);
} else {
- if (la_size) {
- size = la_size;
+ if (la_size_sect) {
+ size = la_size_sect;
if (m_size && m_size < size)
size = m_size;
if (p_size && p_size < size)
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info)
return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
}
-static void enforce_disk_conf_limits(struct disk_conf *dc)
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
{
- if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
- dc->al_extents = DRBD_AL_EXTENTS_MIN;
- if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
- dc->al_extents = DRBD_AL_EXTENTS_MAX;
+ /* This is limited by 16 bit "slot" numbers,
+ * and by available on-disk context storage.
+ *
+ * Also (u16)~0 is special (denotes a "free" extent).
+ *
+ * One transaction occupies one 4kB on-disk block,
+ * we have n such blocks in the on disk ring buffer,
+ * the "current" transaction may fail (n-1),
+ * and each transaction stores context information for 919 slot numbers.
+ *
+ * 72 transaction blocks amount to more than 2**16 context slots,
+ * so cap there first.
+ */
+ const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+ const unsigned int sufficient_on_disk =
+ (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+ /AL_CONTEXT_PER_TRANSACTION;
+
+ unsigned int al_size_4k = bdev->md.al_size_4k;
- if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
- dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+ if (al_size_4k > sufficient_on_disk)
+ return max_al_nr;
+
+ return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}
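
[Editor's note] The arithmetic behind the cap, assuming DRBD_AL_EXTENTS_MAX is 65534 ((u16)~0 == 65535 being the reserved "free" marker):

	/* sufficient_on_disk = (65534 + 919 - 1) / 919 = 72 transaction blocks
	 * 72 * 919 = 66168 > 2^16 context slots, hence "cap there first"
	 * legacy 32 kB AL (al_size_4k = 8): (8 - 1) * 919 = 6433 extents max */
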
int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
if (!expect(new_disk_conf->resync_rate >= 1))
new_disk_conf->resync_rate = 1;
- enforce_disk_conf_limits(new_disk_conf);
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
+ new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
if (fifo_size != mdev->rs_plan_s->size) {
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
- enforce_disk_conf_limits(new_disk_conf);
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
if (!new_plan) {
@@ -1399,8 +1437,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
- /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
- drbd_md_set_sector_offsets(mdev, nbc);
+ /* Read our meta data super block early.
+ * This also sets other on-disk offsets. */
+ retcode = drbd_md_read(mdev, nbc);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+ new_disk_conf->al_extents = drbd_al_extents_max(nbc);
if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
@@ -1416,7 +1462,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
- min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
+ min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1467,8 +1513,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
if (!get_ldev_if_state(mdev, D_ATTACHING))
goto force_diskless;
- drbd_md_set_sector_offsets(mdev, nbc);
-
if (!mdev->bitmap) {
if (drbd_bm_init(mdev)) {
retcode = ERR_NOMEM;
@@ -1476,10 +1520,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
}
}
- retcode = drbd_md_read(mdev, nbc);
- if (retcode != NO_ERROR)
- goto force_diskless_dec;
-
if (mdev->state.conn < C_CONNECTED &&
mdev->state.role == R_PRIMARY &&
(mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
@@ -3162,6 +3202,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
CS_VERBOSE + CS_WAIT_COMPLETE);
idr_remove(&mdev->tconn->volumes, mdev->vnr);
idr_remove(&minors, mdev_to_minor(mdev));
+ destroy_workqueue(mdev->submit.wq);
del_gendisk(mdev->vdisk);
synchronize_rcu();
kref_put(&mdev->kref, &drbd_minor_destroy);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index a9eccfc6079b..1921871ca9a8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2265,7 +2265,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
- drbd_al_begin_io(mdev, &peer_req->i);
+ drbd_al_begin_io(mdev, &peer_req->i, true);
}
err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -3992,7 +3992,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
clear_bit(DISCARD_MY_DATA, &mdev->flags);
- drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+ drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */
return 0;
}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2b8303ad63c9..9f7ff1cb46ff 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -34,14 +34,14 @@
static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
/* Update disk stats at start of I/O request */
-static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
{
- const int rw = bio_data_dir(bio);
+ const int rw = bio_data_dir(req->master_bio);
int cpu;
cpu = part_stat_lock();
part_round_stats(cpu, &mdev->vdisk->part0);
part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
- part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+ part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
(void) cpu; /* The macro invocations above want the cpu argument, I do not like
the compiler warning about cpu only assigned but never used... */
part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
bio_endio(bio, -EIO);
}
-void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
{
- const int rw = bio_rw(bio);
- struct bio_and_error m = { NULL, };
+ spin_lock(&mdev->submit.lock);
+ list_add_tail(&req->tl_requests, &mdev->submit.writes);
+ spin_unlock(&mdev->submit.lock);
+ queue_work(mdev->submit.wq, &mdev->submit.worker);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+struct drbd_request *
+drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+ const int rw = bio_data_dir(bio);
struct drbd_request *req;
- bool no_remote = false;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
* if user cannot handle io errors, that's not our business. */
dev_err(DEV, "could not kmalloc() req\n");
bio_endio(bio, -ENOMEM);
- return;
+ return ERR_PTR(-ENOMEM);
}
req->start_time = start_time;
@@ -1044,19 +1056,27 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
req->private_bio = NULL;
}
- /* For WRITES going to the local disk, grab a reference on the target
- * extent. This waits for any resync activity in the corresponding
- * resync extent to finish, and, if necessary, pulls in the target
- * extent into the activity log, which involves further disk io because
- * of transactional on-disk meta data updates.
- * Empty flushes don't need to go into the activity log, they can only
- * flush data for pending writes which are already in there. */
+ /* Update disk stats */
+ _drbd_start_io_acct(mdev, req);
+
if (rw == WRITE && req->private_bio && req->i.size
&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
+ drbd_queue_write(mdev, req);
+ return NULL;
+ }
req->rq_state |= RQ_IN_ACT_LOG;
- drbd_al_begin_io(mdev, &req->i);
}
+ return req;
+}
+
+static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
+{
+ const int rw = bio_rw(req->master_bio);
+ struct bio_and_error m = { NULL, };
+ bool no_remote = false;
+
spin_lock_irq(&mdev->tconn->req_lock);
if (rw == WRITE) {
/* This may temporarily give up the req_lock,
@@ -1078,9 +1098,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
goto out;
}
- /* Update disk stats */
- _drbd_start_io_acct(mdev, req, bio);
-
/* We fail READ/READA early, if we can not serve it.
* We must do this before req is registered on any lists.
* Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1154,116 @@ out:
if (m.bio)
complete_master_bio(mdev, &m);
- return;
+}
+
+void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+ struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
+ if (IS_ERR_OR_NULL(req))
+ return;
+ drbd_send_and_submit(mdev, req);
+}
+
+static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
+{
+ struct drbd_request *req, *tmp;
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ const int rw = bio_data_dir(req->master_bio);
+
+ if (rw == WRITE /* rw != WRITE should not even end up here! */
+ && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ if (!drbd_al_begin_io_fastpath(mdev, &req->i))
+ continue;
+
+ req->rq_state |= RQ_IN_ACT_LOG;
+ }
+
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(mdev, req);
+ }
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
+ struct list_head *incoming,
+ struct list_head *pending)
+{
+ struct drbd_request *req, *tmp;
+ int wake = 0;
+ int err;
+
+ spin_lock_irq(&mdev->al_lock);
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ err = drbd_al_begin_io_nonblock(mdev, &req->i);
+ if (err == -EBUSY)
+ wake = 1;
+ if (err)
+ continue;
+ req->rq_state |= RQ_IN_ACT_LOG;
+ list_move_tail(&req->tl_requests, pending);
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ if (wake)
+ wake_up(&mdev->al_wait);
+
+ return !list_empty(pending);
+}
+
+void do_submit(struct work_struct *ws)
+{
+ struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
+ LIST_HEAD(incoming);
+ LIST_HEAD(pending);
+ struct drbd_request *req, *tmp;
+
+ for (;;) {
+ spin_lock(&mdev->submit.lock);
+ list_splice_tail_init(&mdev->submit.writes, &incoming);
+ spin_unlock(&mdev->submit.lock);
+
+ submit_fast_path(mdev, &incoming);
+ if (list_empty(&incoming))
+ break;
+
+ wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
+ /* Maybe more was queued, while we prepared the transaction?
+ * Try to stuff them into this transaction as well.
+ * Be strictly non-blocking here, no wait_event, we already
+ * have something to commit.
+ * Stop if we don't make any more progress.
+ */
+ for (;;) {
+ LIST_HEAD(more_pending);
+ LIST_HEAD(more_incoming);
+ bool made_progress;
+
+ /* It is ok to look outside the lock,
+ * it's only an optimization anyway */
+ if (list_empty(&mdev->submit.writes))
+ break;
+
+ spin_lock(&mdev->submit.lock);
+ list_splice_tail_init(&mdev->submit.writes, &more_incoming);
+ spin_unlock(&mdev->submit.lock);
+
+ if (list_empty(&more_incoming))
+ break;
+
+ made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);
+
+ list_splice_tail_init(&more_pending, &pending);
+ list_splice_tail_init(&more_incoming, &incoming);
+
+ if (!made_progress)
+ break;
+ }
+ drbd_al_begin_io_commit(mdev, false);
+
+ list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(mdev, req);
+ }
+ }
}
void drbd_make_request(struct request_queue *q, struct bio *bio)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 424dc7bdf9b7..f41e224caa7c 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error)
md_io->done = 1;
wake_up(&mdev->misc_wait);
bio_put(bio);
- put_ldev(mdev);
+ if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+ put_ldev(mdev);
}
/* reads on behalf of the partner,
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
struct drbd_conf *mdev = w->mdev;
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_begin_io(mdev, &req->i);
+ drbd_al_begin_io(mdev, &req->i, false);
drbd_req_make_private_bio(req, req->master_bio);
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;