summaryrefslogtreecommitdiff
path: root/drivers/md/bcache/journal.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-09 10:45:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-09 10:45:06 -0700
commit3b99107f0e0298e6fe0787f75b8f3d8306dfb230 (patch)
tree30536dbc9ca176470a2ae2938f952381e33f5deb /drivers/md/bcache/journal.c
parent0415052db4f92b7e272fc15802ad8b8be672deea (diff)
parentc9b3007feca018d3f7061f5d5a14cb00766ffe9b (diff)
Merge tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: "This is the main block updates for 5.3. Nothing earth shattering or major in here, just fixes, additions, and improvements all over the map. This contains: - Series of documentation fixes (Bart) - Optimization of the blk-mq ctx get/put (Bart) - null_blk removal race condition fix (Bob) - req/bio_op() cleanups (Chaitanya) - Series cleaning up the segment accounting, and request/bio mapping (Christoph) - Series cleaning up the page getting/putting for bios (Christoph) - block cgroup cleanups and moving it to where it is used (Christoph) - block cgroup fixes (Tejun) - Series of fixes and improvements to bcache, most notably a write deadlock fix (Coly) - blk-iolatency STS_AGAIN and accounting fixes (Dennis) - Series of improvements and fixes to BFQ (Douglas, Paolo) - debugfs_create() return value check removal for drbd (Greg) - Use struct_size(), where appropriate (Gustavo) - Two lightnvm fixes (Heiner, Geert) - MD fixes, including a read balance and corruption fix (Guoqing, Marcos, Xiao, Yufen) - block opal shadow mbr additions (Jonas, Revanth) - sbitmap compare-and-exchange improvements (Pavel) - Fix for potential bio->bi_size overflow (Ming) - NVMe pull requests: - improved PCIe suspend support (Keith Busch) - error injection support for the admin queue (Akinobu Mita) - Fibre Channel discovery improvements (James Smart) - tracing improvements including nvmetc tracing support (Minwoo Im) - misc fixes and cleanups (Anton Eidelman, Minwoo Im, Chaitanya Kulkarni) - Various little fixes and improvements to drivers and core" * tag 'for-5.3/block-20190708' of git://git.kernel.dk/linux-block: (153 commits) blk-iolatency: fix STS_AGAIN handling block: nr_phys_segments needs to be zero for REQ_OP_WRITE_ZEROES blk-mq: simplify blk_mq_make_request() blk-mq: remove blk_mq_put_ctx() sbitmap: Replace cmpxchg with xchg block: fix .bi_size overflow block: sed-opal: check size of shadow mbr block: sed-opal: ioctl for writing to shadow mbr block: 
sed-opal: add ioctl for done-mark of shadow mbr block: never take page references for ITER_BVEC direct-io: use bio_release_pages in dio_bio_complete block_dev: use bio_release_pages in bio_unmap_user block_dev: use bio_release_pages in blkdev_bio_end_io iomap: use bio_release_pages in iomap_dio_bio_end_io block: use bio_release_pages in bio_map_user_iov block: use bio_release_pages in bio_unmap_user block: optionally mark pages dirty in bio_release_pages block: move the BIO_NO_PAGE_REF check into bio_release_pages block: skd_main.c: Remove call to memset after dma_alloc_coherent block: mtip32xx: Remove call to memset after dma_alloc_coherent ...
Diffstat (limited to 'drivers/md/bcache/journal.c')
-rw-r--r--drivers/md/bcache/journal.c141
1 files changed, 100 insertions, 41 deletions
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 12dae9348147..be2a2a201603 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset;
blocks = set_blocks(j, block_bytes(ca->set));
+ /*
+ * Nodes in 'list' are in linear increasing order of
+ * i->j.seq, the node on head has the smallest (oldest)
+ * journal seq, the node on tail has the biggest
+ * (latest) journal seq.
+ */
+
+ /*
+ * Check from the oldest jset for last_seq. If
+ * i->j.seq < j->last_seq, it means the oldest jset
+ * in list is expired and useless, remove it from
+ * this list. Otherwise, j is a candidate jset for
+ * further following checks.
+ */
while (!list_empty(list)) {
i = list_first_entry(list,
struct journal_replay, list);
@@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset;
kfree(i);
}
+ /* iterate list in reverse order (from latest jset) */
list_for_each_entry_reverse(i, list, list) {
if (j->seq == i->j.seq)
goto next_set;
+ /*
+ * if j->seq is less than any i->j.last_seq
+ * in list, j is an expired and useless jset.
+ */
if (j->seq < i->j.last_seq)
goto next_set;
+ /*
+ * 'where' points to first jset in list which
+ * is older than j.
+ */
if (j->seq > i->j.seq) {
where = &i->list;
goto add;
@@ -129,10 +152,12 @@ add:
if (!i)
return -ENOMEM;
memcpy(&i->j, j, bytes);
+ /* Add right after the node that 'where' points to */
list_add(&i->list, where);
ret = 1;
- ja->seq[bucket_index] = j->seq;
+ if (j->seq > ja->seq[bucket_index])
+ ja->seq[bucket_index] = j->seq;
next_set:
offset += blocks * ca->sb.block_size;
len -= blocks * ca->sb.block_size;
@@ -268,7 +293,7 @@ bsearch:
struct journal_replay,
list)->j.seq;
- return ret;
+ return 0;
#undef read_bucket
}
@@ -391,60 +416,90 @@ err:
}
/* Journalling */
-#define journal_max_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
-#define journal_min_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
static void btree_flush_write(struct cache_set *c)
{
- /*
- * Try to find the btree node with that references the oldest journal
- * entry, best is our current candidate and is locked if non NULL:
- */
- struct btree *b;
- int i;
+ struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
+ unsigned int i, n;
+
+ if (c->journal.btree_flushing)
+ return;
+
+ spin_lock(&c->journal.flush_write_lock);
+ if (c->journal.btree_flushing) {
+ spin_unlock(&c->journal.flush_write_lock);
+ return;
+ }
+ c->journal.btree_flushing = true;
+ spin_unlock(&c->journal.flush_write_lock);
atomic_long_inc(&c->flush_write);
+ memset(btree_nodes, 0, sizeof(btree_nodes));
+ n = 0;
-retry:
- spin_lock(&c->journal.lock);
- if (heap_empty(&c->flush_btree)) {
- for_each_cached_btree(b, c, i)
- if (btree_current_write(b)->journal) {
- if (!heap_full(&c->flush_btree))
- heap_add(&c->flush_btree, b,
- journal_max_cmp);
- else if (journal_max_cmp(b,
- heap_peek(&c->flush_btree))) {
- c->flush_btree.data[0] = b;
- heap_sift(&c->flush_btree, 0,
- journal_max_cmp);
- }
- }
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ if (btree_node_journal_flush(b))
+ pr_err("BUG: flush_write bit should not be set here!");
+
+ mutex_lock(&b->write_lock);
- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
- heap_sift(&c->flush_btree, i, journal_min_cmp);
+ if (!btree_node_dirty(b)) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
+ if (!btree_current_write(b)->journal) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
+ set_btree_node_journal_flush(b);
+
+ mutex_unlock(&b->write_lock);
+
+ btree_nodes[n++] = b;
+ if (n == BTREE_FLUSH_NR)
+ break;
}
+ mutex_unlock(&c->bucket_lock);
- b = NULL;
- heap_pop(&c->flush_btree, b, journal_min_cmp);
- spin_unlock(&c->journal.lock);
+ for (i = 0; i < n; i++) {
+ b = btree_nodes[i];
+ if (!b) {
+ pr_err("BUG: btree_nodes[%d] is NULL", i);
+ continue;
+ }
+
+ /* safe to check without holding b->write_lock */
+ if (!btree_node_journal_flush(b)) {
+ pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
+ continue;
+ }
- if (b) {
mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p: written by others", b);
+ continue;
+ }
+
+ if (!btree_node_dirty(b)) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
- /* We raced */
- atomic_long_inc(&c->retry_flush_write);
- goto retry;
+ pr_debug("bnode %p: dirty bit cleaned by others", b);
+ continue;
}
__bch_btree_node_write(b, NULL);
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
}
+
+ spin_lock(&c->journal.flush_write_lock);
+ c->journal.btree_flushing = false;
+ spin_unlock(&c->journal.flush_write_lock);
}
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -559,6 +614,7 @@ static void journal_reclaim(struct cache_set *c)
k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
+ atomic_long_inc(&c->reclaimed_journal_buckets);
}
if (n) {
@@ -811,6 +867,10 @@ atomic_t *bch_journal(struct cache_set *c,
struct journal_write *w;
atomic_t *ret;
+ /* No journaling if CACHE_SET_IO_DISABLE set already */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return NULL;
+
if (!CACHE_SYNC(&c->sb))
return NULL;
@@ -855,7 +915,6 @@ void bch_journal_free(struct cache_set *c)
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
- free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
@@ -863,6 +922,7 @@ int bch_journal_alloc(struct cache_set *c)
struct journal *j = &c->journal;
spin_lock_init(&j->lock);
+ spin_lock_init(&j->flush_write_lock);
INIT_DELAYED_WORK(&j->work, journal_write_work);
c->journal_delay_ms = 100;
@@ -870,8 +930,7 @@ int bch_journal_alloc(struct cache_set *c)
j->w[0].c = c;
j->w[1].c = c;
- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
return -ENOMEM;