Diffstat (limited to 'drivers/md/bcache/journal.c')
| -rw-r--r-- | drivers/md/bcache/journal.c | 577 |
1 file changed, 322 insertions(+), 255 deletions(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index b2fd412715b1..144693b7c46a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -47,20 +47,18 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, closure_init_stack(&cl); - pr_debug("reading %u", bucket_index); + pr_debug("reading %u\n", bucket_index); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); - bio_reset(bio); + bio_reset(bio, ca->bdev, REQ_OP_READ); bio->bi_iter.bi_sector = bucket + offset; - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, 0); bch_bio_map(bio, data); closure_bio_submit(ca->set, bio, &cl); @@ -78,13 +76,13 @@ reread: left = ca->sb.bucket_size - offset; size_t blocks, bytes = set_bytes(j); if (j->magic != jset_magic(&ca->sb)) { - pr_debug("%u: bad magic", bucket_index); + pr_debug("%u: bad magic\n", bucket_index); return ret; } if (bytes > left << 9 || bytes > PAGE_SIZE << JSET_BITS) { - pr_info("%u: too big, %zu bytes, offset %u", + pr_info("%u: too big, %zu bytes, offset %u\n", bucket_index, bytes, offset); return ret; } @@ -93,13 +91,27 @@ reread: left = ca->sb.bucket_size - offset; goto reread; if (j->csum != csum_set(j)) { - pr_info("%u: bad csum, %zu bytes, offset %u", + pr_info("%u: bad csum, %zu bytes, offset %u\n", bucket_index, bytes, offset); return ret; } - blocks = set_blocks(j, block_bytes(ca->set)); + blocks = set_blocks(j, block_bytes(ca)); + /* + * Nodes in 'list' are in linear increasing order of + * i->j.seq, the node on head has the smallest (oldest) + * journal seq, the node on tail has the biggest + * (latest) journal seq. + */ + + /* + * Check from the oldest jset for last_seq. If + * i->j.seq < j->last_seq, it means the oldest jset + * in list is expired and useless, remove it from + * this list. Otherwise, j is a candidate jset for + * further following checks. + */ while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); @@ -109,13 +121,22 @@ reread: left = ca->sb.bucket_size - offset; kfree(i); } + /* iterate list in reverse order (from latest jset) */ list_for_each_entry_reverse(i, list, list) { if (j->seq == i->j.seq) goto next_set; + /* + * if j->seq is less than any i->j.last_seq + * in list, j is an expired and useless jset. + */ if (j->seq < i->j.last_seq) goto next_set; + /* + * 'where' points to first jset in list which + * is elder then j. 
+ */ if (j->seq > i->j.seq) { where = &i->list; goto add; @@ -128,11 +149,14 @@ add: bytes, GFP_KERNEL); if (!i) return -ENOMEM; - memcpy(&i->j, j, bytes); + unsafe_memcpy(&i->j, j, bytes, + /* "bytes" was calculated by set_bytes() above */); + /* Add to the location after 'where' points to */ list_add(&i->list, where); ret = 1; - ja->seq[bucket_index] = j->seq; + if (j->seq > ja->seq[bucket_index]) + ja->seq[bucket_index] = j->seq; next_set: offset += blocks * ca->sb.block_size; len -= blocks * ca->sb.block_size; @@ -147,121 +171,115 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, b); \ + ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ ret; \ }) - struct cache *ca; - unsigned int iter; - - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); - unsigned int i, l, r, m; - uint64_t seq; + struct cache *ca = c->cache; + int ret = 0; + struct journal_device *ja = &ca->journal; + DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); + unsigned int i, l, r, m; + uint64_t seq; - bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); - pr_debug("%u journal buckets", ca->sb.njournal_buckets); + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries + * We must try the index l with ZERO first for + * correctness due to the scenario that the journal + * bucket is circular buffer which might have wrapped */ - for (i = 0; i < ca->sb.njournal_buckets; i++) { - /* - * We must try the index l with ZERO first for - * correctness due to the scenario that the journal - * bucket is circular buffer which might have wrapped - */ - l = (i * 2654435769U) % ca->sb.njournal_buckets; + l = (i * 2654435769U) % ca->sb.njournal_buckets; - if (test_bit(l, bitmap)) - break; + if (test_bit(l, bitmap)) + break; - if (read_bucket(l)) - goto bsearch; - } + if (read_bucket(l)) + goto bsearch; + } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search\n"); - for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); - l < ca->sb.njournal_buckets; - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, - l + 1)) - if (read_bucket(l)) - goto bsearch; + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) + if (read_bucket(l)) + goto bsearch; - /* no journal entries on this device? */ - if (l == ca->sb.njournal_buckets) - continue; + /* no journal entries on this device? 
*/ + if (l == ca->sb.njournal_buckets) + goto out; bsearch: - BUG_ON(list_empty(list)); + BUG_ON(list_empty(list)); - /* Binary search */ - m = l; - r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); + /* Binary search */ + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u\n", l, r); - while (l + 1 < r) { - seq = list_entry(list->prev, struct journal_replay, - list)->j.seq; + while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; - m = (l + r) >> 1; - read_bucket(m); + m = (l + r) >> 1; + read_bucket(m); - if (seq != list_entry(list->prev, struct journal_replay, - list)->j.seq) - l = m; - else - r = m; - } + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) + l = m; + else + r = m; + } - /* - * Read buckets in reverse order until we stop finding more - * journal entries - */ - pr_debug("finishing up: m %u njournal_buckets %u", - m, ca->sb.njournal_buckets); - l = m; + /* + * Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up: m %u njournal_buckets %u\n", + m, ca->sb.njournal_buckets); + l = m; - while (1) { - if (!l--) - l = ca->sb.njournal_buckets - 1; + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; - if (l == m) - break; + if (l == m) + break; - if (test_bit(l, bitmap)) - continue; + if (test_bit(l, bitmap)) + continue; - if (!read_bucket(l)) - break; - } + if (!read_bucket(l)) + break; + } - seq = 0; + seq = 0; - for (i = 0; i < ca->sb.njournal_buckets; i++) - if (ja->seq[i] > seq) { - seq = ja->seq[i]; - /* - * When journal_reclaim() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = (i + 1) % ca->sb.njournal_buckets; - } - } + } +out: if (!list_empty(list)) c->journal.seq = list_entry(list->prev, struct journal_replay, @@ -330,9 +348,12 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - cache_set_err_on(n != i->j.seq, s, -"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", - n, i->j.seq - 1, start, end); + if (n != i->j.seq) { + pr_err("journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; + } for (k = i->j.start; k < bset_bkey_last(&i->j); @@ -357,7 +378,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) entries++; } - pr_info("journal replay done, %i keys in %i entries, seq %llu", + pr_info("journal replay done, %i keys in %i entries, seq %llu\n", keys, entries, end); err: while (!list_empty(list)) { @@ -369,132 +390,193 @@ err: return ret; } +void bch_journal_space_reserve(struct journal *j) +{ + j->do_reserve = true; +} + /* Journalling */ -#define journal_max_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) -#define journal_min_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { - /* - * Try to find the btree node with that references the oldest journal - * entry, best is our current candidate and is locked if non NULL: - */ - struct btree *b; - int i; + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; + unsigned int i, nr; + int ref_nr; + atomic_t *fifo_front_p, *now_fifo_front_p; + size_t mask; - atomic_long_inc(&c->flush_write); - -retry: - spin_lock(&c->journal.lock); - if (heap_empty(&c->flush_btree)) { - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!heap_full(&c->flush_btree)) - heap_add(&c->flush_btree, b, - journal_max_cmp); - else if (journal_max_cmp(b, - heap_peek(&c->flush_btree))) { - c->flush_btree.data[0] = b; - heap_sift(&c->flush_btree, 0, - journal_max_cmp); - } - } + if (c->journal.btree_flushing) + return; - for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) - heap_sift(&c->flush_btree, i, journal_min_cmp); + spin_lock(&c->journal.flush_write_lock); + if (c->journal.btree_flushing) { + spin_unlock(&c->journal.flush_write_lock); + return; } + c->journal.btree_flushing = true; + spin_unlock(&c->journal.flush_write_lock); - b = NULL; - heap_pop(&c->flush_btree, b, journal_min_cmp); + /* get the oldest journal entry and check its refcount */ + spin_lock(&c->journal.lock); + fifo_front_p = &fifo_front(&c->journal.pin); + ref_nr = atomic_read(fifo_front_p); + if (ref_nr <= 0) { + /* + * do nothing if no btree node references + * the oldest journal entry + */ + spin_unlock(&c->journal.lock); + goto out; + } spin_unlock(&c->journal.lock); - if (b) { + mask = c->journal.pin.mask; + nr = 0; + atomic_long_inc(&c->flush_write); + memset(btree_nodes, 0, sizeof(btree_nodes)); + + mutex_lock(&c->bucket_lock); + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + /* + * It is safe to get now_fifo_front_p without holding + * c->journal.lock here, because we don't need to know + * the exactly accurate value, just check whether the + * front pointer of c->journal.pin is changed. + */ + now_fifo_front_p = &fifo_front(&c->journal.pin); + /* + * If the oldest journal entry is reclaimed and front + * pointer of c->journal.pin changes, it is unnecessary + * to scan c->btree_cache anymore, just quit the loop and + * flush out what we have already. + */ + if (now_fifo_front_p != fifo_front_p) + break; + /* + * quit this loop if all matching btree nodes are + * scanned and record in btree_nodes[] already. 
+ */ + ref_nr = atomic_read(fifo_front_p); + if (nr >= ref_nr) + break; + + if (btree_node_journal_flush(b)) + pr_err("BUG: flush_write bit should not be set here!\n"); + mutex_lock(&b->write_lock); + + if (!btree_node_dirty(b)) { + mutex_unlock(&b->write_lock); + continue; + } + if (!btree_current_write(b)->journal) { mutex_unlock(&b->write_lock); - /* We raced */ - atomic_long_inc(&c->retry_flush_write); - goto retry; + continue; } - __bch_btree_node_write(b, NULL); - mutex_unlock(&b->write_lock); - } -} + /* + * Only select the btree node which exactly references + * the oldest journal entry. + * + * If the journal entry pointed by fifo_front_p is + * reclaimed in parallel, don't worry: + * - the list_for_each_xxx loop will quit when checking + * next now_fifo_front_p. + * - If there are matched nodes recorded in btree_nodes[], + * they are clean now (this is why and how the oldest + * journal entry can be reclaimed). These selected nodes + * will be ignored and skipped in the following for-loop. + */ + if (((btree_current_write(b)->journal - fifo_front_p) & + mask) != 0) { + mutex_unlock(&b->write_lock); + continue; + } -#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) + set_btree_node_journal_flush(b); -static void journal_discard_endio(struct bio *bio) -{ - struct journal_device *ja = - container_of(bio, struct journal_device, discard_bio); - struct cache *ca = container_of(ja, struct cache, journal); + mutex_unlock(&b->write_lock); - atomic_set(&ja->discard_in_flight, DISCARD_DONE); + btree_nodes[nr++] = b; + /* + * To avoid holding c->bucket_lock too long time, + * only scan for BTREE_FLUSH_NR matched btree nodes + * at most. If there are more btree nodes reference + * the oldest journal entry, try to flush them next + * time when btree_flush_write() is called. 
+ */ + if (nr == BTREE_FLUSH_NR) + break; + } + mutex_unlock(&c->bucket_lock); - closure_wake_up(&ca->set->journal.wait); - closure_put(&ca->set->cl); -} + for (i = 0; i < nr; i++) { + b = btree_nodes[i]; + if (!b) { + pr_err("BUG: btree_nodes[%d] is NULL\n", i); + continue; + } -static void journal_discard_work(struct work_struct *work) -{ - struct journal_device *ja = - container_of(work, struct journal_device, discard_work); + /* safe to check without holding b->write_lock */ + if (!btree_node_journal_flush(b)) { + pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); + continue; + } - submit_bio(&ja->discard_bio); -} + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: written by others\n", b); + continue; + } -static void do_journal_discard(struct cache *ca) -{ - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->discard_bio; + if (!btree_node_dirty(b)) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: dirty bit cleaned by others\n", b); + continue; + } - if (!ca->discard) { - ja->discard_idx = ja->last_idx; - return; + __bch_btree_node_write(b, NULL); + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); } - switch (atomic_read(&ja->discard_in_flight)) { - case DISCARD_IN_FLIGHT: - return; - - case DISCARD_DONE: - ja->discard_idx = (ja->discard_idx + 1) % - ca->sb.njournal_buckets; +out: + spin_lock(&c->journal.flush_write_lock); + c->journal.btree_flushing = false; + spin_unlock(&c->journal.flush_write_lock); +} - atomic_set(&ja->discard_in_flight, DISCARD_READY); - /* fallthrough */ +#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) - case DISCARD_READY: - if (ja->discard_idx == ja->last_idx) - return; +static unsigned int free_journal_buckets(struct cache_set *c) +{ + struct journal *j = &c->journal; + struct cache *ca = c->cache; + struct journal_device *ja = &c->cache->journal; + unsigned int n; - atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); + /* In case njournal_buckets is not power of 2 */ + if (ja->cur_idx >= ja->last_idx) + n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx; + else + n = ja->last_idx - ja->cur_idx; - bio_init(bio, bio->bi_inline_vecs, 1); - bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); - bio->bi_iter.bi_sector = bucket_to_sector(ca->set, - ca->sb.d[ja->discard_idx]); - bio_set_dev(bio, ca->bdev); - bio->bi_iter.bi_size = bucket_bytes(ca); - bio->bi_end_io = journal_discard_endio; + if (n > (1 + j->do_reserve)) + return n - (1 + j->do_reserve); - closure_get(&ca->set->cl); - INIT_WORK(&ja->discard_work, journal_discard_work); - queue_work(bch_journal_wq, &ja->discard_work); - } + return 0; } static void journal_reclaim(struct cache_set *c) { struct bkey *k = &c->journal.key; - struct cache *ca; + struct cache *ca = c->cache; uint64_t last_seq; - unsigned int iter, n = 0; + struct journal_device *ja = &ca->journal; atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -506,45 +588,27 @@ static void journal_reclaim(struct cache_set *c) /* Update last_idx */ - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - while (ja->last_idx != ja->cur_idx && - ja->seq[ja->last_idx] < last_seq) - ja->last_idx = (ja->last_idx + 1) % - ca->sb.njournal_buckets; - } - - for_each_cache(ca, c, iter) - do_journal_discard(ca); + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + 
ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; if (c->journal.blocks_free) goto out; - /* - * Allocate: - * XXX: Sort by free journal space - */ - - for_each_cache(ca, c, iter) { - struct journal_device *ja = &ca->journal; - unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; - - /* No space available on this device */ - if (next == ja->discard_idx) - continue; + if (!free_journal_buckets(c)) + goto out; - ja->cur_idx = next; - k->ptr[n++] = MAKE_PTR(0, - bucket_to_sector(c, ca->sb.d[ja->cur_idx]), - ca->sb.nr_this_dev); - } + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + k->ptr[0] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); bkey_init(k); - SET_KEY_PTRS(k, n); + SET_KEY_PTRS(k, 1); + c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits; - if (n) - c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); @@ -571,7 +635,7 @@ void bch_journal_next(struct journal *j) j->cur->data->keys = 0; if (fifo_full(&j->pin)) - pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); + pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin)); } static void journal_write_endio(struct bio *bio) @@ -582,11 +646,11 @@ static void journal_write_endio(struct bio *bio) closure_put(&w->c->journal.io); } -static void journal_write(struct closure *cl); +static CLOSURE_CALLBACK(journal_write); -static void journal_write_done(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_done) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct journal_write *w = (j->cur == j->w) ? &j->w[1] : &j->w[0]; @@ -595,24 +659,24 @@ static void journal_write_done(struct closure *cl) continue_at_nobarrier(cl, journal_write, bch_journal_wq); } -static void journal_write_unlock(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_unlock) __releases(&c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); c->journal.io_in_flight = 0; spin_unlock(&c->journal.lock); } -static void journal_write_unlocked(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_unlocked) __releases(c->journal.lock) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); - struct cache *ca; + closure_type(c, struct cache_set, journal.io); + struct cache *ca = c->cache; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * - c->sb.block_size; + unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * + ca->sb.block_size; struct bio *bio; struct bio_list list; @@ -631,36 +695,32 @@ static void journal_write_unlocked(struct closure *cl) return; } - c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); w->data->btree_level = c->root->level; bkey_copy(&w->data->btree_root, &c->root->key); bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); - for_each_cache(ca, c, i) - w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - - w->data->magic = jset_magic(&c->sb); + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + w->data->magic = jset_magic(&ca->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); for (i = 0; i < 
KEY_PTRS(k); i++) { - ca = PTR_CACHE(c, k, i); + ca = c->cache; bio = &ca->journal.bio; atomic_long_add(sectors, &ca->meta_sectors_written); - bio_reset(bio); + bio_reset(bio, ca->bdev, REQ_OP_WRITE | + REQ_SYNC | REQ_META | REQ_PREFLUSH | REQ_FUA); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); - bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; bio->bi_private = w; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bch_bio_map(bio, w->data); trace_bcache_journal_write(bio, w->data->keys); @@ -671,6 +731,9 @@ static void journal_write_unlocked(struct closure *cl) ca->journal.seq[ca->journal.cur_idx] = w->data->seq; } + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + atomic_dec_bug(&fifo_back(&c->journal.pin)); bch_journal_next(&c->journal); journal_reclaim(c); @@ -683,12 +746,12 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write_done, NULL); } -static void journal_write(struct closure *cl) +static CLOSURE_CALLBACK(journal_write) { - struct cache_set *c = container_of(cl, struct cache_set, journal.io); + closure_type(c, struct cache_set, journal.io); spin_lock(&c->journal.lock); - journal_write_unlocked(cl); + journal_write_unlocked(&cl->work); } static void journal_try_write(struct cache_set *c) @@ -714,6 +777,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, size_t sectors; struct closure cl; bool wait = false; + struct cache *ca = c->cache; closure_init_stack(&cl); @@ -723,10 +787,10 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - block_bytes(c)) * c->sb.block_size; + block_bytes(ca)) * ca->sb.block_size; if (sectors <= min_t(size_t, - c->journal.blocks_free * c->sb.block_size, + c->journal.blocks_free * ca->sb.block_size, PAGE_SECTORS << JSET_BITS)) return w; @@ -787,7 +851,11 @@ atomic_t *bch_journal(struct cache_set *c, struct journal_write *w; atomic_t *ret; - if (!CACHE_SYNC(&c->sb)) + /* No journaling if CACHE_SET_IO_DISABLE set already */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return NULL; + + if (!CACHE_SYNC(&c->cache->sb)) return NULL; w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); @@ -803,8 +871,8 @@ atomic_t *bch_journal(struct cache_set *c, journal_try_write(c); } else if (!w->dirty) { w->dirty = true; - schedule_delayed_work(&c->journal.work, - msecs_to_jiffies(c->journal_delay_ms)); + queue_delayed_work(bch_flush_wq, &c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); spin_unlock(&c->journal.lock); } else { spin_unlock(&c->journal.lock); @@ -831,7 +899,6 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); - free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) @@ -839,6 +906,7 @@ int bch_journal_alloc(struct cache_set *c) struct journal *j = &c->journal; spin_lock_init(&j->lock); + spin_lock_init(&j->flush_write_lock); INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; @@ -846,10 +914,9 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || - !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = 
(void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
		return -ENOMEM;

	return 0;
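
The first pass of bch_journal_read() in the diff above probes journal buckets in "golden ratio hash" order and then falls back to a linear scan over whatever the hash order missed. The following standalone user-space sketch reproduces only that probe order; NBUCKETS and seen[] are illustrative stand-ins for ca->sb.njournal_buckets and the kernel's DECLARE_BITMAP() bitmap, and it keeps probing where the kernel breaks out of the loop:

/*
 * Standalone user-space sketch (not kernel code) of the probe order used by
 * bch_journal_read(): the first pass visits buckets in "golden ratio hash"
 * order (index * 2654435769U, roughly 2^32 divided by the golden ratio,
 * modulo the bucket count), and a second pass linearly covers any bucket
 * the first pass did not reach.
 */
#include <stdbool.h>
#include <stdio.h>

#define NBUCKETS 10U	/* stand-in for ca->sb.njournal_buckets */

int main(void)
{
	bool seen[NBUCKETS] = { false };
	unsigned int i, l;

	for (i = 0; i < NBUCKETS; i++) {
		l = (i * 2654435769U) % NBUCKETS;
		if (seen[l])
			continue;	/* the kernel breaks out of the pass here */
		seen[l] = true;
		printf("hash pass:   bucket %u\n", l);
	}

	/* rough equivalent of the for_each_clear_bit() fallback in the diff */
	for (l = 0; l < NBUCKETS; l++)
		if (!seen[l])
			printf("linear pass: bucket %u\n", l);

	return 0;
}

Compiled with any C compiler, it prints the scattered hash-order visits first and then whichever buckets only the linear fallback reaches, which is why the kernel keeps both passes.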

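The reclaim side of the diff drops the journal discard state machine (do_journal_discard() and friends) and adds free_journal_buckets(), which measures how far cur_idx has advanced past last_idx around the circular bucket array and, once bch_journal_space_reserve() has set do_reserve, holds one extra bucket back. Below is a minimal sketch of that arithmetic, assuming a made-up demo_journal struct and example values rather than the kernel's journal/journal_device types:

/*
 * User-space sketch (not kernel code) of the accounting done by
 * free_journal_buckets() in the diff above.  cur_idx and last_idx are
 * positions in a circular array of njournal_buckets journal buckets, and
 * one extra bucket is held back once do_reserve is set.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_journal {
	unsigned int njournal_buckets;	/* size of the ring */
	unsigned int cur_idx;		/* bucket currently being written */
	unsigned int last_idx;		/* oldest bucket still needed */
	bool do_reserve;		/* set by bch_journal_space_reserve() */
};

static unsigned int demo_free_buckets(const struct demo_journal *j)
{
	unsigned int n;

	/* ring distance; handles bucket counts that are not powers of two */
	if (j->cur_idx >= j->last_idx)
		n = j->njournal_buckets + j->last_idx - j->cur_idx;
	else
		n = j->last_idx - j->cur_idx;

	/* always keep the in-use bucket, plus one reserved bucket if asked */
	if (n > (1 + j->do_reserve))
		return n - (1 + j->do_reserve);

	return 0;
}

int main(void)
{
	struct demo_journal j = {
		.njournal_buckets = 10, .cur_idx = 7, .last_idx = 3,
	};

	j.do_reserve = false;
	printf("free without reserve: %u\n", demo_free_buckets(&j));
	j.do_reserve = true;
	printf("free with reserve:    %u\n", demo_free_buckets(&j));
	return 0;
}

With 10 buckets, cur_idx 7 and last_idx 3, the ring distance is 6, so the sketch reports 5 free buckets normally and 4 once the reserve is in effect.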