summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-09 12:49:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-09 12:49:01 -0700
commit126e76ffbf78d9e948b641aadb265d16c57f5a3d (patch)
tree656e7838f0ec057936b80e15a774911df05c6005 /drivers/md
parentfbd01410e89a66f346ba1b3c0161e1198449b746 (diff)
parent175206cf9ab63161dec74d9cd7f9992e062491f5 (diff)
Merge branch 'for-4.14/block-postmerge' of git://git.kernel.dk/linux-block
Pull followup block layer updates from Jens Axboe: "I ended up splitting the main pull request for this series into two, mainly because of clashes between NVMe fixes that went into 4.13 after the for-4.14 branches were split off. This pull request is mostly NVMe, but not exclusively. In detail, it contains: - Two pull request for NVMe changes from Christoph. Nothing new on the feature front, basically just fixes all over the map for the core bits, transport, rdma, etc. - Series from Bart, cleaning up various bits in the BFQ scheduler. - Series of bcache fixes, which has been lingering for a release or two. Coly sent this in, but patches from various people in this area. - Set of patches for BFQ from Paolo himself, updating both documentation and fixing some corner cases in performance. - Series from Omar, attempting to now get the 4k loop support correct. Our confidence level is higher this time. - Series from Shaohua for loop as well, improving O_DIRECT performance and fixing a use-after-free" * 'for-4.14/block-postmerge' of git://git.kernel.dk/linux-block: (74 commits) bcache: initialize dirty stripes in flash_dev_run() loop: set physical block size to logical block size bcache: fix bch_hprint crash and improve output bcache: Update continue_at() documentation bcache: silence static checker warning bcache: fix for gc and write-back race bcache: increase the number of open buckets bcache: Correct return value for sysfs attach errors bcache: correct cache_dirty_target in __update_writeback_rate() bcache: gc does not work when triggering by manual command bcache: Don't reinvent the wheel but use existing llist API bcache: do not subtract sectors_to_gc for bypassed IO bcache: fix sequential large write IO bypass bcache: Fix leak of bdev reference block/loop: remove unused field block/loop: fix use after free bfq: Use icq_to_bic() consistently bfq: Suppress compiler warnings about comparisons bfq: Check kstrtoul() return value bfq: Declare local functions static ...
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/alloc.c4
-rw-r--r--drivers/md/bcache/bcache.h1
-rw-r--r--drivers/md/bcache/closure.c15
-rw-r--r--drivers/md/bcache/closure.h4
-rw-r--r--drivers/md/bcache/request.c12
-rw-r--r--drivers/md/bcache/super.c10
-rw-r--r--drivers/md/bcache/sysfs.c19
-rw-r--r--drivers/md/bcache/util.c50
-rw-r--r--drivers/md/bcache/writeback.c20
-rw-r--r--drivers/md/bcache/writeback.h21
10 files changed, 99 insertions, 57 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1ccd8d..cacbe2dbd5c3 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
#include <linux/random.h>
#include <trace/events/bcache.h>
+#define MAX_OPEN_BUCKETS 128
+
/* Bucket heap / gen */
uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
spin_lock_init(&c->data_bucket_lock);
- for (i = 0; i < 6; i++) {
+ for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
if (!b)
return -ENOMEM;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dee542fff68e..2ed9bd231d84 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -333,6 +333,7 @@ struct cached_dev {
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
struct task_struct *writeback_thread;
+ struct workqueue_struct *writeback_write_wq;
struct keybuf writeback_keys;
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 864e673aec39..7d5286b05036 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -70,21 +70,10 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
list = llist_del_all(&wait_list->list);
/* We first reverse the list to preserve FIFO ordering and fairness */
-
- while (list) {
- struct llist_node *t = list;
- list = llist_next(list);
-
- t->next = reverse;
- reverse = t;
- }
+ reverse = llist_reverse_order(list);
/* Then do the wakeups */
-
- while (reverse) {
- cl = container_of(reverse, struct closure, list);
- reverse = llist_next(reverse);
-
+ llist_for_each_entry(cl, reverse, list) {
closure_set_waiting(cl, 0);
closure_sub(cl, CLOSURE_WAITING + 1);
}
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 1ec84ca81146..295b7e43f92c 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
* been dropped with closure_put()), it will resume execution at @fn running out
* of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
*
- * NOTE: This macro expands to a return in the calling function!
- *
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
@@ -340,8 +338,6 @@ do { \
* Causes @fn to be executed out of @cl, in @wq context (or called directly if
* @wq is NULL).
*
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
* The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
* thus it's not safe to touch anything protected by @cl after a
* continue_at_nobarrier().
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 0e1463d0c334..681b4f12b05a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
- if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
- wake_up_gc(op->c);
-
if (op->bypass)
return bch_data_invalidate(cl);
+ if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+ wake_up_gc(op->c);
+
/*
* Journal writes are marked REQ_PREFLUSH; if the original write was a
* flush, it'll wait on the journal write.
@@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
if (!congested && !dc->sequential_cutoff)
goto rescale;
- if (!congested &&
- mode == CACHE_MODE_WRITEBACK &&
- op_is_write(bio->bi_opf) &&
- op_is_sync(bio->bi_opf))
- goto rescale;
-
spin_lock(&dc->io_lock);
hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 974d832e54a6..fc0a31b13ac4 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1026,7 +1026,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
- bch_sectors_dirty_init(dc);
+ bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
atomic_inc(&dc->count);
bch_writeback_queue(dc);
@@ -1059,6 +1059,8 @@ static void cached_dev_free(struct closure *cl)
cancel_delayed_work_sync(&dc->writeback_rate_update);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
+ if (dc->writeback_write_wq)
+ destroy_workqueue(dc->writeback_write_wq);
mutex_lock(&bch_register_lock);
@@ -1228,6 +1230,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
goto err;
bcache_device_attach(d, c, u - c->uuids);
+ bch_sectors_dirty_init(d);
bch_flash_dev_request_init(d);
add_disk(d->disk);
@@ -1374,9 +1377,6 @@ static void cache_set_flush(struct closure *cl)
struct btree *b;
unsigned i;
- if (!c)
- closure_return(cl);
-
bch_cache_accounting_destroy(&c->accounting);
kobject_put(&c->internal);
@@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
else
err = "device busy";
mutex_unlock(&bch_register_lock);
+ if (!IS_ERR(bdev))
+ bdput(bdev);
if (attr == &ksysfs_register_quiet)
goto out;
}
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index f90f13616980..104c57cd666c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -192,7 +192,7 @@ STORE(__cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
- unsigned v = size;
+ ssize_t v = size;
struct cache_set *c;
struct kobj_uevent_env *env;
@@ -227,7 +227,7 @@ STORE(__cached_dev)
bch_cached_dev_run(dc);
if (attr == &sysfs_cache_mode) {
- ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+ v = bch_read_string_list(buf, bch_cache_modes + 1);
if (v < 0)
return v;
@@ -615,8 +615,21 @@ STORE(__bch_cache_set)
bch_cache_accounting_clear(&c->accounting);
}
- if (attr == &sysfs_trigger_gc)
+ if (attr == &sysfs_trigger_gc) {
+ /*
+ * Garbage collection thread only works when sectors_to_gc < 0,
+ * when users write to sysfs entry trigger_gc, most of time
+ * they want to forcibly triger gargage collection. Here -1 is
+ * set to c->sectors_to_gc, to make gc_should_run() give a
+ * chance to permit gc thread to run. "give a chance" means
+ * before going into gc_should_run(), there is still chance
+ * that c->sectors_to_gc being set to other positive value. So
+ * writing sysfs entry trigger_gc won't always make sure gc
+ * thread takes effect.
+ */
+ atomic_set(&c->sectors_to_gc, -1);
wake_up_gc(c);
+ }
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 8c3a938f4bf0..176d3c2ef5f5 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -74,24 +74,44 @@ STRTO_H(strtouint, unsigned int)
STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
+/**
+ * bch_hprint() - formats @v to human readable string for sysfs.
+ *
+ * @v - signed 64 bit integer
+ * @buf - the (at least 8 byte) buffer to format the result into.
+ *
+ * Returns the number of bytes used by format.
+ */
ssize_t bch_hprint(char *buf, int64_t v)
{
static const char units[] = "?kMGTPEZY";
- char dec[4] = "";
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0 << 10);
- v >>= 10;
- }
-
- if (!u)
- return sprintf(buf, "%llu", v);
-
- if (v < 100 && v > -100)
- snprintf(dec, sizeof(dec), ".%i", t / 100);
-
- return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+ int u = 0, t;
+
+ uint64_t q;
+
+ if (v < 0)
+ q = -v;
+ else
+ q = v;
+
+ /* For as long as the number is more than 3 digits, but at least
+ * once, shift right / divide by 1024. Keep the remainder for
+ * a digit after the decimal point.
+ */
+ do {
+ u++;
+
+ t = q & ~(~0 << 10);
+ q >>= 10;
+ } while (q >= 1000);
+
+ if (v < 0)
+ /* '-', up to 3 digits, '.', 1 digit, 1 character, null;
+ * yields 8 bytes.
+ */
+ return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]);
+ else
+ return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
}
ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c49022a8dc9d..e663ca082183 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -21,7 +21,8 @@
static void __update_writeback_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
- uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+ bcache_flash_devs_sectors_dirty(c);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
@@ -186,7 +187,7 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl);
- continue_at(cl, write_dirty_finish, system_wq);
+ continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
static void read_dirty_endio(struct bio *bio)
@@ -206,7 +207,7 @@ static void read_dirty_submit(struct closure *cl)
closure_bio_submit(&io->bio, cl);
- continue_at(cl, write_dirty, system_wq);
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
static void read_dirty(struct cached_dev *dc)
@@ -481,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
return MAP_CONTINUE;
}
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
{
struct sectors_dirty_init op;
bch_btree_op_init(&op.op, -1);
- op.inode = dc->disk.id;
+ op.inode = d->id;
- bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+ bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
sectors_dirty_init_fn, 0);
- dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+ d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -515,6 +516,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
+ dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
+ WQ_MEM_RECLAIM, 0);
+ if (!dc->writeback_write_wq)
+ return -ENOMEM;
+
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
"bcache_writeback");
if (IS_ERR(dc->writeback_thread))
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 629bd1a502fd..e35421d20d2e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
+static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
+{
+ uint64_t i, ret = 0;
+
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < c->nr_uuids; i++) {
+ struct bcache_device *d = c->devices[i];
+
+ if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+ ret += bcache_dev_sectors_dirty(d);
+ }
+
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
static inline unsigned offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
@@ -84,7 +103,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
void bch_cached_dev_writeback_init(struct cached_dev *);
int bch_cached_dev_writeback_start(struct cached_dev *);