diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-01 10:39:57 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-01 10:39:57 -0700 |
commit | 694752922b12bd318aa80191bd9d8c3dcfb39055 (patch) | |
tree | 5afe83fd99100bea546dd5a1c1f778c58f41e5c0 /drivers/lightnvm/pblk-rl.c | |
parent | a351e9b9fc24e982ec2f0e76379a49826036da12 (diff) | |
parent | 9438b3e080beccf6022138ea62192d55cc7dc4ed (diff) |
Merge branch 'for-4.12/block' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe:
- Add BFQ IO scheduler under the new blk-mq scheduling framework. BFQ
was initially a fork of CFQ, but subsequently changed to implement
fairness based on B-WF2Q+, a modified variant of WF2Q. BFQ is meant
to be used on desktop type single drives, providing good fairness.
From Paolo.
- Add Kyber IO scheduler. This is a full multiqueue aware scheduler,
using a scalable token based algorithm that throttles IO based on
live completion IO stats, similary to blk-wbt. From Omar.
- A series from Jan, moving users to separately allocated backing
devices. This continues the work of separating backing device life
times, solving various problems with hot removal.
- A series of updates for lightnvm, mostly from Javier. Includes a
'pblk' target that exposes an open channel SSD as a physical block
device.
- A series of fixes and improvements for nbd from Josef.
- A series from Omar, removing queue sharing between devices on mostly
legacy drivers. This helps us clean up other bits, if we know that a
queue only has a single device backing. This has been overdue for
more than a decade.
- Fixes for the blk-stats, and improvements to unify the stats and user
windows. This both improves blk-wbt, and enables other users to
register a need to receive IO stats for a device. From Omar.
- blk-throttle improvements from Shaohua. This provides a scalable
framework for implementing scalable priotization - particularly for
blk-mq, but applicable to any type of block device. The interface is
marked experimental for now.
- Bucketized IO stats for IO polling from Stephen Bates. This improves
efficiency of polled workloads in the presence of mixed block size
IO.
- A few fixes for opal, from Scott.
- A few pulls for NVMe, including a lot of fixes for NVMe-over-fabrics.
From a variety of folks, mostly Sagi and James Smart.
- A series from Bart, improving our exposed info and capabilities from
the blk-mq debugfs support.
- A series from Christoph, cleaning up how handle WRITE_ZEROES.
- A series from Christoph, cleaning up the block layer handling of how
we track errors in a request. On top of being a nice cleanup, it also
shrinks the size of struct request a bit.
- Removal of mg_disk and hd (sorry Linus) by Christoph. The former was
never used by platforms, and the latter has outlived it's usefulness.
- Various little bug fixes and cleanups from a wide variety of folks.
* 'for-4.12/block' of git://git.kernel.dk/linux-block: (329 commits)
block: hide badblocks attribute by default
blk-mq: unify hctx delay_work and run_work
block: add kblock_mod_delayed_work_on()
blk-mq: unify hctx delayed_run_work and run_work
nbd: fix use after free on module unload
MAINTAINERS: bfq: Add Paolo as maintainer for the BFQ I/O scheduler
blk-mq-sched: alloate reserved tags out of normal pool
mtip32xx: use runtime tag to initialize command header
scsi: Implement blk_mq_ops.show_rq()
blk-mq: Add blk_mq_ops.show_rq()
blk-mq: Show operation, cmd_flags and rq_flags names
blk-mq: Make blk_flags_show() callers append a newline character
blk-mq: Move the "state" debugfs attribute one level down
blk-mq: Unregister debugfs attributes earlier
blk-mq: Only unregister hctxs for which registration succeeded
blk-mq-debugfs: Rename functions for registering and unregistering the mq directory
blk-mq: Let blk_mq_debugfs_register() look up the queue name
blk-mq: Register <dev>/queue/mq after having registered <dev>/queue
ide-pm: always pass 0 error to ide_complete_rq in ide_do_devset
ide-pm: always pass 0 error to __blk_end_request_all
..
Diffstat (limited to 'drivers/lightnvm/pblk-rl.c')
-rw-r--r-- | drivers/lightnvm/pblk-rl.c | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c new file mode 100644 index 000000000000..ab7cbb144f3f --- /dev/null +++ b/drivers/lightnvm/pblk-rl.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-rl.c - pblk's rate limiter for user I/O + * + */ + +#include "pblk.h" + +static void pblk_rl_kick_u_timer(struct pblk_rl *rl) +{ + mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); +} + +int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) +{ + int rb_user_cnt = atomic_read(&rl->rb_user_cnt); + + return (!(rb_user_cnt + nr_entries > rl->rb_user_max)); +} + +int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) +{ + int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); + int rb_user_active; + + /* If there is no user I/O let GC take over space on the write buffer */ + rb_user_active = READ_ONCE(rl->rb_user_active); + return (!(rb_gc_cnt + nr_entries > rl->rb_gc_max && rb_user_active)); +} + +void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) +{ + atomic_add(nr_entries, &rl->rb_user_cnt); + + /* Release user I/O state. Protect from GC */ + smp_store_release(&rl->rb_user_active, 1); + pblk_rl_kick_u_timer(rl); +} + +void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) +{ + atomic_add(nr_entries, &rl->rb_gc_cnt); +} + +void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) +{ + atomic_sub(nr_user, &rl->rb_user_cnt); + atomic_sub(nr_gc, &rl->rb_gc_cnt); +} + +unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) +{ + return atomic_read(&rl->free_blocks); +} + +/* + * We check for (i) the number of free blocks in the current LUN and (ii) the + * total number of free blocks in the pblk instance. This is to even out the + * number of free blocks on each LUN when GC kicks in. + * + * Only the total number of free blocks is used to configure the rate limiter. + */ +static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) +{ + unsigned long free_blocks = pblk_rl_nr_free_blks(rl); + + if (free_blocks >= rl->high) { + rl->rb_user_max = max - rl->rb_gc_rsv; + rl->rb_gc_max = rl->rb_gc_rsv; + rl->rb_state = PBLK_RL_HIGH; + } else if (free_blocks < rl->high) { + int shift = rl->high_pw - rl->rb_windows_pw; + int user_windows = free_blocks >> shift; + int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW; + int gc_max; + + rl->rb_user_max = user_max; + gc_max = max - rl->rb_user_max; + rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv); + + if (free_blocks > rl->low) + rl->rb_state = PBLK_RL_MID; + else + rl->rb_state = PBLK_RL_LOW; + } + + return rl->rb_state; +} + +void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv) +{ + rl->rb_gc_rsv = rl->rb_gc_max = rsv; +} + +void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) +{ + struct pblk *pblk = container_of(rl, struct pblk, rl); + int blk_in_line = atomic_read(&line->blk_in_line); + int ret; + + atomic_add(blk_in_line, &rl->free_blocks); + /* Rates will not change that often - no need to lock update */ + ret = pblk_rl_update_rates(rl, rl->rb_budget); + + if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); +} + +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) +{ + struct pblk *pblk = container_of(rl, struct pblk, rl); + int blk_in_line = atomic_read(&line->blk_in_line); + int ret; + + atomic_sub(blk_in_line, &rl->free_blocks); + + /* Rates will not change that often - no need to lock update */ + ret = pblk_rl_update_rates(rl, rl->rb_budget); + if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); +} + +int pblk_rl_gc_thrs(struct pblk_rl *rl) +{ + return rl->high; +} + +int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) +{ + return rl->rb_user_max; +} + +static void pblk_rl_u_timer(unsigned long data) +{ + struct pblk_rl *rl = (struct pblk_rl *)data; + + /* Release user I/O state. Protect from GC */ + smp_store_release(&rl->rb_user_active, 0); +} + +void pblk_rl_free(struct pblk_rl *rl) +{ + del_timer(&rl->u_timer); +} + +void pblk_rl_init(struct pblk_rl *rl, int budget) +{ + unsigned int rb_windows; + + rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; + rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; + rl->high_pw = get_count_order(rl->high); + + /* This will always be a power-of-2 */ + rb_windows = budget / PBLK_MAX_REQ_ADDRS; + rl->rb_windows_pw = get_count_order(rb_windows) + 1; + + /* To start with, all buffer is available to user I/O writers */ + rl->rb_budget = budget; + rl->rb_user_max = budget; + atomic_set(&rl->rb_user_cnt, 0); + rl->rb_gc_max = 0; + rl->rb_state = PBLK_RL_HIGH; + atomic_set(&rl->rb_gc_cnt, 0); + + setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl); + rl->rb_user_active = 0; +} |