From 4e5dff41be7b5201c1c47ceb3a2a8d698516bc2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Nov 2017 10:24:58 -0700 Subject: blk-mq: improve heavily contended tag case Even with a number of waitqueues, we can get into a situation where we are heavily contended on the waitqueue lock. I got a report on spc1 where we're spending seconds doing this. Arguably the use case is nasty, I reproduce it with one device and 1000 threads banging on the device. But that doesn't mean we shouldn't be handling it better. What ends up happening is that a thread will fail to get a tag, add itself to the waitqueue, and subsequently get woken up when a tag is freed - only to find itself going back to sleep on the waitqueue. Instead of waking all threads, use an exclusive wait and wake up our sbitmap batch count instead. This seems to work well for me (massive improvement for this use case), and it survives basic testing. But I haven't fully verified it yet. An additional improvement is running the queue and checking for a new tag BEFORE needing to add ourselves to the waitqueue. Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c81b40ecd3f1..336dde07b230 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); drop_ctx = data->ctx == NULL; do { - prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - - tag = __blk_mq_get_tag(data, bt); - if (tag != -1) - break; - /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for @@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; + prepare_to_wait_exclusive(&ws->wait, &wait, + TASK_UNINTERRUPTIBLE); + + tag = __blk_mq_get_tag(data, bt); + if (tag != -1) + break; + if (data->ctx) blk_mq_put_ctx(data->ctx); -- cgit v1.2.3-58-ga151 From 913a9500b94566351e8f920e7f2501c8124205b1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 5 Jan 2018 00:09:06 -0700 Subject: blk-mq: remove confusing comment of blk_mq_sched_dispatch_requests Commit de1482974080 ("blk-mq: introduce .get_budget and .put_budget in blk_mq_ops") changes the function to return bool type, and then commit 1f460b63d4b3 ("blk-mq: don't restart queue when .get_budget returns BLK_STS_RESOURCE") changes it back to void, but the comment remains. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c117bd8fd1f6..2ff7cf0cbf73 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) WRITE_ONCE(hctx->dispatch_from, ctx); } -/* return true if hw queue need to be run again */ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; -- cgit v1.2.3-58-ga151 From 6cc77e9cb08041627fe1d32ac3a743249deb8167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Dec 2017 15:43:38 +0900 Subject: block: introduce zoned block devices zone write locking Components relying only on the request_queue structure for accessing block devices (e.g. I/O schedulers) have a limited knowledged of the device characteristics. In particular, the device capacity cannot be easily discovered, which for a zoned block device also result in the inability to easily know the number of zones of the device (the zone size is indicated by the chunk_sectors field of the queue limits). Introduce the nr_zones field to the request_queue structure to simplify access to this information. Also, add the bitmap seq_zone_bitmap which indicates which zones of the device are sequential zones (write preferred or write required) and the bitmap seq_zones_wlock which indicates if a zone is write locked, that is, if a write request targeting a zone was dispatched to the device. These fields are initialized by the low level block device driver (sd.c for ZBC/ZAC disks). They are not initialized by stacking drivers (device mappers) handling zoned block devices (e.g. dm-linear). Using this, I/O schedulers can introduce zone write locking to control request dispatching to a zoned block device and avoid write request reordering by limiting to at most a single write request per zone outside of the scheduler at any time. Based on previous patches from Damien Le Moal. Signed-off-by: Christoph Hellwig [Damien] * Fixed comments and identation in blkdev.h * Changed helper functions * Fixed this commit message Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-zoned.c | 42 +++++++++++++++++++ include/linux/blkdev.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b8881750a3ac..e6e5bbc4c366 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1641,6 +1641,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) lockdep_assert_held(q->queue_lock); + blk_req_zone_write_unlock(req); blk_pm_put_request(req); elv_completed_request(q, req); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ff57fb51b338..acb7252c7e81 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -21,6 +21,48 @@ static inline sector_t blk_zone_start(struct request_queue *q, return sector & ~zone_mask; } +/* + * Return true if a request is a write requests that needs zone write locking. + */ +bool blk_req_needs_zone_write_lock(struct request *rq) +{ + if (!rq->q->seq_zones_wlock) + return false; + + if (blk_rq_is_passthrough(rq)) + return false; + + switch (req_op(rq)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return blk_rq_zone_is_seq(rq); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); + +void __blk_req_zone_write_lock(struct request *rq) +{ + if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock))) + return; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); + +void __blk_req_zone_write_unlock(struct request *rq) +{ + rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; + if (rq->q->seq_zones_wlock) + WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock)); +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + /* * Check that a zone report belongs to the partition. * If yes, fix its start sector and write pointer, copy it in the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..46e606f5b44b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -546,6 +548,22 @@ struct request_queue { struct queue_limits limits; + /* + * Zoned block device information for request dispatch control. + * nr_zones is the total number of zones of the device. This is always + * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones + * bits which indicates if a zone is conventional (bit clear) or + * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones + * bits which indicates if a zone is write locked, that is, if a write + * request targeting the zone was dispatched. All three fields are + * initialized by the low level device driver (e.g. scsi/sd.c). + * Stacking drivers (device mappers) may or may not initialize + * these fields. + */ + unsigned int nr_zones; + unsigned long *seq_zones_bitmap; + unsigned long *seq_zones_wlock; + /* * sg stuff */ @@ -790,6 +808,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } +static inline unsigned int blk_queue_nr_zones(struct request_queue *q) +{ + return q->nr_zones; +} + +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q)) + return 0; + return sector >> ilog2(q->limits.chunk_sectors); +} + +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) + return false; + return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); +} + static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); @@ -1029,6 +1068,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such @@ -1578,7 +1627,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) if (q) return blk_queue_zone_sectors(q); + return 0; +} +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_nr_zones(q); return 0; } @@ -1954,6 +2011,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +#ifdef CONFIG_BLK_DEV_ZONED +bool blk_req_needs_zone_write_lock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3-58-ga151 From bf09ce56f0e654b94d980b9aa89e3fce78887e01 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Dec 2017 15:43:39 +0900 Subject: mq-deadline: Introduce dispatch helpers Avoid directly referencing the next_rq and fifo_list arrays using the helper functions deadline_next_request() and deadline_fifo_request() to facilitate changes in the dispatch request selection in __dd_dispatch_request() for zoned block devices. Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/mq-deadline.c | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 0179e484ec98..8bd6db9e69c7 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -191,6 +191,35 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) return 0; } +/* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + return rq_entry_fifo(dd->fifo_list[data_dir].next); +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + return dd->next_rq[data_dir]; +} + /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc @@ -198,7 +227,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; + struct request *rq, *next_rq; bool reads, writes; int data_dir; @@ -214,10 +243,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; - else - rq = dd->next_rq[READ]; + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -260,19 +288,20 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = deadline_fifo_request(dd, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = next_rq; } dd->batching = 0; -- cgit v1.2.3-58-ga151 From 5700f69178e91a6b21250049b86148ed5e9550c1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Dec 2017 15:43:40 +0900 Subject: mq-deadline: Introduce zone locking support Introduce zone write locking to avoid write request reordering with zoned block devices. This is achieved using a finer selection of the next request to dispatch: 1) Any non-write request is always allowed to proceed. 2) Any write to a conventional zone is always allowed to proceed. 3) For a write to a sequential zone, the zone lock is first checked. a) If the zone is not locked, the write is allowed to proceed after its target zone is locked. b) If the zone is locked, the write request is skipped and the next request in the dispatch queue tested (back to step 1). For a write request that has locked its target zone, the zone is unlocked either when the request completes with a call to the method deadline_request_completed() or when the request is requeued using dd_insert_request(). Requests targeting a locked zone are always left in the scheduler queue to preserve the lba ordering for write requests. If no write request can be dispatched, allow reads to be dispatched even if the write batch is not done. If the device used is not a zoned block device, or if zoned block device support is disabled, this patch does not modify mq-deadline behavior. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/mq-deadline.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 8bd6db9e69c7..d56972e8ebda 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -59,6 +59,7 @@ struct deadline_data { int front_merges; spinlock_t lock; + spinlock_t zone_lock; struct list_head dispatch; }; @@ -198,13 +199,33 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) static struct request * deadline_fifo_request(struct deadline_data *dd, int data_dir) { + struct request *rq; + unsigned long flags; + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) return NULL; if (list_empty(&dd->fifo_list[data_dir])) return NULL; - return rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + goto out; + } + rq = NULL; +out: + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; } /* @@ -214,10 +235,32 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir) static struct request * deadline_next_request(struct deadline_data *dd, int data_dir) { + struct request *rq; + unsigned long flags; + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) return NULL; - return dd->next_rq[data_dir]; + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + break; + rq = deadline_latter_request(rq); + } + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; } /* @@ -259,7 +302,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -304,6 +348,13 @@ dispatch_find_request: rq = next_rq; } + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return NULL; + dd->batching = 0; dispatch_request: @@ -313,6 +364,10 @@ dispatch_request: dd->batching++; deadline_move_request(dd, rq); done: + /* + * If the request needs its target zone locked, do it. + */ + blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } @@ -368,6 +423,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; @@ -424,6 +480,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -468,6 +530,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } +/* + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * while deadline_next_request() are executing. + */ +static void dd_completed_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (blk_queue_is_zoned(q)) { + struct deadline_data *dd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; @@ -669,6 +751,7 @@ static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, + .completed_request = dd_completed_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, -- cgit v1.2.3-58-ga151 From c117bac70133dbff9ed7fcbd91ef82b4ee518797 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Dec 2017 15:43:41 +0900 Subject: deadline-iosched: Introduce dispatch helpers Avoid directly referencing the next_rq and fifo_list arrays using the helper functions deadline_next_request() and deadline_fifo_request() to facilitate changes in the dispatch request selection in deadline_dispatch_requests() for zoned block devices. While at it, also remove the unnecessary forward declaration of the function deadline_move_request(). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/deadline-iosched.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b83f77460d28..81e3f0897457 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -50,8 +50,6 @@ struct deadline_data { int front_merges; }; -static void deadline_move_request(struct deadline_data *, struct request *); - static inline struct rb_root * deadline_rb_root(struct deadline_data *dd, struct request *rq) { @@ -230,6 +228,35 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) return 0; } +/* + * For the specified data direction, return the next request to dispatch using + * arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + return rq_entry_fifo(dd->fifo_list[data_dir].next); +} + +/* + * For the specified data direction, return the next request to dispatch using + * sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + return dd->next_rq[data_dir]; +} + /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc @@ -239,16 +266,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) struct deadline_data *dd = q->elevator->elevator_data; const int reads = !list_empty(&dd->fifo_list[READ]); const int writes = !list_empty(&dd->fifo_list[WRITE]); - struct request *rq; + struct request *rq, *next_rq; int data_dir; /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; - else - rq = dd->next_rq[READ]; + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -291,19 +317,20 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = deadline_fifo_request(dd, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = next_rq; } dd->batching = 0; -- cgit v1.2.3-58-ga151 From 8dc8146f9c92c17caa3c50f979d351c87ed372f8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 21 Dec 2017 15:43:42 +0900 Subject: deadline-iosched: Introduce zone locking support Introduce zone write locking to avoid write request reordering with zoned block devices. This is achieved using a finer selection of the next request to dispatch: 1) Any non-write request is always allowed to proceed. 2) Any write to a conventional zone is always allowed to proceed. 3) For a write to a sequential zone, the zone lock is first checked. a) If the zone is not locked, the write is allowed to proceed after its target zone is locked. b) If the zone is locked, the write request is skipped and the next request in the dispatch queue tested (back to step 1). For a write request that has locked its target zone, the zone is unlocked either when the request completes and the method deadline_request_completed() is called, or when the request is requeued using the method deadline_add_request(). Requests targeting a locked zone are always left in the scheduler queue to preserve the initial write order. If no write request can be dispatched, allow reads to be dispatched even if the write batch is not done. If the device used is not a zoned block device, or if zoned block device support is disabled, this patch does not modify deadline behavior. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/deadline-iosched.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 81e3f0897457..9de9f156e203 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -98,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq) struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + deadline_add_rq_rb(dd, rq); /* @@ -188,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) { struct request_queue *q = rq->q; + /* + * For a zoned block device, write requests must write lock their + * target zone. + */ + blk_req_zone_write_lock(rq); + deadline_remove_request(q, rq); elv_dispatch_add_tail(q, rq); } @@ -235,13 +247,28 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) static struct request * deadline_fifo_request(struct deadline_data *dd, int data_dir) { + struct request *rq; + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) return NULL; if (list_empty(&dd->fifo_list[data_dir])) return NULL; - return rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + return rq; + } + + return NULL; } /* @@ -251,10 +278,29 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir) static struct request * deadline_next_request(struct deadline_data *dd, int data_dir) { + struct request *rq; + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) return NULL; - return dd->next_rq[data_dir]; + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + return rq; + rq = deadline_latter_request(rq); + } + + return NULL; } /* @@ -288,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -333,6 +380,13 @@ dispatch_find_request: rq = next_rq; } + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return 0; + dd->batching = 0; dispatch_request: @@ -345,6 +399,16 @@ dispatch_request: return 1; } +/* + * For zoned block devices, write unlock the target zone of completed + * write requests. + */ +static void +deadline_completed_request(struct request_queue *q, struct request *rq) +{ + blk_req_zone_write_unlock(rq); +} + static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -466,6 +530,7 @@ static struct elevator_type iosched_deadline = { .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, + .elevator_completed_req_fn = deadline_completed_request, .elevator_add_req_fn = deadline_add_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, -- cgit v1.2.3-58-ga151 From f0ba5ea2fe45c0ad24a7dedae84a97f7aa046494 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 20 Dec 2017 17:27:36 +0100 Subject: block, bfq: increase threshold to deem I/O as random If two processes do I/O close to each other, i.e., are cooperating processes in BFQ (and CFQ'S) nomenclature, then BFQ merges their associated bfq_queues, so as to get sequential I/O from the union of the I/O requests of the processes, and thus reach a higher throughput. A merged queue is then split if its I/O stops being sequential. In this respect, BFQ deems the I/O of a bfq_queue as (mostly) sequential only if less than 4 I/O requests are random, out of the last 32 requests inserted into the queue. Unfortunately, extensive testing (with the interleaved_io benchmark of the S suite [1], and with real applications spawning cooperating processes) has clearly shown that, with such a low threshold, only a rather low I/O throughput may be reached when several cooperating processes do I/O. In particular, the outcome of each test run was bimodal: if queue merging occurred and was stable during the test, then the throughput was close to the peak rate of the storage device, otherwise the throughput was arbitrarily low (usually around 1/10 of the peak rate with a rotational device). The probability to get the unlucky outcomes grew with the number of cooperating processes: it was already significant with 5 processes, and close to one with 7 or more processes. The cause of the low throughput in the unlucky runs was that the merged queues containing the I/O of these cooperating processes were soon split, because they contained more random I/O requests than those tolerated by the 4/32 threshold, but - that I/O would have however allowed the storage device to reach peak throughput or almost peak throughput; - in contrast, the I/O of these processes, if served individually (from separate queues) yielded a rather low throughput. So we repeated our tests with increasing values of the threshold, until we found the minimum value (19) for which we obtained maximum throughput, reliably, with at least up to 9 cooperating processes. Then we checked that the use of that higher threshold value did not cause any regression for any other benchmark in the suite [1]. This commit raises the threshold to such a higher value. [1] https://github.com/Algodev-github/S Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bcb6d21baf12..0f48583b9380 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -178,7 +178,7 @@ static struct kmem_cache *bfq_pool; #define BFQQ_SEEK_THR (sector_t)(8 * 100) #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) /* Min number of samples required to perform peak-rate update */ #define BFQ_RATE_MIN_SAMPLES 32 -- cgit v1.2.3-58-ga151 From 05e90283561648301e30232fe0c91bd345ceba03 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 20 Dec 2017 12:38:31 +0100 Subject: block, bfq: add missing rq_pos_tree update on rq removal If two processes do I/O close to each other, then BFQ merges the bfq_queues associated with these processes, to get a more sequential I/O, and thus a higher throughput. In this respect, to detect whether two processes are doing I/O close to each other, BFQ keeps a list of the head-of-line I/O requests of all active bfq_queues. The list is ordered by initial sectors, and implemented through a red-black tree (rq_pos_tree). Unfortunately, the update of the rq_pos_tree was incomplete, because the tree was not updated on the removal of the head-of-line I/O request of a bfq_queue, in case the queue did not remain empty. This commit adds the missing update. Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0f48583b9380..fa395a260a23 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1627,6 +1627,8 @@ static void bfq_remove_request(struct request_queue *q, rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } + } else { + bfq_pos_tree_add_move(bfqd, bfqq); } if (rq->cmd_flags & REQ_META) -- cgit v1.2.3-58-ga151 From 1be6e8a964ee9aa8d4daac523ce29e5f486dd756 Mon Sep 17 00:00:00 2001 From: Angelo Ruocco Date: Wed, 20 Dec 2017 12:38:32 +0100 Subject: block, bfq: check low_latency flag in bfq_bfqq_save_state() A just-created bfq_queue will certainly be deemed as interactive on the arrival of its first I/O request, if the low_latency flag is set. Yet, if the queue is merged with another queue on the arrival of its first I/O request, it will not have the chance to be flagged as interactive. Nevertheless, if the queue is then split soon enough, it has to be flagged as interactive after the split. To handle this early-merge scenario correctly, BFQ saves the state of the queue, on the merge, as if the latter had already been deemed interactive. So, if the queue is split soon, it will get weight-raised, because the previous state of the queue is resumed on the split. Unfortunately, in the act of saving the state of the newly-created queue, BFQ doesn't check whether the low_latency flag is set, and this causes early-merged queues to be then weight-raised, on queue splits, even if low_latency is off. This commit addresses this problem by adding the missing check. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fa395a260a23..2cf395daee80 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2064,7 +2064,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && - !bfq_bfqq_in_large_burst(bfqq))) { + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { /* * bfqq being merged right after being created: bfqq * would have deserved interactive weight raising, but -- cgit v1.2.3-58-ga151 From 7b8fa3b900a087bc03b11329a92398fde563ba37 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 20 Dec 2017 12:38:33 +0100 Subject: block, bfq: let a queue be merged only shortly after starting I/O In BFQ and CFQ, two processes are said to be cooperating if they do I/O in such a way that the union of their I/O requests yields a sequential I/O pattern. To get such a sequential I/O pattern out of the non-sequential pattern of each cooperating process, BFQ and CFQ merge the queues associated with these processes. In more detail, cooperating processes, and thus their associated queues, usually start, or restart, to do I/O shortly after each other. This is the case, e.g., for the I/O threads of KVM/QEMU and of the dump utility. Basing on this assumption, this commit allows a bfq_queue to be merged only during a short time interval (100ms) after it starts, or re-starts, to do I/O. This filtering provides two important benefits. First, it greatly reduces the probability that two non-cooperating processes have their queues merged by mistake, if they just happen to do I/O close to each other for a short time interval. These spurious merges cause loss of service guarantees. A low-weight bfq_queue may unjustly get more than its expected share of the throughput: if such a low-weight queue is merged with a high-weight queue, then the I/O for the low-weight queue is served as if the queue had a high weight. This may damage other high-weight queues unexpectedly. For instance, because of this issue, lxterminal occasionally took 7.5 seconds to start, instead of 6.5 seconds, when some sequential readers and writers did I/O in the background on a FUJITSU MHX2300BT HDD. The reason is that the bfq_queues associated with some of the readers or the writers were merged with the high-weight queues of some processes that had to do some urgent but little I/O. The readers then exploited the inherited high weight for all or most of their I/O, during the start-up of terminal. The filtering introduced by this commit eliminated any outlier caused by spurious queue merges in our start-up time tests. This filtering also provides a little boost of the throughput sustainable by BFQ: 3-4%, depending on the CPU. The reason is that, once a bfq_queue cannot be merged any longer, this commit makes BFQ stop updating the data needed to handle merging for the queue. Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 57 ++++++++++++++++++++++++++++++++++++++++++----------- block/bfq-iosched.h | 2 ++ block/bfq-wf2q.c | 4 ++++ 3 files changed, 52 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2cf395daee80..7066d90f09df 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = HZ / 8; +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beggining of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator. After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; + static struct kmem_cache *bfq_pool; /* Below this threshold (in ns), we consider thinktime immediate. */ @@ -444,6 +458,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, return bfqq; } +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ + return bfqq->service_from_backlogged > 0 && + time_is_before_jiffies(bfqq->first_IO_time + + bfq_merge_time_limit); +} + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; @@ -454,6 +475,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->pos_root = NULL; } + /* + * bfqq cannot be merged any longer (see comments in + * bfq_setup_cooperator): no point in adding bfqq into the + * position tree. + */ + if (bfq_too_late_for_merging(bfqq)) + return; + if (bfq_class_idle(bfqq)) return; if (!bfqq->next_rq) @@ -1935,6 +1964,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { + if (bfq_too_late_for_merging(new_bfqq)) + return false; + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || (bfqq->ioprio_class != new_bfqq->ioprio_class)) return false; @@ -2003,6 +2035,20 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* + * Prevent bfqq from being merged if it has been created too + * long ago. The idea is that true cooperating processes, and + * thus their associated bfq_queues, are supposed to be + * created shortly after each other. This is the case, e.g., + * for KVM/QEMU and dump I/O threads. Basing on this + * assumption, the following filtering greatly reduces the + * probability that two non-cooperating processes, which just + * happen to do close I/O for some short time interval, have + * their queues merged by mistake. + */ + if (bfq_too_late_for_merging(bfqq)) + return NULL; + if (bfqq->new_bfqq) return bfqq->new_bfqq; @@ -3002,17 +3048,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, */ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - /* - * Increase service_from_backlogged before next statement, - * because the possible next invocation of - * bfq_bfqq_charge_time would likely inflate - * entity->service. In contrast, service_from_backlogged must - * contain real service, to enable the soft real-time - * heuristic to correctly compute the bandwidth consumed by - * bfqq. - */ - bfqq->service_from_backlogged += entity->service; - /* * As above explained, charge slow (typically seeky) and * timed-out queues with the time and not the service diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 91c4390903a1..5d47b58d5fc8 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -344,6 +344,8 @@ struct bfq_queue { unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ + + unsigned long first_IO_time; /* time of first I/O for this queue */ }; /** diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index e495d3f9b4b0..4456eda34e48 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -835,6 +835,10 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) struct bfq_entity *entity = &bfqq->entity; struct bfq_service_tree *st; + if (!bfqq->service_from_backlogged) + bfqq->first_IO_time = jiffies; + + bfqq->service_from_backlogged += served; for_each_entity(entity) { st = bfq_entity_service_tree(entity); -- cgit v1.2.3-58-ga151 From 4403e4e467c365b4189e3e3d3ad35cf67b8c36ed Mon Sep 17 00:00:00 2001 From: Angelo Ruocco Date: Wed, 20 Dec 2017 12:38:34 +0100 Subject: block, bfq: remove superfluous check in queue-merging setup When two or more processes do I/O in a way that the their requests are sequential in respect to one another, BFQ merges the bfq_queues associated with the processes. This way the overall I/O pattern becomes sequential, and thus there is a boost in througput. These cooperating processes usually start or restart to do I/O shortly after each other. So, in order to avoid merging non-cooperating processes, BFQ ensures that none of these queues has been in weight raising for too long. In this respect, from commit "block, bfq-sq, bfq-mq: let a queue be merged only shortly after being created", BFQ checks whether any queue (and not only weight-raised ones) is doing I/O continuously from too long to be merged. This new additional check makes the first one useless: a queue doing I/O from long enough, if being weight-raised, is also a queue in weight raising for too long to be merged. Accordingly, this commit removes the first check. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7066d90f09df..9625550b2f85 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1990,20 +1990,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, return true; } -/* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ - return bfqq->wr_coeff > 1 && - time_is_before_jiffies(bfqq->last_wr_start_finish + - msecs_to_jiffies(100)); -} - /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return @@ -2017,11 +2003,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - * * WARNING: queue merging may impair fairness among non-weight raised * queues, for at least two reasons: 1) the original weight of a * merged queue may change during the merged state, 2) even being the @@ -2052,9 +2033,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || - wr_from_too_long(bfqq) || - unlikely(bfqq == &bfqd->oom_bfqq)) + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; /* If there is only one backlogged queue, don't search. */ @@ -2063,12 +2042,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, in_service_bfqq = bfqd->in_service_queue; - if (!in_service_bfqq || in_service_bfqq == bfqq - || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + if (in_service_bfqq && in_service_bfqq != bfqq && + likely(in_service_bfqq != &bfqd->oom_bfqq) && + bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && bfqq->entity.parent == in_service_bfqq->entity.parent && bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2080,12 +2056,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ -check_scheduled: new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, bfq_io_struct_pos(io_struct, request)); - if (new_bfqq && !wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); -- cgit v1.2.3-58-ga151 From a34b024448eb71b0e51ad011fa1862236e366034 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 15 Dec 2017 07:23:12 +0100 Subject: block, bfq: consider also past I/O in soft real-time detection BFQ privileges the I/O of soft real-time applications, such as video players, to guarantee to these application a high bandwidth and a low latency. In this respect, it is not easy to correctly detect when an application is soft real-time. A particularly nasty false positive is that of an I/O-bound application that occasionally happens to meet all requirements to be deemed as soft real-time. After being detected as soft real-time, such an application monopolizes the device. Fortunately, BFQ will realize soon that the application is actually not soft real-time and suspend every privilege. Yet, the application may happen again to be wrongly detected as soft real-time, and so on. As highlighted by our tests, this problem causes BFQ to occasionally fail to guarantee a high responsiveness, in the presence of heavy background I/O workloads. The reason is that the background workload happens to be detected as soft real-time, more or less frequently, during the execution of the interactive task under test. To give an idea, because of this problem, Libreoffice Writer occasionally takes 8 seconds, instead of 3, to start up, if there are sequential reads and writes in the background, on a Kingston SSDNow V300. This commit addresses this issue by leveraging the following facts. The reason why some applications are detected as soft real-time despite all BFQ checks to avoid false positives, is simply that, during high CPU or storage-device load, I/O-bound applications may happen to do I/O slowly enough to meet all soft real-time requirements, and pass all BFQ extra checks. Yet, this happens only for limited time periods: slow-speed time intervals are usually interspersed between other time intervals during which these applications do I/O at a very high speed. To exploit these facts, this commit introduces a little change, in the detection of soft real-time behavior, to systematically consider also the recent past: the higher the speed was in the recent past, the later next I/O should arrive for the application to be considered as soft real-time. At the beginning of a slow-speed interval, the minimum arrival time allowed for the next I/O usually happens to still be so high, to fall *after* the end of the slow-speed period itself. As a consequence, the application does not risk to be deemed as soft real-time during the slow-speed interval. Then, during the next high-speed interval, the application cannot, evidently, be deemed as soft real-time (exactly because of its speed), and so on. This extra filtering proved to be rather effective: in the above test, the frequency of false positives became so low that the start-up time was 3 seconds in all iterations (apart from occasional outliers, caused by page-cache-management issues, which are out of the scope of this commit, and cannot be solved by an I/O scheduler). Tested-by: Lee Tibbert Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 115 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9625550b2f85..e33c5c4c9856 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2940,45 +2940,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, * whereas soft_rt_next_start is set to infinity for applications that do * not. * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - * HZ=100. + * (a) Current time plus: (1) the maximum time for which the arrival + * of a request is waited for when a sync queue becomes idle, + * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We + * postpone for a moment the reason for adding a few extra + * jiffies; we get back to it after next item (b). Lower-bounding + * the return value of this function with the current time plus + * bfqd->bfq_slice_idle tends to filter out greedy applications, + * because the latter issue their next request as soon as possible + * after the last one has been completed. In contrast, a soft + * real-time application spends some time processing data, after a + * batch of its requests has been completed. + * + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + * above, greedy applications may happen to meet both the + * bandwidth and isochrony requirements under heavy CPU or + * storage-device load. In more detail, in these scenarios, these + * applications happen, only for limited time periods, to do I/O + * slowly enough to meet all the requirements described so far, + * including the filtering in above item (a). These slow-speed + * time intervals are usually interspersed between other time + * intervals during which these applications do I/O at a very high + * speed. Fortunately, exactly because of the high speed of the + * I/O in the high-speed intervals, the values returned by this + * function happen to be so high, near the end of any such + * high-speed interval, to be likely to fall *after* the end of + * the low-speed time interval that follows. These high values are + * stored in bfqq->soft_rt_next_start after each invocation of + * this function. As a consequence, if the last value of + * bfqq->soft_rt_next_start is constantly used to lower-bound the + * next value that this function may return, then, from the very + * beginning of a low-speed interval, bfqq->soft_rt_next_start is + * likely to be constantly kept so high that any I/O request + * issued during the low-speed interval is considered as arriving + * to soon for the application to be deemed as soft + * real-time. Then, in the high-speed interval that follows, the + * application will not be deemed as soft real-time, just because + * it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + * devices with HZ=100. The time granularity may be so coarse + * that the approximation, in jiffies, of bfqd->bfq_slice_idle + * is rather lower than the exact value. * 2) jiffies, instead of increasing at a constant rate, may stop increasing * for a while, then suddenly 'jump' by several units to recover the lost * increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines. */ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); + return max3(bfqq->soft_rt_next_start, + bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } /** @@ -4014,10 +4056,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->split_time = bfq_smallest_from_now(); /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. + * To not forget the possibly high bandwidth consumed by a + * process/queue in the recent past, + * bfq_bfqq_softrt_next_start() returns a value at least equal + * to the current value of bfqq->soft_rt_next_start (see + * comments on bfq_bfqq_softrt_next_start). Set + * soft_rt_next_start to now, to mean that bfqq has consumed + * no bandwidth so far. */ - bfqq->soft_rt_next_start = bfq_greatest_from_now(); + bfqq->soft_rt_next_start = jiffies; /* first request is almost certainly seeky */ bfqq->seek_history = 1; -- cgit v1.2.3-58-ga151 From 9b25bd0368d562d1929059e8eb9de4102567b923 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 4 Dec 2017 11:42:05 +0100 Subject: block, bfq: remove batches of confusing ifdefs Commit a33801e8b473 ("block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP") introduced two batches of confusing ifdefs: one reported in [1], plus a similar one in another function. This commit removes both batches, in the way suggested in [1]. [1] https://www.spinics.net/lists/linux-block/msg20043.html Fixes: a33801e8b473 ("block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP") Reported-by: Linus Torvalds Tested-by: Luca Miccio Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 127 +++++++++++++++++++++++++++++----------------------- 1 file changed, 72 insertions(+), 55 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e33c5c4c9856..7bd789da7a29 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3743,35 +3743,16 @@ exit: return rq; } -static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; -#endif - - spin_lock_irq(&bfqd->lock); - -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - - rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - -#else - rq = __bfq_dispatch_request(hctx); -#endif - spin_unlock_irq(&bfqd->lock); +static void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) +{ + struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) - return rq; + return; /* * rq and bfqq are guaranteed to exist until this function @@ -3786,7 +3767,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(hctx->queue->queue_lock); + spin_lock_irq(q->queue_lock); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -3805,9 +3786,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(hctx->queue->queue_lock); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) {} #endif +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled; + + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + + rq = __bfq_dispatch_request(hctx); + + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + + spin_unlock_irq(&bfqd->lock); + + bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, + idle_timer_disabled); + return rq; } @@ -4335,16 +4344,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) return idle_timer_disabled; } +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) +{ + if (!bfqq) + return; + + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instructions. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) {} +#endif + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *bfqq = RQ_BFQQ(rq); bool idle_timer_disabled = false; unsigned int cmd_flags; -#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4363,7 +4402,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4371,9 +4409,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * redirected into a new queue. */ bfqq = RQ_BFQQ(rq); -#else - __bfq_insert_request(bfqd, rq); -#endif if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4382,35 +4417,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request * merge). */ cmd_flags = rq->cmd_flags; -#endif + spin_unlock_irq(&bfqd->lock); -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* - * bfqq still exists, because it can disappear only after - * either it is merged with another queue, or the process it - * is associated with exits. But both actions must be taken by - * the same process currently executing this flow of - * instruction. - * - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. - */ - spin_lock_irq(q->queue_lock); - bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); - if (idle_timer_disabled) - bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(q->queue_lock); -#endif + bfq_update_insert_stats(q, bfqq, idle_timer_disabled, + cmd_flags); } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3-58-ga151 From 7891f05cbf4944a5436491d66de2be7533089aea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:06 +0800 Subject: block: bounce: avoid direct access to bvec table We will support multipage bvecs in the future, so change to iterator way for getting bv_page of bvec from original bio. Cc: Matthew Wilcox Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bounce.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index fceb1a96480b..0274c31d6c05 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -137,21 +137,20 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; + struct bio_vec *bvec, orig_vec; int i; - int start = bio_orig->bi_iter.bi_idx; + struct bvec_iter orig_iter = bio_orig->bi_iter; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i + start; - - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + orig_vec = bio_iter_iovec(bio_orig, orig_iter); + if (bvec->bv_page != orig_vec.bv_page) { + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } bio_orig->bi_status = bio->bi_status; -- cgit v1.2.3-58-ga151 From 3c892a098b0bfa3e571f1f0d2a7e72fbaeea691a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:07 +0800 Subject: block: bounce: don't access bio->bi_io_vec in copy_to_high_bio_irq Firstly this patch introduces BVEC_ITER_ALL_INIT for iterating one bio from start to end. As we need to support multipage bvecs, don't access bio->bi_io_vec in copy_to_high_bio_irq(), and just use the standard iterator for that. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bounce.c | 16 +++++++++++----- include/linux/bvec.h | 9 +++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index 0274c31d6c05..c35a3d7f0528 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -113,24 +113,30 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bio_vec tovec, fromvec; struct bvec_iter iter; + /* + * The bio of @from is created by bounce, so we can iterate + * its bvec from start to end, but the @from->bi_iter can't be + * trusted because it might be changed by splitting. + */ + struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { + fromvec = bio_iter_iovec(from, from_iter); + if (tovec.bv_page != fromvec.bv_page) { /* * fromvec->bv_offset and fromvec->bv_len might have * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec->bv_page) + + vfrom = page_address(fromvec.bv_page) + tovec.bv_offset; bounce_copy_vec(&tovec, vfrom); flush_dcache_page(tovec.bv_page); } - - fromvec++; + bio_advance_iter(from, &from_iter, tovec.bv_len); } } diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ec8a4d7af6bd..fe7a22dd133b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) +/* for iterating one bio from start to end */ +#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ +{ \ + .bi_sector = 0, \ + .bi_size = UINT_MAX, \ + .bi_idx = 0, \ + .bi_bvec_done = 0, \ +} + #endif /* __LINUX_BVEC_ITER_H */ -- cgit v1.2.3-58-ga151 From 25d8be77e19224d8f21b363d77b5283c5dc21a57 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:10 +0800 Subject: block: move bio_alloc_pages() to bcache bcache is the only user of bio_alloc_pages(), so move this function into bcache, and avoid it being misused in the future. Also rename it to bch_bio_allo_pages() since it is bcache only. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 28 ---------------------------- drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/util.c | 27 +++++++++++++++++++++++++++ drivers/md/bcache/util.h | 1 + drivers/md/bcache/writeback.c | 2 +- include/linux/bio.h | 1 - 9 files changed, 33 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8bfdea58159b..fe1efbeaf4aa 100644 --- a/block/bio.c +++ b/block/bio.c @@ -968,34 +968,6 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); -/** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - /** * bio_copy_data - copy contents of data buffers from one chain of bios to * another diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 02a4cf646fdc..ebb1874218e7 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c7a02c4900da..879ab21074c6 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) return; check->bi_opf = REQ_OP_READ; - if (bio_alloc_pages(check, GFP_NOIO)) + if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; submit_bio_wait(check); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d50c1c97da68..a24c3a95b2c0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c) bio_set_op_attrs(bio, REQ_OP_READ, 0); bio->bi_end_io = read_moving_endio; - if (bio_alloc_pages(bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 643c3021624f..c493fb947dc9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -841,7 +841,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); - if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; if (reada) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 61813d230015..a23cd6a14b74 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -283,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } +/** + * bch_bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ed5e8a412eb8..4df4c5c1cab2 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch_bio_map(struct bio *bio, void *base); +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); static inline sector_t bdev_sectors(struct block_device *bdev) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 56a37884ca8b..1ac2af6128b1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -278,7 +278,7 @@ static void read_dirty(struct cached_dev *dc) bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); io->bio.bi_end_io = read_dirty_endio; - if (bio_alloc_pages(&io->bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; trace_bcache_writeback(&w->key); diff --git a/include/linux/bio.h b/include/linux/bio.h index 435ddf04e889..367a979fd4a6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -500,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, -- cgit v1.2.3-58-ga151 From 6a501bf0807b5dc024fe52a4f956800a352c39ab Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:14 +0800 Subject: blk-merge: compute bio->bi_seg_front_size efficiently It is enough to check and compute bio->bi_seg_front_size just after the 1st segment is found, but current code checks that for each bvec, which is inefficient. This patch follows the way in __blk_recalc_rq_segments() for computing bio->bi_seg_front_size, and it is more efficient and code becomes more readable too. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index f5dedd57dff6..a476337a8ff4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -146,22 +146,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bvprvp = &bvprv; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; continue; } new_segment: if (nsegs == queue_max_segments(q)) goto split; + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + nsegs++; bvprv = bv; bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; } do_split = false; @@ -174,6 +173,8 @@ split: bio = new; } + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; bio->bi_seg_front_size = front_seg_size; if (seg_size > bio->bi_seg_back_size) bio->bi_seg_back_size = seg_size; -- cgit v1.2.3-58-ga151 From a2d37968d784363842f87820a21e106741d28004 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:15 +0800 Subject: block: blk-merge: try to make front segments in full size When merging one bvec into segment, if the bvec is too big to merge, current policy is to move the whole bvec into another new segment. This patchset changes the policy into trying to maximize size of front segments, that means in above situation, part of bvec is merged into current segment, and the remainder is put into next segment. This patch prepares for support multipage bvec because it can be quite common to see this case and we should try to make front segments in full size. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index a476337a8ff4..ca2e7aec8e77 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -109,6 +109,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); + unsigned advance = 0; bio_for_each_segment(bv, bio, iter) { /* @@ -134,12 +135,32 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, } if (bvprvp && blk_queue_cluster(q)) { - if (seg_size + bv.bv_len > queue_max_segment_size(q)) - goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) goto new_segment; + if (seg_size + bv.bv_len > queue_max_segment_size(q)) { + /* + * One assumption is that initial value of + * @seg_size(equals to bv.bv_len) won't be + * bigger than max segment size, but this + * becomes false after multipage bvecs. + */ + advance = queue_max_segment_size(q) - seg_size; + + if (advance > 0) { + seg_size += advance; + sectors += advance >> 9; + bv.bv_len -= advance; + bv.bv_offset += advance; + } + + /* + * Still need to put remainder of current + * bvec into a new segment. + */ + goto new_segment; + } seg_size += bv.bv_len; bvprv = bv; @@ -161,6 +182,12 @@ new_segment: seg_size = bv.bv_len; sectors += bv.bv_len >> 9; + /* restore the bvec for iterator */ + if (advance) { + bv.bv_len += advance; + bv.bv_offset -= advance; + advance = 0; + } } do_split = false; @@ -361,16 +388,29 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, { int nbytes = bvec->bv_len; + unsigned advance = 0; if (*sg && *cluster) { - if ((*sg)->length + nbytes > queue_max_segment_size(q)) - goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) goto new_segment; + /* + * try best to merge part of the bvec into previous + * segment and follow same policy with + * blk_bio_segment_split() + */ + if ((*sg)->length + nbytes > queue_max_segment_size(q)) { + advance = queue_max_segment_size(q) - (*sg)->length; + if (advance) { + (*sg)->length += advance; + bvec->bv_offset += advance; + bvec->bv_len -= advance; + } + goto new_segment; + } + (*sg)->length += nbytes; } else { new_segment: @@ -393,6 +433,10 @@ new_segment: sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); (*nsegs)++; + + /* for making iterator happy */ + bvec->bv_offset -= advance; + bvec->bv_len += advance; } *bvprv = *bvec; } -- cgit v1.2.3-58-ga151 From cf8c0c6a3830583bd0e7c94933e155bf97cd162b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:16 +0800 Subject: block: blk-merge: remove unnecessary check In this case, 'sectors' can't be zero at all, so remove the check and let the bio be split. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index ca2e7aec8e77..446f63e076aa 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -129,9 +129,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, nsegs++; sectors = max_sectors; } - if (sectors) - goto split; - /* Make this single bvec as the 1st segment */ + goto split; } if (bvprvp && blk_queue_cluster(q)) { -- cgit v1.2.3-58-ga151 From ca11f209a4c88743fb4b652fd812470e6fecc598 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 6 Jan 2018 09:23:11 -0700 Subject: mq-deadline: make it clear that __dd_dispatch_request() works on all hw queues Don't pass in the hardware queue to __dd_dispatch_request(), since it leads the reader to believe that we are returning a request for that specific hardware queue. That's not how mq-deadline works, the state for determining which request to serve next is shared across all hardware queues for a device. Reviewed-by: Omar Sandoval Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/mq-deadline.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index d56972e8ebda..c56f211c8440 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -267,9 +267,8 @@ deadline_next_request(struct deadline_data *dd, int data_dir) * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ -static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +static struct request *__dd_dispatch_request(struct deadline_data *dd) { - struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq, *next_rq; bool reads, writes; int data_dir; @@ -372,13 +371,19 @@ done: return rq; } +/* + * One confusing aspect here is that we get called for a specific + * hardware queue, but we return a request that may not be for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. + */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; spin_lock(&dd->lock); - rq = __dd_dispatch_request(hctx); + rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); return rq; -- cgit v1.2.3-58-ga151 From c2856ae2f315d754a0b6a268e4c6745b332b42e7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 6 Jan 2018 16:27:37 +0800 Subject: blk-mq: quiesce queue before freeing queue After queue is frozen, dispatch still may happen, for example: 1) requests are submitted from several contexts 2) requests from all these contexts are inserted to queue, but may dispatch to LLD in one of these paths, but other paths sill need to move on even all these requests are completed(that means blk_mq_freeze_queue_wait() returns at that time) 3) dispatch after queue freezing still moves on and causes use-after-free, because request queue is freed This patch quiesces queue after it is frozen, and makes sure all in-progress dispatch are completed. This patch fixes the following kernel crash when running heavy IOs vs. deleting device: [ 36.719251] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 36.720318] IP: kyber_has_work+0x14/0x40 [ 36.720847] PGD 254bf5067 P4D 254bf5067 PUD 255e6a067 PMD 0 [ 36.721584] Oops: 0000 [#1] PREEMPT SMP [ 36.722105] Dumping ftrace buffer: [ 36.722570] (ftrace buffer empty) [ 36.723057] Modules linked in: scsi_debug ebtable_filter ebtables ip6table_filter ip6_tables tcm_loop iscsi_target_mod target_core_file target_core_iblock target_core_pscsi target_core_mod xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack libcrc32c bridge stp llc fuse iptable_filter ip_tables sd_mod sg btrfs xor zstd_decompress zstd_compress xxhash raid6_pq mptsas mptscsih bcache crc32c_intel ahci mptbase libahci serio_raw scsi_transport_sas nvme libata shpchp lpc_ich virtio_scsi nvme_core binfmt_misc dm_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi null_blk configs [ 36.733438] CPU: 2 PID: 2374 Comm: fio Not tainted 4.15.0-rc2.blk_mq_quiesce+ #714 [ 36.735143] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014 [ 36.736688] RIP: 0010:kyber_has_work+0x14/0x40 [ 36.737515] RSP: 0018:ffffc9000209bca0 EFLAGS: 00010202 [ 36.738431] RAX: 0000000000000008 RBX: ffff88025578bfc8 RCX: ffff880257bf4ed0 [ 36.739581] RDX: 0000000000000038 RSI: ffffffff81a98c6d RDI: ffff88025578bfc8 [ 36.740730] RBP: ffff880253cebfc8 R08: ffffc9000209bda0 R09: ffff8802554f3480 [ 36.741885] R10: ffffc9000209be60 R11: ffff880263f72538 R12: ffff88025573e9e8 [ 36.743036] R13: ffff88025578bfd0 R14: 0000000000000001 R15: 0000000000000000 [ 36.744189] FS: 00007f9b9bee67c0(0000) GS:ffff88027fc80000(0000) knlGS:0000000000000000 [ 36.746617] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 36.748483] CR2: 0000000000000008 CR3: 0000000254bf4001 CR4: 00000000003606e0 [ 36.750164] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 36.751455] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 36.752796] Call Trace: [ 36.753992] blk_mq_do_dispatch_sched+0x7f/0xe0 [ 36.755110] blk_mq_sched_dispatch_requests+0x119/0x190 [ 36.756179] __blk_mq_run_hw_queue+0x83/0x90 [ 36.757144] __blk_mq_delay_run_hw_queue+0xaf/0x110 [ 36.758046] blk_mq_run_hw_queue+0x24/0x70 [ 36.758845] blk_mq_flush_plug_list+0x1e7/0x270 [ 36.759676] blk_flush_plug_list+0xd6/0x240 [ 36.760463] blk_finish_plug+0x27/0x40 [ 36.761195] do_io_submit+0x19b/0x780 [ 36.761921] ? entry_SYSCALL_64_fastpath+0x1a/0x7d [ 36.762788] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 36.763639] RIP: 0033:0x7f9b9699f697 [ 36.764352] RSP: 002b:00007ffc10f991b8 EFLAGS: 00000206 ORIG_RAX: 00000000000000d1 [ 36.765773] RAX: ffffffffffffffda RBX: 00000000008f6f00 RCX: 00007f9b9699f697 [ 36.766965] RDX: 0000000000a5e6c0 RSI: 0000000000000001 RDI: 00007f9b8462a000 [ 36.768377] RBP: 0000000000000000 R08: 0000000000000001 R09: 00000000008f6420 [ 36.769649] R10: 00007f9b846e5000 R11: 0000000000000206 R12: 00007f9b795d6a70 [ 36.770807] R13: 00007f9b795e4140 R14: 00007f9b795e3fe0 R15: 0000000100000000 [ 36.771955] Code: 83 c7 10 e9 3f 68 d1 ff 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 8b 97 b0 00 00 00 48 8d 42 08 48 83 c2 38 <48> 3b 00 74 06 b8 01 00 00 00 c3 48 3b 40 08 75 f4 48 83 c0 10 [ 36.775004] RIP: kyber_has_work+0x14/0x40 RSP: ffffc9000209bca0 [ 36.776012] CR2: 0000000000000008 [ 36.776690] ---[ end trace 4045cbce364ff2a4 ]--- [ 36.777527] Kernel panic - not syncing: Fatal exception [ 36.778526] Dumping ftrace buffer: [ 36.779313] (ftrace buffer empty) [ 36.780081] Kernel Offset: disabled [ 36.780877] ---[ end Kernel panic - not syncing: Fatal exception Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Tested-by: Yi Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e6e5bbc4c366..2e0d041e2daf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -694,6 +694,15 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* + * make sure all in-progress dispatch are completed because + * blk_freeze_queue() can only complete all requests, and + * dispatch may still be in-progress since we dispatch requests + * from more than one contexts + */ + if (q->mq_ops) + blk_mq_quiesce_queue(q); + /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); -- cgit v1.2.3-58-ga151 From 24f5a90f0d13a97b51aa79f468143fafea4246bb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 6 Jan 2018 16:27:38 +0800 Subject: blk-mq: quiesce queue during switching io sched and updating nr_requests Dispatch may still be in-progress after queue is frozen, so we have to quiesce queue before switching IO scheduler and updating nr_requests. Also when switching io schedulers, blk_mq_run_hw_queue() may still be called somewhere(such as from nvme_reset_work()), and io scheduler's per-hctx data may not be setup yet, so cause oops even inside blk_mq_hctx_has_pending(), such as it can be run just between: ret = e->ops.mq.init_sched(q, e); AND ret = e->ops.mq.init_hctx(hctx, i) inside blk_mq_init_sched(). This reverts commit 7a148c2fcff8330(block: don't call blk_mq_quiesce_queue() after queue is frozen) basically, and makes sure blk_mq_hctx_has_pending won't be called if queue is quiesced. Reviewed-by: Christoph Hellwig Fixes: 7a148c2fcff83309(block: don't call blk_mq_quiesce_queue() after queue is frozen) Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 27 ++++++++++++++++++++++++++- block/elevator.c | 2 ++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 11097477eeab..1c66c319325c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1285,7 +1285,30 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (blk_mq_hctx_has_pending(hctx)) { + int srcu_idx; + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even __blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); + } + + if (need_run) { __blk_mq_delay_run_hw_queue(hctx, async, 0); return true; } @@ -2710,6 +2733,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return -EINVAL; blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { @@ -2733,6 +2757,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; diff --git a/block/elevator.c b/block/elevator.c index 7bda083d5968..138faeb08a7c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -968,6 +968,7 @@ static int elevator_switch_mq(struct request_queue *q, int ret; blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); if (q->elevator) { if (q->elevator->registered) @@ -994,6 +995,7 @@ static int elevator_switch_mq(struct request_queue *q, blk_add_trace_msg(q, "elv switch: none"); out: + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; } -- cgit v1.2.3-58-ga151 From 7d4901a90d02500c8011472a060f9b2e60e6e605 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 6 Jan 2018 16:27:39 +0800 Subject: blk-mq: avoid to map CPU into stale hw queue blk_mq_pci_map_queues() may not map one CPU into any hw queue, but its previous map isn't cleared yet, and may point to one stale hw queue index. This patch fixes the following issue by clearing the mapping table before setting it up in blk_mq_pci_map_queues(). This patches fixes this following issue reported by Zhang Yi: [ 101.202734] BUG: unable to handle kernel NULL pointer dereference at 0000000094d3013f [ 101.211487] IP: blk_mq_map_swqueue+0xbc/0x200 [ 101.216346] PGD 0 P4D 0 [ 101.219171] Oops: 0000 [#1] SMP [ 101.222674] Modules linked in: sunrpc ipmi_ssif vfat fat intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate intel_uncore mxm_wmi intel_rapl_perf iTCO_wdt ipmi_si ipmi_devintf pcspkr iTCO_vendor_support sg dcdbas ipmi_msghandler wmi mei_me lpc_ich shpchp mei acpi_power_meter dm_multipath ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ahci libahci crc32c_intel libata tg3 nvme nvme_core megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod [ 101.284881] CPU: 0 PID: 504 Comm: kworker/u25:5 Not tainted 4.15.0-rc2 #1 [ 101.292455] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017 [ 101.301001] Workqueue: nvme-wq nvme_reset_work [nvme] [ 101.306636] task: 00000000f2c53190 task.stack: 000000002da874f9 [ 101.313241] RIP: 0010:blk_mq_map_swqueue+0xbc/0x200 [ 101.318681] RSP: 0018:ffffc9000234fd70 EFLAGS: 00010282 [ 101.324511] RAX: ffff88047ffc9480 RBX: ffff88047e130850 RCX: 0000000000000000 [ 101.332471] RDX: ffffe8ffffd40580 RSI: ffff88047e509b40 RDI: ffff88046f37a008 [ 101.340432] RBP: 000000000000000b R08: ffff88046f37a008 R09: 0000000011f94280 [ 101.348392] R10: ffff88047ffd4d00 R11: 0000000000000000 R12: ffff88046f37a008 [ 101.356353] R13: ffff88047e130f38 R14: 000000000000000b R15: ffff88046f37a558 [ 101.364314] FS: 0000000000000000(0000) GS:ffff880277c00000(0000) knlGS:0000000000000000 [ 101.373342] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 101.379753] CR2: 0000000000000098 CR3: 000000047f409004 CR4: 00000000001606f0 [ 101.387714] Call Trace: [ 101.390445] blk_mq_update_nr_hw_queues+0xbf/0x130 [ 101.395791] nvme_reset_work+0x6f4/0xc06 [nvme] [ 101.400848] ? pick_next_task_fair+0x290/0x5f0 [ 101.405807] ? __switch_to+0x1f5/0x430 [ 101.409988] ? put_prev_entity+0x2f/0xd0 [ 101.414365] process_one_work+0x141/0x340 [ 101.418836] worker_thread+0x47/0x3e0 [ 101.422921] kthread+0xf5/0x130 [ 101.426424] ? rescuer_thread+0x380/0x380 [ 101.430896] ? kthread_associate_blkcg+0x90/0x90 [ 101.436048] ret_from_fork+0x1f/0x30 [ 101.440034] Code: 48 83 3c ca 00 0f 84 2b 01 00 00 48 63 cd 48 8b 93 10 01 00 00 8b 0c 88 48 8b 83 20 01 00 00 4a 03 14 f5 60 04 af 81 48 8b 0c c8 <48> 8b 81 98 00 00 00 f0 4c 0f ab 30 8b 81 f8 00 00 00 89 42 44 [ 101.461116] RIP: blk_mq_map_swqueue+0xbc/0x200 RSP: ffffc9000234fd70 [ 101.468205] CR2: 0000000000000098 [ 101.471907] ---[ end trace 5fe710f98228a3ca ]--- [ 101.482489] Kernel panic - not syncing: Fatal exception [ 101.488505] Kernel Offset: disabled [ 101.497752] ---[ end Kernel panic - not syncing: Fatal exception Reviewed-by: Christoph Hellwig Suggested-by: Christoph Hellwig Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c66c319325c..dd21051fb251 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2622,9 +2622,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) + if (set->ops->map_queues) { + int cpu; + /* + * transport .map_queues is usually done in the following + * way: + * + * for (queue = 0; queue < set->nr_hw_queues; queue++) { + * mask = get_cpu_mask(queue) + * for_each_cpu(cpu, mask) + * set->mq_map[cpu] = queue; + * } + * + * When we need to remap, the table has to be cleared for + * killing stale mapping since one CPU may not be mapped + * to any hw queue. + */ + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return set->ops->map_queues(set); - else + } else return blk_mq_map_queues(set); } -- cgit v1.2.3-58-ga151 From fb350e0ad99359768e1e80b4784692031ec340e4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 6 Jan 2018 16:27:40 +0800 Subject: blk-mq: fix race between updating nr_hw_queues and switching io sched In both elevator_switch_mq() and blk_mq_update_nr_hw_queues(), sched tags can be allocated, and q->nr_hw_queue is used, and race is inevitable, for example: blk_mq_init_sched() may trigger use-after-free on hctx, which is freed in blk_mq_realloc_hw_ctxs() when nr_hw_queues is decreased. This patch fixes the race be holding q->sysfs_lock. Reviewed-by: Christoph Hellwig Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index dd21051fb251..111e1aa5562f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2407,6 +2407,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; blk_mq_sysfs_unregister(q); + + /* protect against switching io scheduler */ + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int node; @@ -2451,6 +2454,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } } q->nr_hw_queues = i; + mutex_unlock(&q->sysfs_lock); blk_mq_sysfs_register(q); } -- cgit v1.2.3-58-ga151 From 8ab0b7dc73e1b3e2987d42554b2bff503f692772 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Jan 2018 21:28:29 +0800 Subject: blk-mq: fix kernel oops in blk_mq_tag_idle() HW queues may be unmapped in some cases, such as blk_mq_update_nr_hw_queues(), then we need to check it before calling blk_mq_tag_idle(), otherwise the following kernel oops can be triggered, so fix it by checking if the hw queue is unmapped since it doesn't make sense to idle the tags any more after hw queues are unmapped. [ 440.771298] Workqueue: nvme-wq nvme_rdma_del_ctrl_work [nvme_rdma] [ 440.779104] task: ffff894bae755ee0 ti: ffff893bf9bc8000 task.ti: ffff893bf9bc8000 [ 440.788359] RIP: 0010:[] [] __blk_mq_tag_idle+0x24/0x40 [ 440.798697] RSP: 0018:ffff893bf9bcbd10 EFLAGS: 00010286 [ 440.805538] RAX: 0000000000000000 RBX: ffff895bb131dc00 RCX: 000000000000011f [ 440.814426] RDX: 00000000ffffffff RSI: 0000000000000120 RDI: ffff895bb131dc00 [ 440.823301] RBP: ffff893bf9bcbd10 R08: 000000000001b860 R09: 4a51d361c00c0000 [ 440.832193] R10: b5907f32b4cc7003 R11: ffffd6cabfb57000 R12: ffff894bafd1e008 [ 440.841091] R13: 0000000000000001 R14: ffff895baf770000 R15: 0000000000000080 [ 440.849988] FS: 0000000000000000(0000) GS:ffff894bbdcc0000(0000) knlGS:0000000000000000 [ 440.859955] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 440.867274] CR2: 0000000000000008 CR3: 000000103d098000 CR4: 00000000001407e0 [ 440.876169] Call Trace: [ 440.879818] [] blk_mq_exit_hctx+0xd8/0xe0 [ 440.887051] [] blk_mq_free_queue+0xf0/0x160 [ 440.894465] [] blk_cleanup_queue+0xd9/0x150 [ 440.901881] [] nvme_ns_remove+0x5b/0xb0 [nvme_core] [ 440.910068] [] nvme_remove_namespaces+0x3b/0x60 [nvme_core] [ 440.919026] [] __nvme_rdma_remove_ctrl+0x2b/0xb0 [nvme_rdma] [ 440.928079] [] nvme_rdma_del_ctrl_work+0x17/0x20 [nvme_rdma] [ 440.937126] [] process_one_work+0x17a/0x440 [ 440.944517] [] worker_thread+0x278/0x3c0 [ 440.951607] [] ? manage_workers.isra.24+0x2a0/0x2a0 [ 440.959760] [] kthread+0xcf/0xe0 [ 440.966055] [] ? insert_kthread_work+0x40/0x40 [ 440.973715] [] ret_from_fork+0x58/0x90 [ 440.980586] [] ? insert_kthread_work+0x40/0x40 [ 440.988229] Code: 5b 41 5c 5d c3 66 90 0f 1f 44 00 00 48 8b 87 20 01 00 00 f0 0f ba 77 40 01 19 d2 85 d2 75 08 c3 0f 1f 80 00 00 00 00 55 48 89 e5 ff 48 08 48 8d 78 10 e8 7f 0f 05 00 5d c3 0f 1f 00 66 2e 0f [ 441.011620] RIP [] __blk_mq_tag_idle+0x24/0x40 [ 441.019301] RSP [ 441.024052] CR2: 0000000000000008 Reported-by: Zhang Yi Tested-by: Zhang Yi Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 111e1aa5562f..e258ad8dc171 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2015,7 +2015,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, { blk_mq_debugfs_unregister_hctx(hctx); - blk_mq_tag_idle(hctx); + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); -- cgit v1.2.3-58-ga151 From 52257ffbfcaf58d247b13fb148e27ed17c33e526 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 9 Jan 2018 10:27:58 +0100 Subject: block, bfq: put async queues for root bfq groups too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each pair [device for which bfq is selected as I/O scheduler, group in blkio/io], bfq maintains a corresponding bfq group. Each such bfq group contains a set of async queues, with each async queue created on demand, i.e., when some I/O request arrives for it. On creation, an async queue gets an extra reference, to make sure that the queue is not freed as long as its bfq group exists. Accordingly, to allow the queue to be freed after the group exited, this extra reference must released on group exit. The above holds also for a bfq root group, i.e., for the bfq group corresponding to the root blkio/io root for a given device. Yet, by mistake, the references to the existing async queues of a root group are not released when the latter exits. This causes a memory leak when the instance of bfq for a given device exits. In a similar vein, bfqg_stats_xfer_dead is not executed for a root group. This commit fixes bfq_pd_offline so that the latter executes the above missing operations for a root group too. Reported-by: Holger Hoffstätte Reported-by: Guoqing Jiang Tested-by: Holger Hoffstätte Signed-off-by: Davide Ferrari Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index da1525ec4c87..d819dc77fe65 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) unsigned long flags; int i; + spin_lock_irqsave(&bfqd->lock, flags); + if (!entity) /* root group */ - return; + goto put_async_queues; - spin_lock_irqsave(&bfqd->lock, flags); /* * Empty all service_trees belonging to this group before * deactivating the group itself. @@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) } __bfq_deactivate_entity(entity, false); + +put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); -- cgit v1.2.3-58-ga151 From 0d52af590552473666da5b6111e7182d6cd23f92 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 9 Jan 2018 10:27:59 +0100 Subject: block, bfq: release oom-queue ref to root group on exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On scheduler init, a reference to the root group, and a reference to its corresponding blkg are taken for the oom queue. Yet these references are not released on scheduler exit, which prevents these objects from be freed. This commit adds the missing reference releases. Reported-by: Davide Ferrari Tested-by: Holger Hoffstätte Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7bd789da7a29..7c0b7f60811c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4893,6 +4893,9 @@ static void bfq_exit_queue(struct elevator_queue *e) hrtimer_cancel(&bfqd->idle_slice_timer); + /* release oom-queue reference to root group */ + bfqg_and_blkg_put(bfqd->root_group); + #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else -- cgit v1.2.3-58-ga151 From 04ced159cec863f9bc27015d6b970bb13cfa6176 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 08:29:46 -0800 Subject: blk-mq: move hctx lock/unlock into a helper Move the RCU vs SRCU logic into lock/unlock helpers, which makes the actual functional bits within the locked region much easier to read. tj: Reordered in front of timeout revamp patches and added the missing blk_mq_run_hw_queue() conversion. Signed-off-by: Jens Axboe Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-mq.c | 66 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index e258ad8dc171..bd7c47eb2923 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -557,6 +557,22 @@ static void __blk_mq_complete_request(struct request *rq) put_cpu(); } +static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + rcu_read_unlock(); + else + srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); +} + +static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + rcu_read_lock(); + else + *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -1214,17 +1230,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ WARN_ON_ONCE(in_interrupt()); - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); - rcu_read_unlock(); - } else { - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + blk_mq_sched_dispatch_requests(hctx); + hctx_unlock(hctx, srcu_idx); } /* @@ -1296,17 +1306,10 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - rcu_read_unlock(); - } else { - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + hctx_unlock(hctx, srcu_idx); if (need_run) { __blk_mq_delay_run_hw_queue(hctx, async, 0); @@ -1618,7 +1621,7 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie, bool may_sleep) + blk_qc_t *cookie) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1668,25 +1671,20 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, } insert: - blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); + blk_mq_sched_insert_request(rq, false, run_queue, false, + hctx->flags & BLK_MQ_F_BLOCKING); } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - __blk_mq_try_issue_directly(hctx, rq, cookie, false); - rcu_read_unlock(); - } else { - unsigned int srcu_idx; + int srcu_idx; - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - __blk_mq_try_issue_directly(hctx, rq, cookie, true); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + __blk_mq_try_issue_directly(hctx, rq, cookie); + hctx_unlock(hctx, srcu_idx); } static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) -- cgit v1.2.3-58-ga151 From 5197c05e16b49885cc9086f1676455371e821b0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:47 -0800 Subject: blk-mq: protect completion path with RCU Currently, blk-mq protects only the issue path with RCU. This patch puts the completion path under the same RCU protection. This will be used to synchronize issue/completion against timeout by later patches, which will also add the comments. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bd7c47eb2923..f5e57c80a82b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -584,11 +584,16 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) void blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); + int srcu_idx; if (unlikely(blk_should_fake_timeout(q))) return; + + hctx_lock(hctx, &srcu_idx); if (!blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); + hctx_unlock(hctx, srcu_idx); } EXPORT_SYMBOL(blk_mq_complete_request); -- cgit v1.2.3-58-ga151 From 1d9bd5161ba32db5665a617edc8b0723880f543e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:48 -0800 Subject: blk-mq: replace timeout synchronization with a RCU and generation based scheme Currently, blk-mq timeout path synchronizes against the usual issue/completion path using a complex scheme involving atomic bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence rules. Unfortunately, it contains quite a few holes. There's a complex dancing around REQ_ATOM_STARTED and REQ_ATOM_COMPLETE between issue/completion and timeout paths; however, they don't have a synchronization point across request recycle instances and it isn't clear what the barriers add. blk_mq_check_expired() can easily read STARTED from N-2'th iteration, deadline from N-1'th, blk_mark_rq_complete() against Nth instance. In fact, it's pretty easy to make blk_mq_check_expired() terminate a later instance of a request. If we induce 5 sec delay before time_after_eq() test in blk_mq_check_expired(), shorten the timeout to 2s, and issue back-to-back large IOs, blk-mq starts timing out requests spuriously pretty quickly. Nothing actually timed out. It just made the call on a recycle instance of a request and then terminated a later instance long after the original instance finished. The scenario isn't theoretical either. This patch replaces the broken synchronization mechanism with a RCU and generation number based one. 1. Each request has a u64 generation + state value, which can be updated only by the request owner. Whenever a request becomes in-flight, the generation number gets bumped up too. This provides the basis for the timeout path to distinguish different recycle instances of the request. Also, marking a request in-flight and setting its deadline are protected with a seqcount so that the timeout path can fetch both values coherently. 2. The timeout path fetches the generation, state and deadline. If the verdict is timeout, it records the generation into a dedicated request abortion field and does RCU wait. 3. The completion path is also protected by RCU (from the previous patch) and checks whether the current generation number and state match the abortion field. If so, it skips completion. 4. The timeout path, after RCU wait, scans requests again and terminates the ones whose generation and state still match the ones requested for abortion. By now, the timeout path knows that either the generation number and state changed if it lost the race or the completion will yield to it and can safely timeout the request. While it's more lines of code, it's conceptually simpler, doesn't depend on direct use of subtle memory ordering or coherence, and hopefully doesn't terminate the wrong instance. While this change makes REQ_ATOM_COMPLETE synchronization unnecessary between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't removed yet as it's still used in other places. Future patches will move all state tracking to the new mechanism and remove all bitops in the hot paths. Note that this patch adds a comment explaining a race condition in BLK_EH_RESET_TIMER path. The race has always been there and this patch doesn't change it. It's just documenting the existing race. v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao. - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter. - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter. v3: - Fixed possible extended seqcount / u64_stats_sync read looping spotted by Peter. - MQ_RQ_IDLE was incorrectly being set in complete_request instead of free_request. Fixed. v4: - Rebased on top of hctx_lock() refactoring patch. - Added comment explaining the use of hctx_lock() in completion path. v5: - Added comments requested by Bart. - Note the addition of BLK_EH_RESET_TIMER race condition in the commit message. Signed-off-by: Tejun Heo Cc: "jianchao.wang" Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 + block/blk-mq.c | 229 +++++++++++++++++++++++++++++++++---------------- block/blk-mq.h | 46 ++++++++++ block/blk-timeout.c | 2 +- block/blk.h | 6 -- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 23 +++++ 7 files changed, 230 insertions(+), 79 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2e0d041e2daf..f843ae4f858d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); } EXPORT_SYMBOL(blk_rq_init); diff --git a/block/blk-mq.c b/block/blk-mq.c index f5e57c80a82b..156203876c8c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq) if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { @@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); } +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ + unsigned long flags; + + /* + * blk_mq_rq_aborted_gstate() is used from the completion path and + * can thus be called from irq context. u64_stats_fetch in the + * middle of update on the same CPU leads to lockup. Disable irq + * while updating. + */ + local_irq_save(flags); + u64_stats_update_begin(&rq->aborted_gstate_sync); + rq->aborted_gstate = gstate; + u64_stats_update_end(&rq->aborted_gstate_sync); + local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ + unsigned int start; + u64 aborted_gstate; + + do { + start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); + aborted_gstate = rq->aborted_gstate; + } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + + return aborted_gstate; +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq) if (unlikely(blk_should_fake_timeout(q))) return; + /* + * If @rq->aborted_gstate equals the current instance, timeout is + * claiming @rq and we lost. This is synchronized through + * hctx_lock(). See blk_mq_timeout_work() for details. + * + * Completion path never blocks and we can directly use RCU here + * instead of hctx_lock() which can be either RCU or SRCU. + * However, that would complicate paths which want to synchronize + * against us. Let stay in sync with the issue path so that + * hctx_lock() covers both issue and completion paths. + */ hctx_lock(hctx, &srcu_idx); - if (!blk_mark_rq_complete(rq)) + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate && + !blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); hctx_unlock(hctx, srcu_idx); } @@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq) wbt_issue(q->rq_wb, &rq->issue_stat); } - blk_add_timer(rq); - + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* - * Mark us as started and clear complete. Complete might have been - * set if requeue raced with timeout, which then marked it as - * complete. So be sure to clear complete again when we start - * the request, otherwise we'll ignore the completion event. + * Mark @rq in-flight which also advances the generation number, + * and register for timeout. Protect with a seqcount to allow the + * timeout path to read both @rq->gstate and @rq->deadline + * coherently. * - * Ensure that ->deadline is visible before we set STARTED, such that - * blk_mq_check_expired() is guaranteed to observe our ->deadline when - * it observes STARTED. + * This is the only place where a request is marked in-flight. If + * the timeout path reads an in-flight @rq->gstate, the + * @rq->deadline it reads together under @rq->gstate_seq is + * guaranteed to be the matching one. */ - smp_wmb(); + preempt_disable(); + write_seqcount_begin(&rq->gstate_seq); + + blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); + blk_add_timer(rq); + + write_seqcount_end(&rq->gstate_seq); + preempt_enable(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { - /* - * Coherence order guarantees these consecutive stores to a - * single variable propagate in the specified order. Thus the - * clear_bit() is ordered _after_ the set bit. See - * blk_mq_check_expired(). - * - * (the bits must be part of the same byte for this to be - * true). - */ + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); - } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { unsigned long next; unsigned int next_set; + unsigned int nr_expired; }; void blk_mq_rq_timed_out(struct request *req, bool reserved) @@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) __blk_mq_complete_request(req); break; case BLK_EH_RESET_TIMER: + /* + * As nothing prevents from completion happening while + * ->aborted_gstate is set, this may lead to ignored + * completions and further spurious timeouts. + */ + blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); blk_clear_rq_complete(req); break; @@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - unsigned long deadline; + unsigned long gstate, deadline; + int start; + + might_sleep(); if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; - /* - * Ensures that if we see STARTED we must also see our - * up-to-date deadline, see blk_mq_start_request(). - */ - smp_rmb(); - - deadline = READ_ONCE(rq->deadline); + /* read coherent snapshots of @rq->state_gen and @rq->deadline */ + while (true) { + start = read_seqcount_begin(&rq->gstate_seq); + gstate = READ_ONCE(rq->gstate); + deadline = rq->deadline; + if (!read_seqcount_retry(&rq->gstate_seq, start)) + break; + cond_resched(); + } - /* - * The rq being checked may have been freed and reallocated - * out already here, we avoid this race by checking rq->deadline - * and REQ_ATOM_COMPLETE flag together: - * - * - if rq->deadline is observed as new value because of - * reusing, the rq won't be timed out because of timing. - * - if rq->deadline is observed as previous value, - * REQ_ATOM_COMPLETE flag won't be cleared in reuse path - * because we put a barrier between setting rq->deadline - * and clearing the flag in blk_mq_start_request(), so - * this rq won't be timed out too. - */ - if (time_after_eq(jiffies, deadline)) { - if (!blk_mark_rq_complete(rq)) { - /* - * Again coherence order ensures that consecutive reads - * from the same variable must be in that order. This - * ensures that if we see COMPLETE clear, we must then - * see STARTED set and we'll ignore this timeout. - * - * (There's also the MB implied by the test_and_clear()) - */ - blk_mq_rq_timed_out(rq, reserved); - } + /* if in-flight && overdue, mark for abortion */ + if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && + time_after_eq(jiffies, deadline)) { + blk_mq_rq_update_aborted_gstate(rq, gstate); + data->nr_expired++; + hctx->nr_expired++; } else if (!data->next_set || time_after(data->next, deadline)) { data->next = deadline; data->next_set = 1; } } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + /* + * We marked @rq->aborted_gstate and waited for RCU. If there were + * completions that we lost to, they would have finished and + * updated @rq->gstate by now; otherwise, the completion path is + * now guaranteed to see @rq->aborted_gstate and yield. If + * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + */ + if (READ_ONCE(rq->gstate) == rq->aborted_gstate && + !blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq, reserved); +} + static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = @@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, + .nr_expired = 0, }; + struct blk_mq_hw_ctx *hctx; int i; /* A deadlock might occur if a request is stuck requiring a @@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; + /* scan for the expired ones and set their ->aborted_gstate */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); + if (data.nr_expired) { + bool has_rcu = false; + + /* + * Wait till everyone sees ->aborted_gstate. The + * sequential waits for SRCUs aren't ideal. If this ever + * becomes a problem, we can add per-hw_ctx rcu_head and + * wait in parallel. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_expired) + continue; + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + has_rcu = true; + else + synchronize_srcu(hctx->queue_rq_srcu); + + hctx->nr_expired = 0; + } + if (has_rcu) + synchronize_rcu(); + + /* terminate the ones we won */ + blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); + } + if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - struct blk_mq_hw_ctx *hctx; - queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); + return 0; +} + int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { @@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct request *rq = p; tags->static_rqs[i] = rq; - if (set->ops->init_request) { - if (set->ops->init_request(set, rq, hctx_idx, - node)) { - tags->static_rqs[i] = NULL; - goto fail; - } + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; } p += rq_size; @@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->fq) goto sched_exit_hctx; - if (set->ops->init_request && - set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, - node)) + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) @@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) static int __init blk_mq_init(void) { - /* - * See comment in block/blk.h rq_atomic_flags enum - */ - BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != - (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff5bf62..cf01f6f8c73d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,6 +27,19 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +/* + * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value + * and the upper bits the generation number. + */ +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + + MQ_RQ_STATE_BITS = 2, + MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, + MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, +}; + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline int blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; +} + +/** + * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request + * @rq: target request. + * @state: new state to set. + * + * Set @rq's state to @state. The caller is responsible for ensuring that + * there are no other updaters. A request can transition into IN_FLIGHT + * only from IDLE and doing so increments the generation number. + */ +static inline void blk_mq_rq_update_state(struct request *rq, + enum mq_rq_state state) +{ + u64 old_val = READ_ONCE(rq->gstate); + u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; + + if (state == MQ_RQ_IN_FLIGHT) { + WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); + new_val += MQ_RQ_GEN_INC; + } + + /* avoid exposing interim values */ + WRITE_ONCE(rq->gstate, new_val); +} + static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9aeb30..6427be7ac363 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -208,7 +208,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - WRITE_ONCE(req->deadline, jiffies + req->timeout); + req->deadline = jiffies + req->timeout; /* * Only the non-mq case needs to add the request to a protected list. diff --git a/block/blk.h b/block/blk.h index 3f1446937aec..9cb2739edb6a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { - /* - * Keep these two bits first - not because we depend on the - * value of them, but we do depend on them being in the same - * byte of storage to ensure ordering on writes. Keeping them - * first will achieve that nicely. - */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c9a5c862e2..460798dbac1f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; + unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46e606f5b44b..ae563d01b29d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,8 @@ #include #include #include +#include +#include struct module; struct scsi_ioctl_command; @@ -230,6 +232,27 @@ struct request { unsigned short write_hint; + /* + * On blk-mq, the lower bits of ->gstate (generation number and + * state) carry the MQ_RQ_* state value and the upper bits the + * generation number which is monotonically incremented and used to + * distinguish the reuse instances. + * + * ->gstate_seq allows updates to ->gstate and other fields + * (currently ->deadline) during request start to be read + * atomically from the timeout path, so that it can operate on a + * coherent set of information. + */ + seqcount_t gstate_seq; + u64 gstate; + + /* + * ->aborted_gstate is used by the timeout to claim a specific + * recycle instance of this request. See blk_mq_timeout_work(). + */ + struct u64_stats_sync aborted_gstate_sync; + u64 aborted_gstate; + unsigned long deadline; struct list_head timeout_list; -- cgit v1.2.3-58-ga151 From 67818d25738b1c9ffb8541ca875b2ae3304869d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:49 -0800 Subject: blk-mq: use blk_mq_rq_state() instead of testing REQ_ATOM_COMPLETE blk_mq_check_inflight() and blk_mq_poll_hybrid_sleep() test REQ_ATOM_COMPLETE to determine the request state. Both uses are speculative and we can test REQ_ATOM_STARTED and blk_mq_rq_state() for equivalent results. Replace the tests. This will allow removing REQ_ATOM_COMPLETE usages from blk-mq. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 156203876c8c..50dda2ff0d85 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && - !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { /* * index[0] counts the specific partition that was asked * for. index[1] counts the ones that are active on the @@ -3024,7 +3023,8 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_sleeper(&hs, current); do { - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && + blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) break; set_current_state(TASK_UNINTERRUPTIBLE); hrtimer_start_expires(&hs.timer, mode); -- cgit v1.2.3-58-ga151 From 358f70da49d77c43f2ca11b5da584213b2add29c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:50 -0800 Subject: blk-mq: make blk_abort_request() trigger timeout path With issue/complete and timeout paths now using the generation number and state based synchronization, blk_abort_request() is the only one which depends on REQ_ATOM_COMPLETE for arbitrating completion. There's no reason for blk_abort_request() to be a completely separate path. This patch makes blk_abort_request() piggyback on the timeout path instead of trying to terminate the request directly. This removes the last dependency on REQ_ATOM_COMPLETE in blk-mq. Note that this makes blk_abort_request() asynchronous - it initiates abortion but the actual termination will happen after a short while, even when the caller owns the request. AFAICS, SCSI and ATA should be fine with that and I think mtip32xx and dasd should be safe but not completely sure. It'd be great if people who know the drivers take a look. v2: - Add comment explaining the lack of synchronization around ->deadline update as requested by Bart. Signed-off-by: Tejun Heo Cc: Asai Thambi SP Cc: Stefan Haberland Cc: Jan Hoeppner Cc: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-mq.h | 2 -- block/blk-timeout.c | 13 +++++++++---- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 50dda2ff0d85..90f6910a83f6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -820,7 +820,7 @@ struct blk_mq_timeout_data { unsigned int nr_expired; }; -void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req, bool reserved) { const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; diff --git a/block/blk-mq.h b/block/blk-mq.h index cf01f6f8c73d..6b2d61629d48 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -94,8 +94,6 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); -extern void blk_mq_rq_timed_out(struct request *req, bool reserved); - void blk_mq_release(struct request_queue *q); /** diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 6427be7ac363..4f04cd1e0b74 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -156,12 +156,17 @@ void blk_timeout_work(struct work_struct *work) */ void blk_abort_request(struct request *req) { - if (blk_mark_rq_complete(req)) - return; - if (req->q->mq_ops) { - blk_mq_rq_timed_out(req, false); + /* + * All we need to ensure is that timeout scan takes place + * immediately and that scan sees the new timeout value. + * No need for fancy synchronizations. + */ + req->deadline = jiffies; + mod_timer(&req->q->timeout, 0); } else { + if (blk_mark_rq_complete(req)) + return; blk_delete_timer(req); blk_rq_timed_out(req); } -- cgit v1.2.3-58-ga151 From 634f9e4631a88025d3b90c1884e9a1b6a13d01d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:51 -0800 Subject: blk-mq: remove REQ_ATOM_COMPLETE usages from blk-mq After the recent updates to use generation number and state based synchronization, blk-mq no longer depends on REQ_ATOM_COMPLETE except to avoid firing the same timeout multiple times. Remove all REQ_ATOM_COMPLETE usages and use a new rq_flags flag RQF_MQ_TIMEOUT_EXPIRED to avoid firing the same timeout multiple times. This removes atomic bitops from hot paths too. v2: Removed blk_clear_rq_complete() from blk_mq_rq_timed_out(). v3: Added RQF_MQ_TIMEOUT_EXPIRED flag. Signed-off-by: Tejun Heo Cc: "jianchao.wang" Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 +++++++-------- block/blk-timeout.c | 1 + include/linux/blkdev.h | 2 ++ 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 90f6910a83f6..d1000c6cbec6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -634,8 +634,7 @@ void blk_mq_complete_request(struct request *rq) * hctx_lock() covers both issue and completion paths. */ hctx_lock(hctx, &srcu_idx); - if (blk_mq_rq_aborted_gstate(rq) != rq->gstate && - !blk_mark_rq_complete(rq)) + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) __blk_mq_complete_request(rq); hctx_unlock(hctx, srcu_idx); } @@ -685,8 +684,6 @@ void blk_mq_start_request(struct request *rq) preempt_enable(); set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -837,6 +834,8 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) return; + req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; + if (ops->timeout) ret = ops->timeout(req, reserved); @@ -852,7 +851,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) */ blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); - blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: break; @@ -871,7 +869,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, might_sleep(); - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + if ((rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) || + !test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; /* read coherent snapshots of @rq->state_gen and @rq->deadline */ @@ -906,8 +905,8 @@ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, * now guaranteed to see @rq->aborted_gstate and yield. If * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. */ - if (READ_ONCE(rq->gstate) == rq->aborted_gstate && - !blk_mark_rq_complete(rq)) + if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && + READ_ONCE(rq->gstate) == rq->aborted_gstate) blk_mq_rq_timed_out(rq, reserved); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4f04cd1e0b74..ebe99963386c 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -214,6 +214,7 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* * Only the non-mq case needs to add the request to a protected list. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ae563d01b29d..007a7cf1f262 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -125,6 +125,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* timeout is expired */ +#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3-58-ga151 From 5a61c36398d0626bad377a7f5b9391b21e16e91d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:52 -0800 Subject: blk-mq: remove REQ_ATOM_STARTED After the recent updates to use generation number and state based synchronization, we can easily replace REQ_ATOM_STARTED usages by adding an extra state to distinguish completed but not yet freed state. Add MQ_RQ_COMPLETE and replace REQ_ATOM_STARTED usages with blk_mq_rq_state() tests. REQ_ATOM_STARTED no longer has any users left and is removed. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 4 +--- block/blk-mq.c | 37 ++++++++----------------------------- block/blk-mq.h | 1 + block/blk.h | 1 - 4 files changed, 10 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b56a4f35720d..8adc83786256 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -271,7 +271,6 @@ static const char *const cmd_flag_name[] = { #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { RQF_NAME(SORTED), - RQF_NAME(STARTED), RQF_NAME(QUEUED), RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), @@ -295,7 +294,6 @@ static const char *const rqf_name[] = { #define RQAF_NAME(name) [REQ_ATOM_##name] = #name static const char *const rqaf_name[] = { RQAF_NAME(COMPLETE), - RQAF_NAME(STARTED), RQAF_NAME(POLL_SLEPT), }; #undef RQAF_NAME @@ -409,7 +407,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) const struct show_busy_params *params = data; if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && - test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + blk_mq_rq_state(rq) != MQ_RQ_IDLE) __blk_mq_debugfs_rq_show(params->m, list_entry_rq(&rq->queuelist)); } diff --git a/block/blk-mq.c b/block/blk-mq.c index d1000c6cbec6..275812909d77 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -483,7 +483,6 @@ void blk_mq_free_request(struct request *rq) blk_put_rl(blk_rq_rl(rq)); blk_mq_rq_update_state(rq, MQ_RQ_IDLE); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); @@ -531,6 +530,7 @@ static void __blk_mq_complete_request(struct request *rq) int cpu; WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); @@ -642,7 +642,7 @@ EXPORT_SYMBOL(blk_mq_complete_request); int blk_mq_request_started(struct request *rq) { - return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } EXPORT_SYMBOL_GPL(blk_mq_request_started); @@ -661,7 +661,6 @@ void blk_mq_start_request(struct request *rq) } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); - WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* * Mark @rq in-flight which also advances the generation number, @@ -683,8 +682,6 @@ void blk_mq_start_request(struct request *rq) write_seqcount_end(&rq->gstate_seq); preempt_enable(); - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (q->dma_drain_size && blk_rq_bytes(rq)) { /* * Make sure space for the drain appears. We know we can do @@ -697,13 +694,9 @@ void blk_mq_start_request(struct request *rq) EXPORT_SYMBOL(blk_mq_start_request); /* - * When we reach here because queue is busy, REQ_ATOM_COMPLETE - * flag isn't set yet, so there may be race with timeout handler, - * but given rq->deadline is just set in .queue_rq() under - * this situation, the race won't be possible in reality because - * rq->timeout should be set as big enough to cover the window - * between blk_mq_start_request() called from .queue_rq() and - * clearing REQ_ATOM_STARTED here. + * When we reach here because queue is busy, it's safe to change the state + * to IDLE without checking @rq->aborted_gstate because we should still be + * holding the RCU read lock and thus protected against timeout. */ static void __blk_mq_requeue_request(struct request *rq) { @@ -715,7 +708,7 @@ static void __blk_mq_requeue_request(struct request *rq) wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); - if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; @@ -822,18 +815,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - /* - * We know that complete is set at this point. If STARTED isn't set - * anymore, then the request isn't active and the "timeout" should - * just be ignored. This can happen due to the bitflag ordering. - * Timeout first checks if STARTED is set, and if it is, assumes - * the request is active. But if we race with completion, then - * both flags will get cleared. So check here again, and ignore - * a timeout event with a request that isn't active. - */ - if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) - return; - req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; if (ops->timeout) @@ -869,8 +850,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, might_sleep(); - if ((rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) || - !test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) return; /* read coherent snapshots of @rq->state_gen and @rq->deadline */ @@ -3022,8 +3002,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_sleeper(&hs, current); do { - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && - blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) + if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); hrtimer_start_expires(&hs.timer, mode); diff --git a/block/blk-mq.h b/block/blk-mq.h index 6b2d61629d48..8591a54d989b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -34,6 +34,7 @@ struct blk_mq_ctx { enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, + MQ_RQ_COMPLETE = 2, MQ_RQ_STATE_BITS = 2, MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, diff --git a/block/blk.h b/block/blk.h index 9cb2739edb6a..a68dbe312ea3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -124,7 +124,6 @@ void blk_account_io_done(struct request *req); */ enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, - REQ_ATOM_STARTED, REQ_ATOM_POLL_SLEPT, }; -- cgit v1.2.3-58-ga151 From 05707b64aed8f5f1674b25334fb720d651459d5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:53 -0800 Subject: blk-mq: rename blk_mq_hw_ctx->queue_rq_srcu to ->srcu The RCU protection has been expanded to cover both queueing and completion paths making ->queue_rq_srcu a misnomer. Rename it to ->srcu as suggested by Bart. Signed-off-by: Tejun Heo Cc: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++------- include/linux/blk-mq.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 275812909d77..0269d44d512e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -219,7 +219,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); else rcu = true; } @@ -564,7 +564,7 @@ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) rcu_read_unlock(); else - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); + srcu_read_unlock(hctx->srcu, srcu_idx); } static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) @@ -572,7 +572,7 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) rcu_read_lock(); else - *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); + *srcu_idx = srcu_read_lock(hctx->srcu); } static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) @@ -937,7 +937,7 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) has_rcu = true; else - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); hctx->nr_expired = 0; } @@ -2101,7 +2101,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -2174,7 +2174,7 @@ static int blk_mq_init_hctx(struct request_queue *q, goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->queue_rq_srcu); + init_srcu_struct(hctx->srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2463,7 +2463,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) { int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), __alignof__(struct blk_mq_hw_ctx)) != sizeof(struct blk_mq_hw_ctx)); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 460798dbac1f..8efcf49796a3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -66,7 +66,7 @@ struct blk_mq_hw_ctx { #endif /* Must be the last member - see also blk_mq_hw_ctx_size(). */ - struct srcu_struct queue_rq_srcu[0]; + struct srcu_struct srcu[0]; }; struct blk_mq_tag_set { -- cgit v1.2.3-58-ga151 From 08b5a6e2a769f720977b245431b45134c0bdd377 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 09:32:25 -0700 Subject: blk-mq: silence false positive warnings in hctx_unlock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some stupider versions of gcc, it complains: block/blk-mq.c: In function ‘blk_mq_complete_request’: ./include/linux/srcu.h:175:2: warning: ‘srcu_idx’ may be used uninitialized in this function [-Wmaybe-uninitialized] __srcu_read_unlock(sp, idx); ^ block/blk-mq.c:620:6: note: ‘srcu_idx’ was declared here int srcu_idx; ^ which is completely bogus, since we only use srcu_idx when hctx->flags & BLK_MQ_F_BLOCKING is set, and that's the case where hctx_lock() has initialized it. Just set it to '0' in the normal path in hctx_lock() to silence this annoying warning. Fixes: 04ced159cec8 ("blk-mq: move hctx lock/unlock into a helper") Fixes: 5197c05e16b4 ("blk-mq: protect completion path with RCU") Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 0269d44d512e..8de354606690 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -569,9 +569,11 @@ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + /* shut up gcc false positive */ + *srcu_idx = 0; rcu_read_lock(); - else + } else *srcu_idx = srcu_read_lock(hctx->srcu); } -- cgit v1.2.3-58-ga151 From ee3e4de525aad5d9b2ef1fdd28341587a97d740e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Jan 2018 10:09:15 -0800 Subject: blk-mq: Fix spelling in a source code comment Change "nedeing" into "needing" and "caes" into "cases". Fixes: commit f906a6a0f426 ("blk-mq: improve tag waiting setup for non-shared tags") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8de354606690..9aa24c9508f9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1089,8 +1089,8 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, /* * Mark us waiting for a tag. For shared tags, this involves hooking us into - * the tag wakeups. For non-shared tags, we can simply mark us nedeing a - * restart. For both caes, take care to check the condition again after + * the tag wakeups. For non-shared tags, we can simply mark us needing a + * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, -- cgit v1.2.3-58-ga151 From aa98192dead2027a8f20a609472cdd7caf15dae4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Jan 2018 10:11:00 -0800 Subject: block: Fix kernel-doc warnings reported when building with W=1 Commit 3a025e1d1c2e ("Add optional check for bad kernel-doc comments") causes W=1 the kernel-doc script to be run and thereby causes several new warnings to appear when building the kernel with W=1. Fix the block layer kernel-doc headers such that the block layer again builds cleanly with W=1. Signed-off-by: Bart Van Assche Cc: Martin K. Petersen Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bsg-lib.c | 3 ++- block/scsi_ioctl.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 15d25ccd51a5..1474153f73e3 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -30,7 +30,7 @@ /** * bsg_teardown_job - routine to teardown a bsg job - * @job: bsg_job that is to be torn down + * @kref: kref inside bsg_job that is to be torn down */ static void bsg_teardown_job(struct kref *kref) { @@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) * @name: device to give bsg device * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job + * @release: @dev release function */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, bsg_job_fn *job_fn, int dd_job_size, diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index edcfff974527..5cddff44a2f8 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -384,9 +384,10 @@ out_put_request: /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl - * @file: file this ioctl operates on (optional) * @q: request queue to send scsi commands down * @disk: gendisk to operate on (option) + * @mode: mode used to open the file through which the ioctl has been + * submitted * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below @@ -415,10 +416,10 @@ out_put_request: * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. */ -#define OMAX_SB_LEN 16 /* For backward compatibility */ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, struct scsi_ioctl_command __user *sic) { + enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ struct request *rq; struct scsi_request *req; int err; -- cgit v1.2.3-58-ga151 From 8abef10b3de1144cfe968f454946f13eb1ac3d0a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 12:20:51 -0700 Subject: bfq-iosched: don't call bfqg_and_blkg_put for !CONFIG_BFQ_GROUP_IOSCHED It's not available if we don't have group io scheduling set, and there's no need to call it. Fixes: 0d52af590552 ("block, bfq: release oom-queue ref to root group on exit") Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7c0b7f60811c..5e6f837f663e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4893,10 +4893,10 @@ static void bfq_exit_queue(struct elevator_queue *e) hrtimer_cancel(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* release oom-queue reference to root group */ bfqg_and_blkg_put(bfqd->root_group); -#ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else spin_lock_irq(&bfqd->lock); -- cgit v1.2.3-58-ga151 From b4b6cb613519b7449da510bccf08986371b328cb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 10 Jan 2018 10:51:29 +0800 Subject: Revert "block: blk-merge: try to make front segments in full size" This reverts commit a2d37968d784363842f87820a21e106741d28004. If max segment size isn't 512-aligned, this patch won't work well. Also once multipage bvec is enabled, adjacent bvecs won't be physically contiguous if page is added via bio_add_page(), so we don't need this kind of complicated logic. Reported-by: Dmitry Osipenko Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 54 +++++------------------------------------------------- 1 file changed, 5 insertions(+), 49 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 446f63e076aa..8452fc7164cc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -109,7 +109,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); - unsigned advance = 0; bio_for_each_segment(bv, bio, iter) { /* @@ -133,32 +132,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, } if (bvprvp && blk_queue_cluster(q)) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) + goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) goto new_segment; - if (seg_size + bv.bv_len > queue_max_segment_size(q)) { - /* - * One assumption is that initial value of - * @seg_size(equals to bv.bv_len) won't be - * bigger than max segment size, but this - * becomes false after multipage bvecs. - */ - advance = queue_max_segment_size(q) - seg_size; - - if (advance > 0) { - seg_size += advance; - sectors += advance >> 9; - bv.bv_len -= advance; - bv.bv_offset += advance; - } - - /* - * Still need to put remainder of current - * bvec into a new segment. - */ - goto new_segment; - } seg_size += bv.bv_len; bvprv = bv; @@ -180,12 +159,6 @@ new_segment: seg_size = bv.bv_len; sectors += bv.bv_len >> 9; - /* restore the bvec for iterator */ - if (advance) { - bv.bv_len += advance; - bv.bv_offset -= advance; - advance = 0; - } } do_split = false; @@ -386,29 +359,16 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, { int nbytes = bvec->bv_len; - unsigned advance = 0; if (*sg && *cluster) { + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) goto new_segment; - /* - * try best to merge part of the bvec into previous - * segment and follow same policy with - * blk_bio_segment_split() - */ - if ((*sg)->length + nbytes > queue_max_segment_size(q)) { - advance = queue_max_segment_size(q) - (*sg)->length; - if (advance) { - (*sg)->length += advance; - bvec->bv_offset += advance; - bvec->bv_len -= advance; - } - goto new_segment; - } - (*sg)->length += nbytes; } else { new_segment: @@ -431,10 +391,6 @@ new_segment: sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); (*nsegs)++; - - /* for making iterator happy */ - bvec->bv_offset -= advance; - bvec->bv_len += advance; } *bvprv = *bvec; } -- cgit v1.2.3-58-ga151 From 8993d445df388e3541f48920a2353cfc904b220a Mon Sep 17 00:00:00 2001 From: Chiara Bruschi Date: Mon, 18 Dec 2017 16:21:59 +0000 Subject: block, bfq: fix occurrences of request finish method's old name Commit '7b9e93616399' ("blk-mq-sched: unify request finished methods") changed the old name of current bfq_finish_request method, but left it unchanged elsewhere in the code (related comments, part of function name bfq_put_rq_priv_body). This commit fixes all occurrences of the old name of this method by changing them into the current name. Fixes: 7b9e93616399 ("blk-mq-sched: unify request finished methods") Reviewed-by: Paolo Valente Signed-off-by: Federico Motta Signed-off-by: Chiara Bruschi Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5e6f837f663e..f352b1677143 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3684,8 +3684,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) } /* - * We exploit the put_rq_private hook to decrement - * rq_in_driver, but put_rq_private will not be + * We exploit the bfq_finish_request hook to decrement + * rq_in_driver, but bfq_finish_request will not be * invoked on this request. So, to avoid unbalance, * just start this request, without incrementing * rq_in_driver. As a negative consequence, @@ -3694,14 +3694,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * bfq_schedule_dispatch to be invoked uselessly. * * As for implementing an exact solution, the - * put_request hook, if defined, is probably invoked - * also on this request. So, by exploiting this hook, - * we could 1) increment rq_in_driver here, and 2) - * decrement it in put_request. Such a solution would - * let the value of the counter be always accurate, - * but it would entail using an extra interface - * function. This cost seems higher than the benefit, - * being the frequency of non-elevator-private + * bfq_finish_request hook, if defined, is probably + * invoked also on this request. So, by exploiting + * this hook, we could 1) increment rq_in_driver here, + * and 2) decrement it in bfq_finish_request. Such a + * solution would let the value of the counter be + * always accurate, but it would entail using an extra + * interface function. This cost seems higher than the + * benefit, being the frequency of non-elevator-private * requests very low. */ goto start_rq; @@ -4558,7 +4558,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) +static void bfq_finish_request_body(struct bfq_queue *bfqq) { bfqq->allocated--; @@ -4588,7 +4588,7 @@ static void bfq_finish_request(struct request *rq) spin_lock_irqsave(&bfqd->lock, flags); bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); + bfq_finish_request_body(bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } else { @@ -4609,7 +4609,7 @@ static void bfq_finish_request(struct request *rq) bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } - bfq_put_rq_priv_body(bfqq); + bfq_finish_request_body(bfqq); } rq->elv.priv[0] = NULL; -- cgit v1.2.3-58-ga151 From 5f15684bd5e5ef39d4337988864fec8012471dda Mon Sep 17 00:00:00 2001 From: Richard Narron Date: Wed, 10 Jan 2018 09:12:16 -0700 Subject: partitions/msdos: Unable to mount UFS 44bsd partitions UFS partitions from newer versions of FreeBSD 10 and 11 use relative addressing for their subpartitions. But older versions of FreeBSD still use absolute addressing just like OpenBSD and NetBSD. Instead of simply testing for a FreeBSD partition, the code needs to also test if the starting offset of the C subpartition is zero. https://bugzilla.kernel.org/show_bug.cgi?id=197733 Signed-off-by: Richard Narron Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 0af3a3db6fb0..82c44f7df911 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state, continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); - if (memcmp(flavour, "bsd\0", 4) == 0) + /* FreeBSD has relative offset if C partition offset is zero */ + if (memcmp(flavour, "bsd\0", 4) == 0 && + le32_to_cpu(l->d_partitions[2].p_offset) == 0) bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ -- cgit v1.2.3-58-ga151 From fcd36c36f381f534adad5a6c6485db4405d5ea42 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 10 Jan 2018 08:33:33 -0800 Subject: blk-mq: Explain when 'active_queues' is decremented It is nontrivial to derive from the blk-mq source code when blk_mq_tags.active_queues is decremented. Hence add a comment that explains this. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9aa24c9508f9..266fc4f6b046 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -954,6 +954,12 @@ static void blk_mq_timeout_work(struct work_struct *work) data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + /* + * Request timeouts are handled as a forward rolling timer. If + * we end up here it means that no requests are pending and + * also that no request has been pending for a while. Mark + * each hctx as idle. + */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) -- cgit v1.2.3-58-ga151 From 5d75d3f2e736d6c8be79b677e10edb6af1bf7ed6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:30:08 -0700 Subject: blk-mq: add a few missing debugfs RQF_ flags We are missing ZONE_WRITE_LOCKED and MQ_TIMEOUT_EXPIRED, add them so the debugfs bits can decode them. Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 8adc83786256..25d41151073d 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -288,6 +288,8 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), + RQF_NAME(ZONE_WRITE_LOCKED), + RQF_NAME(MQ_TIMEOUT_EXPIRED), }; #undef RQF_NAME -- cgit v1.2.3-58-ga151 From 76a86f9d027b342b8759a4b2f9f7fe046e284220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:30:56 -0700 Subject: block: remove REQ_ATOM_POLL_SLEPT We don't need this to be an atomic flag, it can be a regular flag. We either end up on the same CPU for the polling, in which case the state is sane, or we did the sleep which would imply the needed barrier to ensure we see the right state. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- block/blk-mq.c | 5 ++--- block/blk.h | 2 -- include/linux/blkdev.h | 2 ++ 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 25d41151073d..dd890d5e0fbd 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -290,13 +290,13 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(MQ_TIMEOUT_EXPIRED), + RQF_NAME(MQ_POLL_SLEPT), }; #undef RQF_NAME #define RQAF_NAME(name) [REQ_ATOM_##name] = #name static const char *const rqaf_name[] = { RQAF_NAME(COMPLETE), - RQAF_NAME(POLL_SLEPT), }; #undef RQAF_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 266fc4f6b046..3239ca9e199f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -483,7 +483,6 @@ void blk_mq_free_request(struct request *rq) blk_put_rl(blk_rq_rl(rq)); blk_mq_rq_update_state(rq, MQ_RQ_IDLE); - clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -2976,7 +2975,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, unsigned int nsecs; ktime_t kt; - if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (rq->rq_flags & RQF_MQ_POLL_SLEPT) return false; /* @@ -2996,7 +2995,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (!nsecs) return false; - set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + rq->rq_flags |= RQF_MQ_POLL_SLEPT; /* * This will be replaced with the stats tracking code, using diff --git a/block/blk.h b/block/blk.h index a68dbe312ea3..eb306c52121e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -124,8 +124,6 @@ void blk_account_io_done(struct request *req); */ enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, - - REQ_ATOM_POLL_SLEPT, }; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 007a7cf1f262..ba31674d8581 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* timeout is expired */ #define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3-58-ga151 From 0a72e7f44964b9ada3e5c15820372e9cb119bf80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 14:23:42 -0700 Subject: block: add accessors for setting/querying request deadline We reduce the resolution of request expiry, but since we're already using jiffies for this where resolution depends on the kernel configuration and since the timeout resolution is coarse anyway, that should be fine. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-timeout.c | 14 ++++++++------ block/blk.h | 15 +++++++++++++++ include/linux/blkdev.h | 4 +++- 4 files changed, 27 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3239ca9e199f..7035c305be45 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -858,7 +858,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, while (true) { start = read_seqcount_begin(&rq->gstate_seq); gstate = READ_ONCE(rq->gstate); - deadline = rq->deadline; + deadline = blk_rq_deadline(rq); if (!read_seqcount_retry(&rq->gstate_seq, start)) break; cond_resched(); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index ebe99963386c..a05e3676d24a 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req) static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { - if (time_after_eq(jiffies, rq->deadline)) { + const unsigned long deadline = blk_rq_deadline(rq); + + if (time_after_eq(jiffies, deadline)) { list_del_init(&rq->timeout_list); /* @@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout */ if (!blk_mark_rq_complete(rq)) blk_rq_timed_out(rq); - } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { - *next_timeout = rq->deadline; + } else if (!*next_set || time_after(*next_timeout, deadline)) { + *next_timeout = deadline; *next_set = 1; } } @@ -162,7 +164,7 @@ void blk_abort_request(struct request *req) * immediately and that scan sees the new timeout value. * No need for fancy synchronizations. */ - req->deadline = jiffies; + blk_rq_set_deadline(req, jiffies); mod_timer(&req->q->timeout, 0); } else { if (blk_mark_rq_complete(req)) @@ -213,7 +215,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + blk_rq_set_deadline(req, jiffies + req->timeout); req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* @@ -228,7 +230,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); + expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk.h b/block/blk.h index eb306c52121e..bcd9cf7db0d4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -236,6 +236,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) q->last_merge = NULL; } +/* + * Steal a bit from this field for legacy IO path atomic IO marking. Note that + * setting the deadline clears the bottom bit, potentially clearing the + * completed bit. The user has to be OK with this (current ones are fine). + */ +static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) +{ + rq->__deadline = time & ~0x1UL; +} + +static inline unsigned long blk_rq_deadline(struct request *rq) +{ + return rq->__deadline & ~0x1UL; +} + /* * Internal io_context interface */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba31674d8581..aa6698cf483c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -257,7 +257,9 @@ struct request { struct u64_stats_sync aborted_gstate_sync; u64 aborted_gstate; - unsigned long deadline; + /* access through blk_rq_set_deadline, blk_rq_deadline */ + unsigned long __deadline; + struct list_head timeout_list; /* -- cgit v1.2.3-58-ga151 From e14575b3d457f5806d79b85886ef94d9c29e3b2a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:34:25 -0700 Subject: block: convert REQ_ATOM_COMPLETE to stealing rq->__deadline bit We only have one atomic flag left. Instead of using an entire unsigned long for that, steal the bottom bit of the deadline field that we already reserved. Remove ->atomic_flags, since it's now unused. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq-debugfs.c | 9 +-------- block/blk-mq.c | 2 +- block/blk.h | 19 +++++++++---------- include/linux/blkdev.h | 2 -- 5 files changed, 12 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f843ae4f858d..7ba607527487 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2853,7 +2853,7 @@ void blk_start_request(struct request *req) wbt_issue(req->q->rq_wb, &req->issue_stat); } - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); + BUG_ON(blk_rq_is_complete(req)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index dd890d5e0fbd..19db3f583bf1 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -294,12 +294,6 @@ static const char *const rqf_name[] = { }; #undef RQF_NAME -#define RQAF_NAME(name) [REQ_ATOM_##name] = #name -static const char *const rqaf_name[] = { - RQAF_NAME(COMPLETE), -}; -#undef RQAF_NAME - int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -316,8 +310,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); - seq_puts(m, ", .atomic_flags="); - blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); + seq_printf(m, ", complete=%d", blk_rq_is_complete(rq)); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7035c305be45..87e6b10c8ecb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -294,7 +294,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); @@ -313,6 +312,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; + rq->__deadline = 0; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; diff --git a/block/blk.h b/block/blk.h index bcd9cf7db0d4..c84ae0e21ebd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -119,25 +119,24 @@ void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_done(struct request *req); -/* - * Internal atomic flags for request handling - */ -enum rq_atomic_flags { - REQ_ATOM_COMPLETE = 0, -}; - /* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds + * sure that only one of them succeeds. Steal the bottom bit of the + * __deadline field for this. */ static inline int blk_mark_rq_complete(struct request *rq) { - return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + return test_and_set_bit(0, &rq->__deadline); } static inline void blk_clear_rq_complete(struct request *rq) { - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + clear_bit(0, &rq->__deadline); +} + +static inline bool blk_rq_is_complete(struct request *rq) +{ + return test_bit(0, &rq->__deadline); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aa6698cf483c..d4b2f7bb18d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -156,8 +156,6 @@ struct request { int internal_tag; - unsigned long atomic_flags; - /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ int tag; -- cgit v1.2.3-58-ga151 From 7c3fb70f0341f9d924818e648906774921f4bcb3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:46:39 -0700 Subject: block: rearrange a few request fields for better cache layout Move completion related items (like the call single data) near the end of the struct, instead of mixing them in with the initial queueing related fields. Move queuelist below the bio structures. Then we have all queueing related bits in the first cache line. This yields a 1.5-2% increase in IOPS for a null_blk test, both for sync and for high thread count access. Sync test goes form 975K to 992K, 32-thread case from 20.8M to 21.2M IOPS. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 +++++++++--------- include/linux/blkdev.h | 28 +++++++++++++++------------- 2 files changed, 24 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 87e6b10c8ecb..435a5a0d441f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -270,8 +270,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - rq->rq_flags = 0; - if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; @@ -285,26 +283,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->hctx->tags->rqs[rq->tag] = rq; } - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->rq_flags = 0; + rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - rq->cpu = -1; + INIT_LIST_HEAD(&rq->queuelist); INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; rq->start_time = jiffies; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; - set_start_time_ns(rq); - rq->io_start_time_ns = 0; -#endif rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -321,6 +315,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io_data = NULL; rq->next_rq = NULL; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; + set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + data->ctx->rq_dispatched[op_is_sync(op)]++; return rq; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4b2f7bb18d6..71a9371c8182 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t; * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { - struct list_head queuelist; - union { - call_single_data_t csd; - u64 fifo_time; - }; - struct request_queue *q; struct blk_mq_ctx *mq_ctx; @@ -164,6 +158,8 @@ struct request { struct bio *bio; struct bio *biotail; + struct list_head queuelist; + /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used @@ -211,19 +207,16 @@ struct request { struct hd_struct *part; unsigned long start_time; struct blk_issue_stat issue_stat; -#ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; - unsigned long long io_start_time_ns; /* when passed to hardware */ -#endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; + #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif + unsigned short write_hint; unsigned short ioprio; unsigned int timeout; @@ -232,8 +225,6 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ - unsigned short write_hint; - /* * On blk-mq, the lower bits of ->gstate (generation number and * state) carry the MQ_RQ_* state value and the upper bits the @@ -260,6 +251,11 @@ struct request { struct list_head timeout_list; + union { + call_single_data_t csd; + u64 fifo_time; + }; + /* * completion callback. */ @@ -268,6 +264,12 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; + unsigned long long io_start_time_ns; /* when passed to hardware */ +#endif }; static inline bool blk_rq_is_scsi(struct request *rq) -- cgit v1.2.3-58-ga151 From 0478fe68685a428c71decc19abecd265a6d658dd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 10 Jan 2018 16:54:52 +0100 Subject: block: silently forbid sending any ioctl to a partition After the first few months, the message has not led to many bug reports. It's been almost five years now, and in practice the main source of it seems to be MTIOCGET that someone is using to detect tape devices. While we could whitelist it just like CDROM_GET_CAPABILITY, this patch just removes the message altogether. The patch also removes the "safe but not very useful" ioctl whitelist, as suggested by Christoph. I doubt anything is using most of those ioctls _in general_, let alone on a partition. Reviewed-by: Christoph Hellwig Signed-off-by: Paolo Bonzini Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 5cddff44a2f8..60b471f8621b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -693,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) if (bd && bd == bd->bd_contains) return 0; - /* Actually none of these is particularly useful on a partition, - * but they are safe. - */ - switch (cmd) { - case SCSI_IOCTL_GET_IDLUN: - case SCSI_IOCTL_GET_BUS_NUMBER: - case SCSI_IOCTL_GET_PCI: - case SCSI_IOCTL_PROBE_HOST: - case SG_GET_VERSION_NUM: - case SG_SET_TIMEOUT: - case SG_GET_TIMEOUT: - case SG_GET_RESERVED_SIZE: - case SG_SET_RESERVED_SIZE: - case SG_EMULATED_HOST: - return 0; - case CDROM_GET_CAPABILITY: - /* Keep this until we remove the printk below. udev sends it - * and we do not want to spam dmesg about it. CD-ROMs do - * not have partitions, so we get here only for disks. - */ - return -ENOIOCTLCMD; - default: - break; - } - if (capable(CAP_SYS_RAWIO)) return 0; - /* In particular, rule out all resets and host-specific ioctls. */ - printk_ratelimited(KERN_WARNING - "%s: sending ioctl %x to a partition!\n", current->comm, cmd); - return -ENOIOCTLCMD; } EXPORT_SYMBOL(scsi_verify_blk_ioctl); -- cgit v1.2.3-58-ga151 From b7435db8b8d11df94453708295c2ea5b09caff5f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 10 Jan 2018 11:34:27 -0800 Subject: blk-mq: Add locking annotations to hctx_lock() and hctx_unlock() This patch avoids that sparse reports the following: block/blk-mq.c:637:33: warning: context imbalance in 'hctx_unlock' - unexpected unlock block/blk-mq.c:642:9: warning: context imbalance in 'hctx_lock' - wrong count at exit Signed-off-by: Bart Van Assche Cc: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 435a5a0d441f..8000ba6db07d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -559,6 +559,7 @@ static void __blk_mq_complete_request(struct request *rq) } static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) + __releases(hctx->srcu) { if (!(hctx->flags & BLK_MQ_F_BLOCKING)) rcu_read_unlock(); @@ -567,6 +568,7 @@ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) } static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) + __acquires(hctx->srcu) { if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { /* shut up gcc false positive */ -- cgit v1.2.3-58-ga151 From c27d53fb445f2d93a1918c3dd7344770b0cd865b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 10 Jan 2018 13:41:21 -0800 Subject: blk-mq: Reduce the number of if-statements in blk_mq_mark_tag_wait() This patch does not change any functionality but makes the blk_mq_mark_tag_wait() code slightly easier to read. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq.c | 69 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8000ba6db07d..afccd0848d6f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1104,58 +1104,59 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, struct request *rq) { struct blk_mq_hw_ctx *this_hctx = *hctx; - bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; struct sbq_wait_state *ws; wait_queue_entry_t *wait; bool ret; - if (!shared_tags) { + if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); - } else { - wait = &this_hctx->dispatch_wait; - if (!list_empty_careful(&wait->entry)) - return false; - spin_lock(&this_hctx->lock); - if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); - return false; - } + /* + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. + * + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. + */ + return blk_mq_get_driver_tag(rq, hctx, false); + } + + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; } + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq, hctx, false); - - if (!shared_tags) { - /* - * Don't clear RESTART here, someone else could have set it. - * At most this will cost an extra queue run. - */ - return ret; - } else { - if (!ret) { - spin_unlock(&this_hctx->lock); - return false; - } - - /* - * We got a tag, remove ourselves from the wait queue to ensure - * someone else gets the wakeup. - */ - spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); + if (!ret) { spin_unlock(&this_hctx->lock); - return true; + return false; } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); + + return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, -- cgit v1.2.3-58-ga151 From 20e4d813931961fe26d26a1e98b3aba6ec00b130 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 12 Jan 2018 10:53:06 +0800 Subject: blk-mq: simplify queue mapping & schedule with each possisble CPU The previous patch assigns interrupt vectors to all possible CPUs, so now hctx can be mapped to possible CPUs, this patch applies this fact to simplify queue mapping & schedule so that we don't need to handle CPU hotplug for dealing with physical CPU plug & unplug. With this simplication, we can work well on physical CPU plug & unplug, which is a normal use case for VM at least. Make sure we allocate blk_mq_ctx structures for all possible CPUs, and set hctx->numa_node for possible CPUs which are mapped to this hctx. And only choose the online CPUs for schedule. Reported-by: Christian Borntraeger Tested-by: Christian Borntraeger Tested-by: Stefan Haberland Cc: Thomas Gleixner Signed-off-by: Christoph Hellwig Fixes: 4b855ad37194 ("blk-mq: Create hctx for each present CPU") (merged the three into one because any single one may not work, and fix selecting online CPUs for scheduler) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index afccd0848d6f..b3b2003b7429 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_queue_exit(q); return ERR_PTR(-EXDEV); } - cpu = cpumask_first(alloc_data.hctx->cpumask); + cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); rq = blk_mq_get_request(q, NULL, op, &alloc_data); @@ -1324,9 +1324,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) if (--hctx->next_cpu_batch <= 0) { int next_cpu; - next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, + cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(hctx->cpumask); + next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; @@ -2220,16 +2221,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - /* If the cpu isn't present, the cpu is mapped to first hctx */ - if (!cpu_present(i)) - continue; - - hctx = blk_mq_map_queue(q, i); - /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ + hctx = blk_mq_map_queue(q, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = local_memory_node(cpu_to_node(i)); } @@ -2286,7 +2282,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * * If the cpu isn't present, the cpu is mapped to first hctx. */ - for_each_present_cpu(i) { + for_each_possible_cpu(i) { hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && @@ -2340,7 +2336,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Initialize batch roundrobin counts */ - hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu = cpumask_first_and(hctx->cpumask, + cpu_online_mask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } -- cgit v1.2.3-58-ga151 From 85ba3effc5a0836b9195f2010684062ad230cc23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Jan 2018 14:47:57 -0700 Subject: blk-mq: add missing RQF_STARTED to debugfs Looking at debug output, we see: ./000000009ddfa913/requeue_list:000000009646711c {.op=READ, .state=idle, gen=0x1 18, abort_gen=0x0, .cmd_flags=, .rq_flags=SORTED|1|SOFTBARRIER|IO_STAT, complete =0, .tag=-1, .internal_tag=217} Note the '1' between SORTED and SOFTBARRIER - that's because no name as defined for RQF_STARTED. Fixed that. Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 19db3f583bf1..fa31ceaa8de6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -271,6 +271,7 @@ static const char *const cmd_flag_name[] = { #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { RQF_NAME(SORTED), + RQF_NAME(STARTED), RQF_NAME(QUEUED), RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), -- cgit v1.2.3-58-ga151 From bf9ae8c5325c0070d0ec81a849bba8d156f65993 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Jan 2018 10:40:45 -0700 Subject: blk-mq: fix bad clear of RQF_MQ_INFLIGHT in blk_mq_ct_ctx_init() A previous commit moved the clearing of rq->rq_flags later, but we may have already set RQF_MQ_INFLIGHT when that happens. Ensure that we correctly initialize rq->rq_flags to the right value. This is based on an original fix by Ming, just rewritten to not require a conditional. Fixes: 7c3fb70f0341 ("block: rearrange a few request fields for better cache layout") Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b3b2003b7429..c8f62e6be6b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -269,13 +269,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; + req_flags_t rq_flags = 0; if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; } else { if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; + rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } rq->tag = tag; @@ -286,7 +287,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; - rq->rq_flags = 0; + rq->rq_flags = rq_flags; rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) -- cgit v1.2.3-58-ga151 From bc8d062c36e3525e81ea8237ff0ab3264c2317b6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 9 Jan 2018 20:46:49 -0500 Subject: block: only bdi_unregister() in del_gendisk() if !GENHD_FL_HIDDEN device_add_disk() will only call bdi_register_owner() if !GENHD_FL_HIDDEN, so it follows that del_gendisk() should only call bdi_unregister() if !GENHD_FL_HIDDEN. Found with code inspection. bdi_unregister() won't do any harm if bdi_register_owner() wasn't used but best to avoid the unnecessary call to bdi_unregister(). Fixes: 8ddcd65325 ("block: introduce GENHD_FL_HIDDEN") Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 96a66f671720..00620e01e043 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -725,7 +725,8 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - bdi_unregister(disk->queue->backing_dev_info); + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); } else { WARN_ON(1); -- cgit v1.2.3-58-ga151 From 667257e8b2988c0183ba23e2bcd6900e87961606 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 11 Jan 2018 14:11:01 -0500 Subject: block: properly protect the 'queue' kobj in blk_unregister_queue The original commit e9a823fb34a8b (block: fix warning when I/O elevator is changed as request_queue is being removed) is pretty conflated. "conflated" because the resource being protected by q->sysfs_lock isn't the queue_flags (it is the 'queue' kobj). q->sysfs_lock serializes __elevator_change() (via elv_iosched_store) from racing with blk_unregister_queue(): 1) By holding q->sysfs_lock first, __elevator_change() can complete before a racing blk_unregister_queue(). 2) Conversely, __elevator_change() is testing for QUEUE_FLAG_REGISTERED in case elv_iosched_store() loses the race with blk_unregister_queue(), it needs a way to know the 'queue' kobj isn't there. Expand the scope of blk_unregister_queue()'s q->sysfs_lock use so it is held until after the 'queue' kobj is removed. To do so blk_mq_unregister_dev() must not also take q->sysfs_lock. So rename __blk_mq_unregister_dev() to blk_mq_unregister_dev(). Also, blk_unregister_queue() should use q->queue_lock to protect against any concurrent writes to q->queue_flags -- even though chances are the queue is being cleaned up so no concurrent writes are likely. Fixes: e9a823fb34a8b ("block: fix warning when I/O elevator is changed as request_queue is being removed") Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 9 +-------- block/blk-sysfs.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 79969c3c234f..a54b4b070f1c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) q->mq_sysfs_init_done = false; } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ - mutex_lock(&q->sysfs_lock); - __blk_mq_unregister_dev(dev, q); - mutex_unlock(&q->sysfs_lock); -} - void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 870484eaed1f..9272452ff456 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -929,12 +929,17 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + /* + * Protect against the 'queue' kobj being accessed + * while/after it is removed. + */ mutex_lock(&q->sysfs_lock); - queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); - mutex_unlock(&q->sysfs_lock); - wbt_exit(q); + spin_lock_irq(q->queue_lock); + queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + spin_unlock_irq(q->queue_lock); + wbt_exit(q); if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); @@ -946,4 +951,6 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); kobject_put(&disk_to_dev(disk)->kobj); + + mutex_unlock(&q->sysfs_lock); } -- cgit v1.2.3-58-ga151 From fa70d2e2c4a0a54ced98260c6a176cc94c876d27 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Jan 2018 22:01:13 -0500 Subject: block: allow gendisk's request_queue registration to be deferred Since I can remember DM has forced the block layer to allow the allocation and initialization of the request_queue to be distinct operations. Reason for this is block/genhd.c:add_disk() has requires that the request_queue (and associated bdi) be tied to the gendisk before add_disk() is called -- because add_disk() also deals with exposing the request_queue via blk_register_queue(). DM's dynamic creation of arbitrary device types (and associated request_queue types) requires the DM device's gendisk be available so that DM table loads can establish a master/slave relationship with subordinate devices that are referenced by loaded DM tables -- using bd_link_disk_holder(). But until these DM tables, and their associated subordinate devices, are known DM cannot know what type of request_queue it needs -- nor what its queue_limits should be. This chicken and egg scenario has created all manner of problems for DM and, at times, the block layer. Summary of changes: - Add device_add_disk_no_queue_reg() and add_disk_no_queue_reg() variant that drivers may use to add a disk without also calling blk_register_queue(). Driver must call blk_register_queue() once its request_queue is fully initialized. - Return early from blk_unregister_queue() if QUEUE_FLAG_REGISTERED is not set. It won't be set if driver used add_disk_no_queue_reg() but driver encounters an error and must del_gendisk() before calling blk_register_queue(). - Export blk_register_queue(). These changes allow DM to use add_disk_no_queue_reg() to anchor its gendisk as the "master" for master/slave relationships DM must establish with subordinate devices referenced in DM tables that get loaded. Once all "slave" devices for a DM device are known its request_queue can be properly initialized and then advertised via sysfs -- important improvement being that no request_queue resource initialization performed by blk_register_queue() is missed for DM devices anymore. Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 5 +++++ block/genhd.c | 20 +++++++++++++++++--- include/linux/genhd.h | 5 +++++ 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9272452ff456..4a6a40ffd78e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -921,6 +921,7 @@ unlock: mutex_unlock(&q->sysfs_lock); return ret; } +EXPORT_SYMBOL_GPL(blk_register_queue); void blk_unregister_queue(struct gendisk *disk) { @@ -929,6 +930,10 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + /* Return early if disk->queue was never registered. */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return; + /* * Protect against the 'queue' kobj being accessed * while/after it is removed. diff --git a/block/genhd.c b/block/genhd.c index 00620e01e043..88a53c188cb7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -629,16 +629,18 @@ exit: } /** - * device_add_disk - add partitioning information to kernel list + * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -void device_add_disk(struct device *parent, struct gendisk *disk) +static void __device_add_disk(struct device *parent, struct gendisk *disk, + bool register_queue) { dev_t devt; int retval; @@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) exact_match, exact_lock, disk); } register_disk(parent, disk); - blk_register_queue(disk); + if (register_queue) + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_add_events(disk); blk_integrity_add(disk); } + +void device_add_disk(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, true); +} EXPORT_SYMBOL(device_add_disk); +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5144ebe046c9..5e3531027b51 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk); } +extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); +static inline void add_disk_no_queue_reg(struct gendisk *disk) +{ + device_add_disk_no_queue_reg(NULL, disk); +} extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); -- cgit v1.2.3-58-ga151 From 69e0927b3774563c19b5fb32e91d75edc147fb62 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Sun, 14 Jan 2018 17:00:48 -0500 Subject: blk_rq_map_user_iov: fix error override During stress tests by syzkaller on the sg driver the block layer infrequently returns EINVAL. Closer inspection shows the block layer was trying to return ENOMEM (which is much more understandable) but for some reason overroad that useful error. Patch below does not show this (unchanged) line: ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); That 'ret' was being overridden when that function failed. Signed-off-by: Douglas Gilbert Signed-off-by: Jens Axboe --- block/blk-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index b21f8e86f120..209eb3b45c54 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -114,7 +114,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; - int ret; + int ret = -EINVAL; if (!iter_is_iovec(iter)) goto fail; @@ -143,7 +143,7 @@ unmap_rq: __blk_rq_unmap_user(bio); fail: rq->bio = NULL; - return -EINVAL; + return ret; } EXPORT_SYMBOL(blk_rq_map_user_iov); -- cgit v1.2.3-58-ga151 From 7bed45954b95601230ebf387d3e4e20e4a3cc025 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 18 Jan 2018 00:41:51 +0800 Subject: blk-mq: make sure hctx->next_cpu is set correctly When hctx->next_cpu is set from possible online CPUs, there is one race in which hctx->next_cpu may be set as >= nr_cpu_ids, and finally break workqueue. The race can be triggered in the following two sitations: 1) when one CPU is becoming DEAD, blk_mq_hctx_notify_dead() is called to dispatch requests from the DEAD cpu context, but at that time, this DEAD CPU has been cleared from 'cpu_online_mask', so all CPUs in hctx->cpumask may become offline, and cause hctx->next_cpu set a bad value. 2) blk_mq_delay_run_hw_queue() is called from CPU B, and found the queue should be run on the other CPU A, then CPU A may become offline at the same time and all CPUs in hctx->cpumask become offline. This patch deals with this issue by re-selecting next CPU, and making sure it is set correctly. Cc: Christian Borntraeger Cc: Stefan Haberland Cc: Christoph Hellwig Cc: Thomas Gleixner Reported-by: "jianchao.wang" Tested-by: "jianchao.wang" Fixes: 20e4d81393 ("blk-mq: simplify queue mapping & schedule with each possisble CPU") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c8f62e6be6b6..3bd41f1066ee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1319,21 +1319,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { + bool tried = false; + if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { int next_cpu; - +select_cpu: next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); - hctx->next_cpu = next_cpu; + /* + * No online CPU is found, so have to make sure hctx->next_cpu + * is set correctly for not breaking workqueue. + */ + if (next_cpu >= nr_cpu_ids) + hctx->next_cpu = cpumask_first(hctx->cpumask); + else + hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(hctx->next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; + } return hctx->next_cpu; } -- cgit v1.2.3-58-ga151 From 7df938fbc4ee641e70e05002ac67c24b19e86e74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 18 Jan 2018 00:41:52 +0800 Subject: blk-mq: turn WARN_ON in __blk_mq_run_hw_queue into printk We know this WARN_ON is harmless and in reality it may be trigged, so convert it to printk() and dump_stack() to avoid to confusing people. Also add comment about two releated races here. Cc: Christian Borntraeger Cc: Stefan Haberland Cc: Christoph Hellwig Cc: Thomas Gleixner Cc: "jianchao.wang" Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3bd41f1066ee..ec429be05729 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1294,9 +1294,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * We should be running this queue from one of the CPUs that * are mapped to it. + * + * There are at least two related races now between setting + * hctx->next_cpu from blk_mq_hctx_next_cpu() and running + * __blk_mq_run_hw_queue(): + * + * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), + * but later it becomes online, then this warning is harmless + * at all + * + * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), + * but later it becomes offline, then the warning can't be + * triggered, and we depend on blk-mq timeout handler to + * handle dispatched requests to this hctx */ - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)) { + printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", + raw_smp_processor_id(), + cpumask_empty(hctx->cpumask) ? "inactive": "active"); + dump_stack(); + } /* * We can't run the queue inline with ints disabled. Ensure that -- cgit v1.2.3-58-ga151 From 0f95549c0ea1e8075ae049202088b2c6a0cb40ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 17 Jan 2018 11:25:56 -0500 Subject: blk-mq: factor out a few helpers from __blk_mq_try_issue_directly No functional change. Just makes code flow more logically. In following commit, __blk_mq_try_issue_directly() will be used to return the dispatch result (blk_status_t) to DM. DM needs this information to improve IO merging. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 79 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ec429be05729..ddc46f215bfa 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1738,9 +1738,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie) +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1749,6 +1749,43 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, }; blk_qc_t new_cookie; blk_status_t ret; + + new_cookie = request_to_qc_t(hctx, rq); + + /* + * For OK queue, we are done. For error, caller may kill it. + * Any other error (busy), just add it to our list as we + * previously would have done. + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + *cookie = new_cookie; + break; + case BLK_STS_RESOURCE: + __blk_mq_requeue_request(rq); + break; + default: + *cookie = BLK_QC_T_NONE; + break; + } + + return ret; +} + +static void __blk_mq_fallback_to_insert(struct blk_mq_hw_ctx *hctx, + struct request *rq, + bool run_queue) +{ + blk_mq_sched_insert_request(rq, false, run_queue, false, + hctx->flags & BLK_MQ_F_BLOCKING); +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie) +{ + struct request_queue *q = rq->q; bool run_queue = true; /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -1768,41 +1805,29 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - new_cookie = request_to_qc_t(hctx, rq); - - /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done - */ - ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_STS_OK: - *cookie = new_cookie; - return; - case BLK_STS_RESOURCE: - __blk_mq_requeue_request(rq); - goto insert; - default: - *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, ret); - return; - } - + return __blk_mq_issue_directly(hctx, rq, cookie); insert: - blk_mq_sched_insert_request(rq, false, run_queue, false, - hctx->flags & BLK_MQ_F_BLOCKING); + __blk_mq_fallback_to_insert(hctx, rq, run_queue); + + return BLK_STS_OK; } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { + blk_status_t ret; int srcu_idx; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); hctx_lock(hctx, &srcu_idx); - __blk_mq_try_issue_directly(hctx, rq, cookie); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie); + if (ret == BLK_STS_RESOURCE) + __blk_mq_fallback_to_insert(hctx, rq, true); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + hctx_unlock(hctx, srcu_idx); } -- cgit v1.2.3-58-ga151 From 396eaf21ee17c476e8f66249fb1f4a39003d0ab4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Jan 2018 11:25:57 -0500 Subject: blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback blk_insert_cloned_request() is called in the fast path of a dm-rq driver (e.g. blk-mq request-based DM mpath). blk_insert_cloned_request() uses blk_mq_request_bypass_insert() to directly append the request to the blk-mq hctx->dispatch_list of the underlying queue. 1) This way isn't efficient enough because the hctx spinlock is always used. 2) With blk_insert_cloned_request(), we completely bypass underlying queue's elevator and depend on the upper-level dm-rq driver's elevator to schedule IO. But dm-rq currently can't get the underlying queue's dispatch feedback at all. Without knowing whether a request was issued or not (e.g. due to underlying queue being busy) the dm-rq elevator will not be able to provide effective IO merging (as a side-effect of dm-rq currently blindly destaging a request from its elevator only to requeue it after a delay, which kills any opportunity for merging). This obviously causes very bad sequential IO performance. Fix this by updating blk_insert_cloned_request() to use blk_mq_request_direct_issue(). blk_mq_request_direct_issue() allows a request to be issued directly to the underlying queue and returns the dispatch feedback (blk_status_t). If blk_mq_request_direct_issue() returns BLK_SYS_RESOURCE the dm-rq driver will now use DM_MAPIO_REQUEUE to _not_ destage the request. Whereby preserving the opportunity to merge IO. With this, request-based DM's blk-mq sequential IO performance is vastly improved (as much as 3X in mpath/virtio-scsi testing). Signed-off-by: Ming Lei [blk-mq.c changes heavily influenced by Ming Lei's initial solution, but they were refactored to make them less fragile and easier to read/review] Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- block/blk-mq.c | 37 +++++++++++++++++++++++++++++-------- block/blk-mq.h | 3 +++ drivers/md/dm-rq.c | 19 ++++++++++++++++--- 4 files changed, 49 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 7ba607527487..55f338020254 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2500,8 +2500,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq, true); - return BLK_STS_OK; + return blk_mq_request_direct_issue(rq); } spin_lock_irqsave(q->queue_lock, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index ddc46f215bfa..e383a20809f4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1775,15 +1775,19 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, static void __blk_mq_fallback_to_insert(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool run_queue) + bool run_queue, bool bypass_insert) { - blk_mq_sched_insert_request(rq, false, run_queue, false, - hctx->flags & BLK_MQ_F_BLOCKING); + if (!bypass_insert) + blk_mq_sched_insert_request(rq, false, run_queue, false, + hctx->flags & BLK_MQ_F_BLOCKING); + else + blk_mq_request_bypass_insert(rq, run_queue); } static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie) + blk_qc_t *cookie, + bool bypass_insert) { struct request_queue *q = rq->q; bool run_queue = true; @@ -1794,7 +1798,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - if (q->elevator) + if (q->elevator && !bypass_insert) goto insert; if (!blk_mq_get_driver_tag(rq, NULL, false)) @@ -1807,7 +1811,9 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, return __blk_mq_issue_directly(hctx, rq, cookie); insert: - __blk_mq_fallback_to_insert(hctx, rq, run_queue); + __blk_mq_fallback_to_insert(hctx, rq, run_queue, bypass_insert); + if (bypass_insert) + return BLK_STS_RESOURCE; return BLK_STS_OK; } @@ -1822,15 +1828,30 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, cookie); + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); if (ret == BLK_STS_RESOURCE) - __blk_mq_fallback_to_insert(hctx, rq, true); + __blk_mq_fallback_to_insert(hctx, rq, true, false); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); hctx_unlock(hctx, srcu_idx); } +blk_status_t blk_mq_request_direct_issue(struct request *rq) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + hctx_unlock(hctx, srcu_idx); + + return ret; +} + static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); diff --git a/block/blk-mq.h b/block/blk-mq.h index 8591a54d989b..e3ebc93646ca 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -74,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_direct_issue(struct request *rq); + /* * CPU -> queue mappings */ diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index c28357f5cb0e..b7d175e94a02 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error) dm_complete_request(tio->orig, error); } -static void dm_dispatch_clone_request(struct request *clone, struct request *rq) +static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq) { blk_status_t r; @@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); - if (r) + if (r != BLK_STS_OK && r != BLK_STS_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); + return r; } static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, @@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio) struct mapped_device *md = tio->md; struct request *rq = tio->orig; struct request *clone = NULL; + blk_status_t ret; r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); +check_again: switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio) /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); - dm_dispatch_clone_request(clone, rq); + ret = dm_dispatch_clone_request(clone, rq); + if (ret == BLK_STS_RESOURCE) { + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone); + tio->clone = NULL; + if (!rq->q->mq_ops) + r = DM_MAPIO_DELAY_REQUEUE; + else + r = DM_MAPIO_REQUEUE; + goto check_again; + } break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ -- cgit v1.2.3-58-ga151 From 9e97d2951a7e6ee6e204f87f6bda4ff754a8cede Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 17 Jan 2018 11:25:58 -0500 Subject: blk-mq-sched: remove unused 'can_block' arg from blk_mq_sched_insert_request After commit: 923218f6166a ("blk-mq: don't allocate driver tag upfront for flush rq") we no longer use the 'can_block' argument in blk_mq_sched_insert_request(). Kill it. Signed-off-by: Mike Snitzer Added actual commit message as to why it's being removed. Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- block/blk-mq-sched.c | 2 +- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 16 +++++++--------- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index 5c0f3dc446dc..f7b292f12449 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false, false); + blk_mq_sched_insert_request(rq, at_head, true, false); return; } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2ff7cf0cbf73..55c0a745b427 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -427,7 +427,7 @@ done: } void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block) + bool run_queue, bool async) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index ba1d1418a96d..1e9c9018ace1 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block); + bool run_queue, bool async); void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); diff --git a/block/blk-mq.c b/block/blk-mq.c index e383a20809f4..c418858a60ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -745,13 +745,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false, true); + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false, true); + blk_mq_sched_insert_request(rq, false, false, false); } blk_mq_run_hw_queues(q, false); @@ -1773,13 +1773,11 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static void __blk_mq_fallback_to_insert(struct blk_mq_hw_ctx *hctx, - struct request *rq, +static void __blk_mq_fallback_to_insert(struct request *rq, bool run_queue, bool bypass_insert) { if (!bypass_insert) - blk_mq_sched_insert_request(rq, false, run_queue, false, - hctx->flags & BLK_MQ_F_BLOCKING); + blk_mq_sched_insert_request(rq, false, run_queue, false); else blk_mq_request_bypass_insert(rq, run_queue); } @@ -1811,7 +1809,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, return __blk_mq_issue_directly(hctx, rq, cookie); insert: - __blk_mq_fallback_to_insert(hctx, rq, run_queue, bypass_insert); + __blk_mq_fallback_to_insert(rq, run_queue, bypass_insert); if (bypass_insert) return BLK_STS_RESOURCE; @@ -1830,7 +1828,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); if (ret == BLK_STS_RESOURCE) - __blk_mq_fallback_to_insert(hctx, rq, true, false); + __blk_mq_fallback_to_insert(rq, true, false); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); @@ -1960,7 +1958,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true, true); + blk_mq_sched_insert_request(rq, false, true, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); -- cgit v1.2.3-58-ga151 From de99a346884f019387230bc549de74456daca248 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 10:31:39 -0800 Subject: block: Fix __bio_integrity_endio() documentation Fixes: 4246a0b63bd8 ("block: add a bi_error field to struct bio") Reviewed-by: Martin K. Petersen Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 23b42e8aa03e..9cfdd6c83b5b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work) /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio - * @error: Pointer to errno * * Description: Completion for integrity I/O * -- cgit v1.2.3-58-ga151 From 23d4ee19e789ae3dce3e04bd24e3d1537965475f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 18 Jan 2018 12:06:59 +0800 Subject: blk-mq: don't dispatch request in blk_mq_request_direct_issue if queue is busy If we run into blk_mq_request_direct_issue(), when queue is busy, we don't want to dispatch this request into hctx->dispatch_list, and what we need to do is to return the queue busy info to caller, so that caller can deal with it well. Fixes: 396eaf21ee ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback") Reported-by: Laurence Oberman Reviewed-by: Mike Snitzer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c418858a60ef..74a4f237ba91 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1773,15 +1773,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static void __blk_mq_fallback_to_insert(struct request *rq, - bool run_queue, bool bypass_insert) -{ - if (!bypass_insert) - blk_mq_sched_insert_request(rq, false, run_queue, false); - else - blk_mq_request_bypass_insert(rq, run_queue); -} - static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, @@ -1790,9 +1781,16 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request_queue *q = rq->q; bool run_queue = true; - /* RCU or SRCU read lock is needed before checking quiesced flag */ + /* + * RCU or SRCU read lock is needed before checking quiesced flag. + * + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_direct_issue(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. + */ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; + bypass_insert = false; goto insert; } @@ -1809,10 +1807,10 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, return __blk_mq_issue_directly(hctx, rq, cookie); insert: - __blk_mq_fallback_to_insert(rq, run_queue, bypass_insert); if (bypass_insert) return BLK_STS_RESOURCE; + blk_mq_sched_insert_request(rq, false, run_queue, false); return BLK_STS_OK; } @@ -1828,7 +1826,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); if (ret == BLK_STS_RESOURCE) - __blk_mq_fallback_to_insert(rq, true, false); + blk_mq_sched_insert_request(rq, false, true, false); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); -- cgit v1.2.3-58-ga151 From a52a69ea89dc12e6f4572f554940789c1ab23c7a Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 13 Jan 2018 12:05:17 +0100 Subject: block, bfq: limit tags for writes and async I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Asynchronous I/O can easily starve synchronous I/O (both sync reads and sync writes), by consuming all request tags. Similarly, storms of synchronous writes, such as those that sync(2) may trigger, can starve synchronous reads. In their turn, these two problems may also cause BFQ to loose control on latency for interactive and soft real-time applications. For example, on a PLEXTOR PX-256M5S SSD, LibreOffice Writer takes 0.6 seconds to start if the device is idle, but it takes more than 45 seconds (!) if there are sequential writes in the background. This commit addresses this issue by limiting the maximum percentage of tags that asynchronous I/O requests and synchronous write requests can consume. In particular, this commit grants a higher threshold to synchronous writes, to prevent the latter from being starved by asynchronous I/O. According to the above test, LibreOffice Writer now starts in about 1.2 seconds on average, regardless of the background workload, and apart from some rare outlier. To check this improvement, run, e.g., sudo ./comm_startup_lat.sh bfq 5 5 seq 10 "lowriter --terminate_after_init" for the comm_startup_lat benchmark in the S suite [1]. [1] https://github.com/Algodev-github/S Tested-by: Oleksandr Natalenko Tested-by: Holger Hoffstätte Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++ block/bfq-iosched.h | 12 +++++++++ 2 files changed, 89 insertions(+) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f352b1677143..a7ab0cb50733 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -417,6 +417,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. + */ +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +{ + bfqd->sb_shift = bt->sb.shift; + + /* + * In-word depths if no bfq_queue is being weight-raised: + * leaving 25% of tags only for sync reads. + * + * In next formulas, right-shift the value + * (1U<sb_shift), instead of computing directly + * (1U<<(bfqd->sb_shift - something)), to be robust against + * any possible value of bfqd->sb_shift, without having to + * limit 'something'. + */ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U<sb_shift)>>1, 1U); + /* + * no more than 75% of tags for sync writes (25% extra tags + * w.r.t. async I/O, to prevent async I/O from starving sync + * writes) + */ + bfqd->word_depths[0][1] = max(((1U<sb_shift) * 3)>>2, 1U); + + /* + * In-word depths in case some bfq_queue is being weight- + * raised: leaving ~63% of tags for sync reads. This is the + * highest percentage for which, in our tests, application + * start-up times didn't suffer from any regression due to tag + * shortage. + */ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U<sb_shift) * 3)>>4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U<sb_shift) * 6)>>4, 1U); +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. + */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct sbitmap_queue *bt; + + if (op_is_sync(op) && !op_is_write(op)) + return; + + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return; + } + bt = &tags->breserved_tags; + } else + bt = &tags->bitmap_tags; + + if (unlikely(bfqd->sb_shift != bt->sb.shift)) + bfq_update_depths(bfqd, bt); + + data->shallow_depth = + bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(op), + data->shallow_depth); +} + static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, @@ -5285,6 +5361,7 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { + .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 5d47b58d5fc8..fcd941008127 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -629,6 +629,18 @@ struct bfq_data { struct bfq_io_cq *bio_bic; /* bfqq associated with the task issuing current bio for merging */ struct bfq_queue *bio_bfqq; + + /* + * Cached sbitmap shift, used to compute depth limits in + * bfq_update_depths. + */ + unsigned int sb_shift; + + /* + * Depth limits used in bfq_limit_depth (see comments on the + * function) + */ + unsigned int word_depths[2][2]; }; enum bfqq_state_flags { -- cgit v1.2.3-58-ga151 From 8a8747dc01cee6f92a52c03ba686e9f60cb46c87 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 13 Jan 2018 12:05:18 +0100 Subject: block, bfq: limit sectors served with interactive weight raising MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To maximise responsiveness, BFQ raises the weight, and performs device idling, for bfq_queues associated with processes deemed as interactive. In particular, weight raising has a maximum duration, equal to the time needed to start a large application. If a weight-raised process goes on doing I/O beyond this maximum duration, it loses weight-raising. This mechanism is evidently vulnerable to the following false positives: I/O-bound applications that will go on doing I/O for much longer than the duration of weight-raising. These applications have basically no benefit from being weight-raised at the beginning of their I/O. On the opposite end, while being weight-raised, these applications a) unjustly steal throughput to applications that may truly need low latency; b) make BFQ uselessly perform device idling; device idling results in loss of device throughput with most flash-based storage, and may increase latencies when used purposelessly. This commit adds a countermeasure to reduce both the above problems. To introduce this countermeasure, we provide the following extra piece of information (full details in the comments added by this commit). During the start-up of the large application used as a reference to set the duration of weight-raising, involved processes transfer at most ~110K sectors each. Accordingly, a process initially deemed as interactive has no right to be weight-raised any longer, once transferred 110K sectors or more. Basing on this consideration, this commit early-ends weight-raising for a bfq_queue if the latter happens to have received an amount of service at least equal to 110K sectors (actually, a little bit more, to keep a safety margin). I/O-bound applications that reach a high throughput, such as file copy, get to this threshold much before the allowed weight-raising period finishes. Thus this early ending of weight-raising reduces the amount of time during which these applications cause the problems described above. Tested-by: Oleksandr Natalenko Tested-by: Holger Hoffstätte Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++------ block/bfq-iosched.h | 5 ++++ block/bfq-wf2q.c | 3 ++ 3 files changed, 80 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a7ab0cb50733..47e6ec7427c4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -209,15 +209,17 @@ static struct kmem_cache *bfq_pool; * interactive applications automatically, using the following formula: * duration = (R / r) * T, where r is the peak rate of the device, and * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. + * In particular, R is the peak rate of the reference device (see + * below), and T is a reference time: given the systems that are + * likely to be installed on the reference device according to its + * speed class, T is about the maximum time needed, under BFQ and + * while reading two files in parallel, to load typical large + * applications on these systems (see the comments on + * max_service_from_wr below, for more details on how T is obtained). + * In practice, the slower/faster the device at hand is, the more/less + * it takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to + * interactive applications. * * BFQ uses four different reference pairs (R, T), depending on: * . whether the device is rotational or non-rotational; @@ -254,6 +256,60 @@ static int T_slow[2]; static int T_fast[2]; static int device_speed_thresh[2]; +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. + * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. + */ +static const unsigned long max_service_from_wr = 120000; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -1352,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, if (old_wr_coeff == 1 && wr_or_deserves_wr) { /* start a weight-raising period */ if (interactive) { + bfqq->service_from_wr = 0; bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { @@ -3665,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->entity.prio_changed = 1; } } + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && + bfqq->service_from_wr > max_service_from_wr) { + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + } } /* * To improve latency (for this or other queues), immediately diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index fcd941008127..350c39ae2896 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -337,6 +337,11 @@ struct bfq_queue { * last transition from idle to backlogged. */ unsigned long service_from_backlogged; + /* + * Cumulative service received from the @bfq_queue since its + * last transition to weight-raised state. + */ + unsigned long service_from_wr; /* * Value of wr start time when switching to soft rt diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4456eda34e48..4498c43245e2 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -838,6 +838,9 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) if (!bfqq->service_from_backlogged) bfqq->first_IO_time = jiffies; + if (bfqq->wr_coeff > 1) + bfqq->service_from_wr += served; + bfqq->service_from_backlogged += served; for_each_entity(entity) { st = bfq_entity_service_tree(entity); -- cgit v1.2.3-58-ga151 From 83d016ac86428dbca8a62d3e4fdc29e3ea39e535 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2018 11:48:08 -0800 Subject: block: Unexport elv_register_queue() and elv_unregister_queue() These two functions are only called from inside the block layer so unexport them. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk.h | 3 +++ block/elevator.c | 2 -- include/linux/elevator.h | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index c84ae0e21ebd..b1771851ed92 100644 --- a/block/blk.h +++ b/block/blk.h @@ -162,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +int elv_register_queue(struct request_queue *q); +void elv_unregister_queue(struct request_queue *q); + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); #ifdef CONFIG_FAIL_IO_TIMEOUT diff --git a/block/elevator.c b/block/elevator.c index 138faeb08a7c..4f00b53cd5fd 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -886,7 +886,6 @@ int elv_register_queue(struct request_queue *q) } return error; } -EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) { @@ -900,7 +899,6 @@ void elv_unregister_queue(struct request_queue *q) wbt_enable_default(q); } } -EXPORT_SYMBOL(elv_unregister_queue); int elv_register(struct elevator_type *e) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3d794b3dc532..6d9e230dffd2 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -extern int elv_register_queue(struct request_queue *q); -extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, -- cgit v1.2.3-58-ga151 From 14a23498ba97683c6790b1bcd8b2cdfe9ad99797 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2018 11:48:09 -0800 Subject: block: Document scheduler modification locking requirements This patch does not change any functionality. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/elevator.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 4f00b53cd5fd..e87e9b43aba0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q) struct elevator_queue *e = q->elevator; int error; + lockdep_assert_held(&q->sysfs_lock); + error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -889,6 +891,8 @@ int elv_register_queue(struct request_queue *q) void elv_unregister_queue(struct request_queue *q) { + lockdep_assert_held(&q->sysfs_lock); + if (q) { struct elevator_queue *e = q->elevator; @@ -965,6 +969,8 @@ static int elevator_switch_mq(struct request_queue *q, { int ret; + lockdep_assert_held(&q->sysfs_lock); + blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; + lockdep_assert_held(&q->sysfs_lock); + if (q->mq_ops) return elevator_switch_mq(q, new_e); -- cgit v1.2.3-58-ga151 From 2c2086afc2b8b974fac32cb028e73dc27bfae442 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2018 11:48:10 -0800 Subject: block: Protect less code with sysfs_lock in blk_{un,}register_queue() The __blk_mq_register_dev(), blk_mq_unregister_dev(), elv_register_queue() and elv_unregister_queue() calls need to be protected with sysfs_lock but other code in these functions not. Hence protect only this code with sysfs_lock. This patch fixes a locking inversion issue in blk_unregister_queue() and also in an error path of blk_register_queue(): it is not allowed to hold sysfs_lock around the kobject_del(&q->kobj) call. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4a6a40ffd78e..cbea895a5547 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +/** + * blk_register_queue - register a block layer queue with sysfs + * @disk: Disk of which the request queue should be registered with sysfs. + */ int blk_register_queue(struct gendisk *disk) { int ret; @@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk) if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q); if (ret) { + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); - goto unlock; + return ret; } } ret = 0; @@ -923,6 +928,13 @@ unlock: } EXPORT_SYMBOL_GPL(blk_register_queue); +/** + * blk_unregister_queue - counterpart of blk_register_queue() + * @disk: Disk of which the request queue should be unregistered from sysfs. + * + * Note: the caller is responsible for guaranteeing that this function is called + * after blk_register_queue() has finished. + */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -935,8 +947,9 @@ void blk_unregister_queue(struct gendisk *disk) return; /* - * Protect against the 'queue' kobj being accessed - * while/after it is removed. + * Since sysfs_remove_dir() prevents adding new directory entries + * before removal of existing entries starts, protect against + * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); @@ -944,18 +957,24 @@ void blk_unregister_queue(struct gendisk *disk) queue_flag_clear(QUEUE_FLAG_REGISTERED, q); spin_unlock_irq(q->queue_lock); - wbt_exit(q); - + /* + * Remove the sysfs attributes before unregistering the queue data + * structures that can be modified through sysfs. + */ if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); - - if (q->request_fn || (q->mq_ops && q->elevator)) - elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - kobject_put(&disk_to_dev(disk)->kobj); + wbt_exit(q); + + mutex_lock(&q->sysfs_lock); + if (q->request_fn || (q->mq_ops && q->elevator)) + elv_unregister_queue(q); mutex_unlock(&q->sysfs_lock); + + kobject_put(&disk_to_dev(disk)->kobj); } -- cgit v1.2.3-58-ga151 From 17534c6f2c065ad8e34ff6f013e5afaa90428512 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Mon, 11 Dec 2017 22:56:25 +0800 Subject: blk-throttle: export io_serviced_recursive, io_service_bytes_recursive export these two interface for cgroup-v1. Acked-by: Tejun Heo Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-throttle.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 825bc29767e6..e8428417ac0a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1510,11 +1510,21 @@ static struct cftype throtl_legacy_files[] = { .private = (unsigned long)&blkcg_policy_throtl, .seq_show = blkg_print_stat_bytes, }, + { + .name = "throttle.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes_recursive, + }, { .name = "throttle.io_serviced", .private = (unsigned long)&blkcg_policy_throtl, .seq_show = blkg_print_stat_ios, }, + { + .name = "throttle.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios_recursive, + }, { } /* terminate */ }; -- cgit v1.2.3-58-ga151 From 721c7fc701c71f693307d274d2b346a1ecd4a534 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 11 Jan 2018 14:09:11 +0100 Subject: block: fail op_is_write() requests to read-only partitions Regular block device writes go through blkdev_write_iter(), which does bdev_read_only(), while zeroout/discard/etc requests are never checked, both userspace- and kernel-triggered. Add a generic catch-all check to generic_make_request_checks() to actually enforce ioctl(BLKROSET) and set_disk_ro(), which is used by quite a few drivers for things like snapshots, read-only backing files/images, etc. Reviewed-by: Sagi Grimberg Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-core.c | 56 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 55f338020254..c21a16e9fdf9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2062,6 +2062,21 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +{ + if (part->policy && op_is_write(bio_op(bio))) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR + "generic_make_request: Trying to write " + "to read-only block-device %s (partno %d)\n", + bio_devname(bio, b), part->partno); + return true; + } + + return false; +} + /* * Remap block n of partition p to block n+start(p) of the disk. */ @@ -2070,27 +2085,28 @@ static inline int blk_partition_remap(struct bio *bio) struct hd_struct *p; int ret = 0; + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || + bio_check_ro(bio, p))) { + ret = -EIO; + goto out; + } + /* * Zone reset does not include bi_size so bio_sectors() is always 0. * Include a test for the reset op code and perform the remap if needed. */ - if (!bio->bi_partno || - (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) - return 0; + if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) + goto out; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_partno = 0; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); - } else { - printk("%s: fail for partition %d\n", __func__, bio->bi_partno); - ret = -EIO; - } - rcu_read_unlock(); + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); +out: + rcu_read_unlock(); return ret; } @@ -2149,15 +2165,19 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) goto end_io; - if (blk_partition_remap(bio)) - goto end_io; + if (!bio->bi_partno) { + if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + goto end_io; + } else { + if (blk_partition_remap(bio)) + goto end_io; + } if (bio_check_eod(bio, nr_sectors)) goto end_io; -- cgit v1.2.3-58-ga151 From a13553c777375009584741e7d9982e775c4b0744 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 11 Jan 2018 14:09:12 +0100 Subject: block: add bdev_read_only() checks to common helpers Similar to blkdev_write_iter(), return -EPERM if the partition is read-only. This covers ioctl(), fallocate() and most in-kernel users but isn't meant to be exhaustive -- everything else will be caught in generic_make_request_checks(), fail with -EIO and can be fixed later. Reviewed-by: Sagi Grimberg Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 2bc544ce3d2e..a676084d4740 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; @@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; @@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); @@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + while (nr_sects != 0) { bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); -- cgit v1.2.3-58-ga151 From b889bf66d001a46a95deef18ddbe6db84645ed24 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 21 Nov 2017 09:38:30 +0800 Subject: blk-throttle: track read and write request individually In mixed read/write workload on SSD, write latency is much lower than read. But now we only track and record read latency and then use it as threshold base for both read and write io latency accounting. As a result, write io latency will always be considered as good and bad_bio_cnt is much smaller than 20% of bio_cnt. That is to mean, the tg to be checked will be treated as idle most of the time and still let others dispatch more ios, even it is truly running under low limit and wants its low limit to be guaranteed, which is not we expected in fact. So track read and write request individually, which can bring more precise latency control for low limit idle detection. Signed-off-by: Joseph Qi Reviewed-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-throttle.c | 134 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 55 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e8428417ac0a..e136f5ef9577 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -216,9 +216,9 @@ struct throtl_data unsigned int scale; - struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets; + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets[2]; unsigned long last_calculate_time; unsigned long filtered_latency; @@ -2050,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { - struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; - int i, cpu; - unsigned long last_latency = 0; - unsigned long latency; + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; + int i, cpu, rw; + unsigned long last_latency[2] = { 0 }; + unsigned long latency[2]; if (!blk_queue_nonrot(td->queue)) return; @@ -2062,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->last_calculate_time = jiffies; memset(avg_latency, 0, sizeof(avg_latency)); - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets, cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets[rw], + cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } - if (tmp->samples >= 32) { - int samples = tmp->samples; + if (tmp->samples >= 32) { + int samples = tmp->samples; - latency = tmp->total_latency; + latency[rw] = tmp->total_latency; - tmp->total_latency = 0; - tmp->samples = 0; - latency /= samples; - if (latency == 0) - continue; - avg_latency[i].latency = latency; + tmp->total_latency = 0; + tmp->samples = 0; + latency[rw] /= samples; + if (latency[rw] == 0) + continue; + avg_latency[rw][i].latency = latency[rw]; + } } } - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[i].latency) { - if (td->avg_buckets[i].latency < last_latency) - td->avg_buckets[i].latency = last_latency; - continue; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[rw][i].latency) { + if (td->avg_buckets[rw][i].latency < last_latency[rw]) + td->avg_buckets[rw][i].latency = + last_latency[rw]; + continue; + } - if (!td->avg_buckets[i].valid) - latency = avg_latency[i].latency; - else - latency = (td->avg_buckets[i].latency * 7 + - avg_latency[i].latency) >> 3; + if (!td->avg_buckets[rw][i].valid) + latency[rw] = avg_latency[rw][i].latency; + else + latency[rw] = (td->avg_buckets[rw][i].latency * 7 + + avg_latency[rw][i].latency) >> 3; - td->avg_buckets[i].latency = max(latency, last_latency); - td->avg_buckets[i].valid = true; - last_latency = td->avg_buckets[i].latency; + td->avg_buckets[rw][i].latency = max(latency[rw], + last_latency[rw]); + td->avg_buckets[rw][i].valid = true; + last_latency[rw] = td->avg_buckets[rw][i].latency; + } } for (i = 0; i < LATENCY_BUCKET_SIZE; i++) throtl_log(&td->service_queue, - "Latency bucket %d: latency=%ld, valid=%d", i, - td->avg_buckets[i].latency, td->avg_buckets[i].valid); + "Latency bucket %d: read latency=%ld, read valid=%d, " + "write latency=%ld, write valid=%d", i, + td->avg_buckets[READ][i].latency, + td->avg_buckets[READ][i].valid, + td->avg_buckets[WRITE][i].latency, + td->avg_buckets[WRITE][i].valid); } #else static inline void throtl_update_latency_buckets(struct throtl_data *td) @@ -2258,16 +2269,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, struct latency_bucket *latency; int index; - if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + if (!td || td->limit_index != LIMIT_LOW || + !(op == REQ_OP_READ || op == REQ_OP_WRITE) || !blk_queue_nonrot(td->queue)) return; index = request_bucket_index(size); - latency = get_cpu_ptr(td->latency_buckets); + latency = get_cpu_ptr(td->latency_buckets[op]); latency[index].total_latency += time; latency[index].samples++; - put_cpu_ptr(td->latency_buckets); + put_cpu_ptr(td->latency_buckets[op]); } void blk_throtl_stat_add(struct request *rq, u64 time_ns) @@ -2286,6 +2298,7 @@ void blk_throtl_bio_endio(struct bio *bio) unsigned long finish_time; unsigned long start_time; unsigned long lat; + int rw = bio_data_dir(bio); tg = bio->bi_cg_private; if (!tg) @@ -2314,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio) bucket = request_bucket_index( blk_stat_size(&bio->bi_issue_stat)); - threshold = tg->td->avg_buckets[bucket].latency + + threshold = tg->td->avg_buckets[rw][bucket].latency + tg->latency_target; if (lat > threshold) tg->bad_bio_cnt++; @@ -2407,9 +2420,16 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets[READ]) { + kfree(td); + return -ENOMEM; + } + td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets) { + if (!td->latency_buckets[WRITE]) { + free_percpu(td->latency_buckets[READ]); kfree(td); return -ENOMEM; } @@ -2428,7 +2448,8 @@ int blk_throtl_init(struct request_queue *q) /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); if (ret) { - free_percpu(td->latency_buckets); + free_percpu(td->latency_buckets[READ]); + free_percpu(td->latency_buckets[WRITE]); kfree(td); } return ret; @@ -2439,7 +2460,8 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets); + free_percpu(q->td->latency_buckets[READ]); + free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } @@ -2457,8 +2479,10 @@ void blk_throtl_register_queue(struct request_queue *q) } else { td->throtl_slice = DFL_THROTL_SLICE_HD; td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; + td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; + } } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ -- cgit v1.2.3-58-ga151 From c77ff7fd03ddca8face268c4cf093c0edf4bcf1f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 08:58:54 -0800 Subject: blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly() Most blk-mq functions have a name that follows the pattern blk_mq_${action}. However, the function name blk_mq_request_direct_issue is an exception. Hence rename this function. This patch does not change any functionality. Reviewed-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 4 ++-- block/blk-mq.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c21a16e9fdf9..1645a1e54a37 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2520,7 +2520,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_request_direct_issue(rq); + return blk_mq_request_issue_directly(rq); } spin_lock_irqsave(q->queue_lock, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 74a4f237ba91..0fc6c95e5a29 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1785,7 +1785,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, * RCU or SRCU read lock is needed before checking quiesced flag. * * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_direct_issue(), and return BLK_STS_OK to caller, + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, * and avoid driver to try to dispatch again. */ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { @@ -1833,7 +1833,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_unlock(hctx, srcu_idx); } -blk_status_t blk_mq_request_direct_issue(struct request *rq) +blk_status_t blk_mq_request_issue_directly(struct request *rq) { blk_status_t ret; int srcu_idx; diff --git a/block/blk-mq.h b/block/blk-mq.h index e3ebc93646ca..88c558f71819 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -75,7 +75,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); /* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_direct_issue(struct request *rq); +blk_status_t blk_mq_request_issue_directly(struct request *rq); /* * CPU -> queue mappings -- cgit v1.2.3-58-ga151 From ae943d20624de0a6aac7dd0597616dce2c498029 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 08:58:55 -0800 Subject: blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays Make sure that calling blk_mq_run_hw_queue() or blk_mq_kick_requeue_list() triggers a queue run without delay even if blk_mq_delay_run_hw_queue() has been called recently and if its delay has not yet expired. Reviewed-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 0fc6c95e5a29..43e7449723e0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -785,7 +785,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_delayed_work(&q->requeue_work, 0); + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); @@ -1401,9 +1401,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, - msecs_to_jiffies(msecs)); + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, + msecs_to_jiffies(msecs)); } void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) -- cgit v1.2.3-58-ga151 From f5ced52aaa5494c1feb9f80252cb2a2cde0dace8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 08:58:56 -0800 Subject: block: Remove kblockd_schedule_delayed_work{,_on}() The previous patch removed all users of these two functions. Hence also remove the functions themselves. Reviewed-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 14 -------------- include/linux/blkdev.h | 2 -- 2 files changed, 16 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 1645a1e54a37..cdae69be68e9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3441,20 +3441,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work); - -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); - /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 71a9371c8182..afc43fb63c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1800,8 +1800,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3-58-ga151 From 475a055e62a1eb92f4358ad8a9059df973c190ac Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 20 Jan 2018 07:34:25 +0800 Subject: blk-throttle: use queue_is_rq_based use queue_is_rq_based instead of open code. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e136f5ef9577..c475f0fe3530 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2489,7 +2489,7 @@ void blk_throtl_register_queue(struct request_queue *q) td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !q->mq_ops && !q->request_fn; + td->track_bio_latency = !queue_is_rq_based(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } -- cgit v1.2.3-58-ga151 From 20d59023c5ec4426284af492808bcea1f39787ef Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 23 Jan 2018 09:10:19 -0700 Subject: block: Set BIO_TRACE_COMPLETION on new bio during split We inadvertently set it again on the source bio, but we need to set it on the new split bio instead. Fixes: fbbaf700e7b1 ("block: trace completion of all bios.") Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index fe1efbeaf4aa..77993fb4bac6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1808,7 +1808,7 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) - bio_set_flag(bio, BIO_TRACE_COMPLETION); + bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } -- cgit v1.2.3-58-ga151 From 6b136a24b05c81a24e0b648a4bd938bcd0c4f69e Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Wed, 24 Jan 2018 01:20:00 +0800 Subject: blk-mq-debugfs: don't allow write on attributes with seq_operations set Attributes that only implement .seq_ops are read-only, any write to them should be rejected. But currently kernel would crash when writing to such debugfs entries, e.g. chmod +w /sys/kernel/debug/block//requeue_list echo 0 > /sys/kernel/debug/block//requeue_list chmod -w /sys/kernel/debug/block//requeue_list Fix it by returning -EPERM in blk_mq_debugfs_write() when writing to such attributes. Cc: Ming Lei Signed-off-by: Eryu Guan Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index fa31ceaa8de6..21cbc1f071c6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -697,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; - if (!attr->write) + /* + * Attributes that only implement .seq_ops are read-only and 'attr' is + * the same with 'data' in this case. + */ + if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); -- cgit v1.2.3-58-ga151 From 3124b65dad946c20feaaf08959ee38ec27361da9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 24 Jan 2018 09:50:06 -0700 Subject: bsg: use pr_debug instead of hand crafted macros Use pr_debug instead of hand crafted macros. This way it is not needed to re-compile the kernel to enable bsg debug outputs and it's possible to selectively enable specific prints. Cc: Joe Perches Reviewed-by: Bart Van Assche Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bsg.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 452f94f1c5d4..a1bcbb6ba50b 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -32,6 +32,9 @@ #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" #define BSG_VERSION "0.4" +#define bsg_dbg(bd, fmt, ...) \ + pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) + struct bsg_device { struct request_queue *queue; spinlock_t lock; @@ -55,14 +58,6 @@ enum { #define BSG_DEFAULT_CMDS 64 #define BSG_MAX_DEVS 32768 -#undef BSG_DEBUG - -#ifdef BSG_DEBUG -#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args) -#else -#define dprintk(fmt, args...) -#endif - static DEFINE_MUTEX(bsg_mutex); static DEFINE_IDR(bsg_minor_idr); @@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) bc->bd = bd; INIT_LIST_HEAD(&bc->list); - dprintk("%s: returning free cmd %p\n", bd->name, bc); + bsg_dbg(bd, "returning free cmd %p\n", bc); return bc; out: spin_unlock_irq(&bd->lock); @@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) if (!bcd->class_dev) return ERR_PTR(-ENXIO); - dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, + bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n", + (unsigned long long) hdr->dout_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->din_xfer_len); @@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status) struct bsg_device *bd = bc->bd; unsigned long flags; - dprintk("%s: finished rq %p bc %p, bio %p\n", - bd->name, rq, bc, bc->bio); + bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", + rq, bc, bc->bio); bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); @@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, list_add_tail(&bc->list, &bd->busy_list); spin_unlock_irq(&bd->lock); - dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); + bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc); rq->end_io_data = bc; blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); @@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) } } while (1); - dprintk("%s: returning done %p\n", bd->name, bc); + bsg_dbg(bd, "returning done %p\n", bc); return bc; } @@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct scsi_request *req = scsi_req(rq); int ret = 0; - dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); + pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result); /* * fill in all the output members */ @@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) struct bsg_command *bc; int ret, tret; - dprintk("%s: entered\n", bd->name); + bsg_dbg(bd, "entered\n"); /* * wait for all commands to complete @@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ssize_t bytes_read; - dprintk("%s: read %zd bytes\n", bd->name, count); + bsg_dbg(bd, "read %zd bytes\n", count); bsg_set_block(bd, file); @@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) ssize_t bytes_written; int ret; - dprintk("%s: write %zd bytes\n", bd->name, count); + bsg_dbg(bd, "write %zd bytes\n", count); if (unlikely(uaccess_kernel())) return -EINVAL; @@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) if (!bytes_written || err_block_err(ret)) bytes_written = ret; - dprintk("%s: returning %zd\n", bd->name, bytes_written); + bsg_dbg(bd, "returning %zd\n", bytes_written); return bytes_written; } @@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd) hlist_del(&bd->dev_list); mutex_unlock(&bsg_mutex); - dprintk("%s: tearing down\n", bd->name); + bsg_dbg(bd, "tearing down\n"); /* * close can always block @@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct file *file) { struct bsg_device *bd; -#ifdef BSG_DEBUG unsigned char buf[32]; -#endif if (!blk_queue_scsi_passthrough(rq)) { WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); @@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); - dprintk("bound to <%s>, max queue %d\n", + bsg_dbg(bd, "bound to <%s>, max queue %d\n", format_dev_t(buf, inode->i_rdev), bd->max_queue); mutex_unlock(&bsg_mutex); -- cgit v1.2.3-58-ga151