Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r-- block/bfq-iosched.c | 304
1 file changed, 201 insertions(+), 103 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fec18118dc30..0c612a911696 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
/**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
- * @bfqd: the lookup key.
- * @ioc: the io_context of the process doing I/O.
* @q: the request queue.
*/
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
- struct io_context *ioc,
- struct request_queue *q)
+static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{
- if (ioc) {
- unsigned long flags;
- struct bfq_io_cq *icq;
+ struct bfq_io_cq *icq;
+ unsigned long flags;
- spin_lock_irqsave(&q->queue_lock, flags);
- icq = icq_to_bic(ioc_lookup_icq(ioc, q));
- spin_unlock_irqrestore(&q->queue_lock, flags);
+ if (!current->io_context)
+ return NULL;
- return icq;
- }
+ spin_lock_irqsave(&q->queue_lock, flags);
+ icq = icq_to_bic(ioc_lookup_icq(q));
+ spin_unlock_irqrestore(&q->queue_lock, flags);
- return NULL;
+ return icq;
}
/*
@@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
}
+#define BFQ_LIMIT_INLINE_DEPTH 16
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
+ struct bfq_entity **entities = inline_entities;
+ int depth, level;
+ int class_idx = bfqq->ioprio_class - 1;
+ struct bfq_sched_data *sched_data;
+ unsigned long wsum;
+ bool ret = false;
+
+ if (!entity->on_st_or_in_serv)
+ return false;
+
+ /* +1 for bfqq entity, root cgroup not included */
+ depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
+ if (depth > BFQ_LIMIT_INLINE_DEPTH) {
+ entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
+ if (!entities)
+ return false;
+ }
+
+ spin_lock_irq(&bfqd->lock);
+ sched_data = entity->sched_data;
+ /* Gather our ancestors as we need to traverse them in reverse order */
+ level = 0;
+ for_each_entity(entity) {
+ /*
+ * If at some level entity is not even active, allow request
+ * queueing so that BFQ knows there's work to do and activate
+ * entities.
+ */
+ if (!entity->on_st_or_in_serv)
+ goto out;
+ /* Uh, more parents than cgroup subsystem thinks? */
+ if (WARN_ON_ONCE(level >= depth))
+ break;
+ entities[level++] = entity;
+ }
+ WARN_ON_ONCE(level != depth);
+ for (level--; level >= 0; level--) {
+ entity = entities[level];
+ if (level > 0) {
+ wsum = bfq_entity_service_tree(entity)->wsum;
+ } else {
+ int i;
+ /*
+ * For bfqq itself we take into account service trees
+ * of all higher priority classes and multiply their
+ * weights so that low prio queue from higher class
+ * gets more requests than high prio queue from lower
+ * class.
+ */
+ wsum = 0;
+ for (i = 0; i <= class_idx; i++) {
+ wsum = wsum * IOPRIO_BE_NR +
+ sched_data->service_tree[i].wsum;
+ }
+ }
+ limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
+ if (entity->allocated >= limit) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "too many requests: allocated %d limit %d level %d",
+ entity->allocated, limit, level);
+ ret = true;
+ break;
+ }
+ }
+out:
+ spin_unlock_irq(&bfqd->lock);
+ if (entities != inline_entities)
+ kfree(entities);
+ return ret;
+}
+#else
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+ return false;
+}
+#endif
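To make the per-level scaling in bfqq_request_over_limit() concrete, here is a small standalone sketch (not part of the patch; the helper name and the weights are made up for illustration). Each level inherits the parent's limit scaled by its share of the service-tree weight sum, exactly as the DIV_ROUND_CLOSEST() line above does:

static unsigned int scaled_limit(unsigned int limit, unsigned int weight,
                                 unsigned int wsum)
{
        /* DIV_ROUND_CLOSEST(limit * weight, wsum) spelled out */
        return (limit * weight + wsum / 2) / wsum;
}

/*
 * Example (made-up numbers): with nr_requests = 256, a cgroup with weight 100
 * out of a sibling weight sum of 400 gets scaled_limit(256, 100, 400) = 64;
 * a queue inside it with weight 10 out of a class weight sum of 40 gets
 * scaled_limit(64, 10, 40) = 16, so that queue is reported as over limit once
 * it holds 16 or more allocated requests.
 */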
+
/*
* Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads.
* Limit depths of async I/O and sync writes so as to counter both
* problems.
+ *
+ * Also if a bfq queue or its parent cgroup consume more tags than would be
+ * appropriate for their weight, we trim the available tag depth to 1. This
+ * avoids a situation where one cgroup can starve another cgroup from tags and
+ * thus block service differentiation among cgroups. Note that because the
+ * queue / cgroup already has many requests allocated and queued, this does not
+ * significantly affect service guarantees coming from the BFQ scheduling
+ * algorithm.
*/
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
+ struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
+ struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
+ int depth;
+ unsigned limit = data->q->nr_requests;
+
+ /* Sync reads have full depth available */
+ if (op_is_sync(op) && !op_is_write(op)) {
+ depth = 0;
+ } else {
+ depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ limit = (limit * depth) >> bfqd->full_depth_shift;
+ }
- if (op_is_sync(op) && !op_is_write(op))
- return;
-
- data->shallow_depth =
- bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ /*
+ * Does queue (or any parent entity) exceed number of requests that
+ * should be available to it? Heavily limit depth so that it cannot
+ * consume more available requests and thus starve other entities.
+ */
+ if (bfqq && bfqq_request_over_limit(bfqq, limit))
+ depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(op),
- data->shallow_depth);
+ __func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
+ if (depth)
+ data->shallow_depth = depth;
}
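A quick worked example of the new depth logic (numbers assumed for illustration, not taken from the patch): with nr_requests = 256 and a 64-bit sbitmap word (bt->sb.shift = 6, so full_depth_shift = 6), an async request with no weight-raised queues gets depth = word_depths[0][0] = 32 and limit = (256 * 32) >> 6 = 128, i.e. half of the queue's requests; if bfqq_request_over_limit() then finds the queue already above its weight-proportional share of those 128, depth is clamped to 1. Sync reads leave depth at 0, and since shallow_depth is only written when depth is non-zero they keep the full tag space, as before.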
static struct bfq_queue *
@@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
+ return bfqq->ref - bfqq->entity.allocated -
+ bfqq->entity.on_st_or_in_serv -
(bfqq->weight_counter != NULL) - bfqq->stable_ref;
}
@@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
* aspect, see the comments on the choice of the queue for injection
* in bfq_select_queue().
*
- * Turning back to the detection of a waker queue, a queue Q is deemed
- * as a waker queue for bfqq if, for three consecutive times, bfqq
- * happens to become non empty right after a request of Q has been
- * completed. In this respect, even if bfqq is empty, we do not check
- * for a waker if it still has some in-flight I/O. In fact, in this
- * case bfqq is actually still being served by the drive, and may
- * receive new I/O on the completion of some of the in-flight
- * requests. In particular, on the first time, Q is tentatively set as
- * a candidate waker queue, while on the third consecutive time that Q
- * is detected, the field waker_bfqq is set to Q, to confirm that Q is
- * a waker queue for bfqq. These detection steps are performed only if
- * bfqq has a long think time, so as to make it more likely that
- * bfqq's I/O is actually being blocked by a synchronization. This
- * last filter, plus the above three-times requirement, make false
+ * Turning back to the detection of a waker queue, a queue Q is deemed as a
+ * waker queue for bfqq if, for three consecutive times, bfqq happens to become
+ * non empty right after a request of Q has been completed within given
+ * timeout. In this respect, even if bfqq is empty, we do not check for a waker
+ * if it still has some in-flight I/O. In fact, in this case bfqq is actually
+ * still being served by the drive, and may receive new I/O on the completion
+ * of some of the in-flight requests. In particular, on the first time, Q is
+ * tentatively set as a candidate waker queue, while on the third consecutive
+ * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
+ * is a waker queue for bfqq. These detection steps are performed only if bfqq
+ * has a long think time, so as to make it more likely that bfqq's I/O is
+ * actually being blocked by a synchronization. This last filter, plus the
+ * above three-times requirement and time limit for detection, make false
* positives less likely.
*
* NOTE
@@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
u64 now_ns)
{
+ char waker_name[MAX_BFQQ_NAME_LENGTH];
+
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
@@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
return;
+ /*
+ * We reset waker detection logic also if too much time has passed
+ * since the first detection. If wakeups are rare, pointless idling
+ * doesn't hurt throughput that much. The condition below makes sure
+ * we do not uselessly idle blocking waker in more than 1/64 cases.
+ */
if (bfqd->last_completed_rq_bfqq !=
- bfqq->tentative_waker_bfqq) {
+ bfqq->tentative_waker_bfqq ||
+ now_ns > bfqq->waker_detection_started +
+ 128 * (u64)bfqd->bfq_slice_idle) {
/*
* First synchronization detected with a
* candidate waker queue, or with a different
@@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->tentative_waker_bfqq =
bfqd->last_completed_rq_bfqq;
bfqq->num_waker_detections = 1;
+ bfqq->waker_detection_started = now_ns;
+ bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
+ MAX_BFQQ_NAME_LENGTH);
+ bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
} else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++;
if (bfqq->num_waker_detections == 3) {
bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
bfqq->tentative_waker_bfqq = NULL;
+ bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
+ MAX_BFQQ_NAME_LENGTH);
+ bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
/*
* If the waker queue disappears, then
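Pulling the two hunks above together, the detection flow in bfq_check_waker() now looks roughly like this (a sketch of the logic, not the full function body):

/*
 *   if (bfqd->last_completed_rq_bfqq != bfqq->tentative_waker_bfqq ||
 *       now_ns > bfqq->waker_detection_started + 128 * bfqd->bfq_slice_idle) {
 *           // start over with a new candidate
 *           bfqq->tentative_waker_bfqq = bfqd->last_completed_rq_bfqq;
 *           bfqq->num_waker_detections = 1;
 *           bfqq->waker_detection_started = now_ns;
 *   } else {
 *           bfqq->num_waker_detections++;
 *   }
 *   if (bfqq->num_waker_detections == 3)
 *           bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;  // confirmed
 *
 * With the default bfq_slice_idle of 8 ms, the three detections therefore have
 * to happen within roughly one second (128 * 8 ms) for the waker to be
 * confirmed.
 */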
@@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
* returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken.
*/
- struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
+ struct bfq_io_cq *bic = bfq_bic_lookup(q);
bool ret;
spin_lock_irq(&bfqd->lock);
@@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
}
+static void bfqq_request_allocated(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ for_each_entity(entity)
+ entity->allocated++;
+}
+
+static void bfqq_request_freed(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ for_each_entity(entity)
+ entity->allocated--;
+}
+
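These two helpers replace the plain bfqq->allocated counter: the number of allocated requests is now kept in bfq_entity and propagated up the hierarchy via for_each_entity(), which is what lets bfqq_request_over_limit() compare every level against its weight-proportional share. The bfqq_process_refs() change earlier in the patch reads the same entity.allocated field.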
/* returns true if it causes the idle timer to be disabled */
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
@@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
*/
- new_bfqq->allocated++;
- bfqq->allocated--;
+ bfqq_request_allocated(new_bfqq);
+ bfqq_request_freed(bfqq);
new_bfqq->ref++;
/*
* If the bic associated with the process
@@ -5991,48 +6127,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
-
- /*
- * Reqs with at_head or passthrough flags set are to be put
- * directly into dispatch list. Additional case for putting rq
- * directly into the dispatch queue: the only active
- * bfq_queues are bfqq and either its waker bfq_queue or one
- * of its woken bfq_queues. The rationale behind this
- * additional condition is as follows:
- * - consider a bfq_queue, say Q1, detected as a waker of
- * another bfq_queue, say Q2
- * - by definition of a waker, Q1 blocks the I/O of Q2, i.e.,
- * some I/O of Q1 needs to be completed for new I/O of Q2
- * to arrive. A notable example of waker is journald
- * - so, Q1 and Q2 are in any respect the queues of two
- * cooperating processes (or of two cooperating sets of
- * processes): the goal of Q1's I/O is doing what needs to
- * be done so that new Q2's I/O can finally be
- * issued. Therefore, if the service of Q1's I/O is delayed,
- * then Q2's I/O is delayed too. Conversely, if Q2's I/O is
- * delayed, the goal of Q1's I/O is hindered.
- * - as a consequence, if some I/O of Q1/Q2 arrives while
- * Q2/Q1 is the only queue in service, there is absolutely
- * no point in delaying the service of such an I/O. The
- * only possible result is a throughput loss
- * - so, when the above condition holds, the best option is to
- * have the new I/O dispatched as soon as possible
- * - the most effective and efficient way to attain the above
- * goal is to put the new I/O directly in the dispatch
- * list
- * - as an additional restriction, Q1 and Q2 must be the only
- * busy queues for this commit to put the I/O of Q2/Q1 in
- * the dispatch list. This is necessary, because, if also
- * other queues are waiting for service, then putting new
- * I/O directly in the dispatch list may evidently cause a
- * violation of service guarantees for the other queues
- */
- if (!bfqq ||
- (bfqq != bfqd->in_service_queue &&
- bfqd->in_service_queue != NULL &&
- bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) &&
- (bfqq->waker_bfqq == bfqd->in_service_queue ||
- bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) {
+ if (!bfqq || at_head) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
@@ -6059,7 +6154,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* merge).
*/
cmd_flags = rq->cmd_flags;
-
spin_unlock_irq(&bfqd->lock);
bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
@@ -6251,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{
- bfqq->allocated--;
-
+ bfqq_request_freed(bfqq);
bfq_put_queue(bfqq);
}
@@ -6476,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->elv.priv[1] = NULL;
}
+static void bfq_finish_request(struct request *rq)
+{
+ bfq_finish_requeue_request(rq);
+
+ if (rq->elv.icq) {
+ put_io_context(rq->elv.icq->ioc);
+ rq->elv.icq = NULL;
+ }
+}
+
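This new hook pairs with the bfq_prepare_request() change below: prepare now takes a reference on the task's io_context through ioc_find_get_icq() and stores the icq in rq->elv.icq, and bfq_finish_request() drops that reference with put_io_context() once the request is done, so the icq cannot go away while the request is in flight. The elevator_type table at the end of the patch switches .finish_request over to this wrapper, while .requeue_request keeps calling bfq_finish_requeue_request() directly, so a requeued request holds on to its icq until it finally completes.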
/*
* Removes the association between the current task and bfqq, assuming
* that bic points to the bfq iocontext of the task.
@@ -6573,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
*/
static void bfq_prepare_request(struct request *rq)
{
+ rq->elv.icq = ioc_find_get_icq(rq->q);
+
/*
* Regardless of whether we have an icq attached, we have to
* clear the scheduler pointers, as they might point to
@@ -6672,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
}
}
- bfqq->allocated++;
+ bfqq_request_allocated(bfqq);
bfqq->ref++;
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
rq, bfqq, bfqq->ref);
@@ -6835,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use.
*/
-static unsigned int bfq_update_depths(struct bfq_data *bfqd,
- struct sbitmap_queue *bt)
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
- unsigned int i, j, min_shallow = UINT_MAX;
+ unsigned int depth = 1U << bt->sb.shift;
+ bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@@ -6851,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+ bfqd->word_depths[0][0] = max(depth >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
- bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+ bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@@ -6867,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* shortage.
*/
/* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+ bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
-
- return min_shallow;
+ bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
}
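For the usual 64-bit sbitmap word (bt->sb.shift = 6, so depth = 64) the shifts above work out to: word_depths[0][0] = 32 (50% for async I/O with no weight-raising), word_depths[0][1] = 48 (75% for sync writes), word_depths[1][0] = 12 (~18% for async I/O under weight-raising) and word_depths[1][1] = 24 (~37% for sync writes). The minimum shallow depth passed to sbitmap_queue_min_shallow_depth() below drops to 1 because bfq_limit_depth() may now clamp a queue's depth all the way down to a single request.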
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ bfq_update_depths(bfqd, &tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
@@ -7300,7 +7398,7 @@ static struct elevator_type iosched_bfq_mq = {
.limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request,
- .finish_request = bfq_finish_requeue_request,
+ .finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,