diff --git a/block/blk-core.c b/block/blk-core.c
index 3c26c7f48703..8fbdac7010bb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -640,13 +640,18 @@ EXPORT_SYMBOL(blk_get_queue);
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
-	if (rq->cmd_flags & REQ_ELVPRIV)
+	if (rq->cmd_flags & REQ_ELVPRIV) {
 		elv_put_request(q, rq);
+		if (rq->elv.icq)
+			put_io_context(rq->elv.icq->ioc, q);
+	}
+
 	mempool_free(rq, q->rq.rq_pool);
 }
 
 static struct request *
-blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
+blk_alloc_request(struct request_queue *q, struct io_cq *icq,
+		  unsigned int flags, gfp_t gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -657,10 +662,15 @@ blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
 
 	rq->cmd_flags = flags | REQ_ALLOCED;
 
-	if ((flags & REQ_ELVPRIV) &&
-	    unlikely(elv_set_request(q, rq, gfp_mask))) {
-		mempool_free(rq, q->rq.rq_pool);
-		return NULL;
+	if (flags & REQ_ELVPRIV) {
+		rq->elv.icq = icq;
+		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
+			mempool_free(rq, q->rq.rq_pool);
+			return NULL;
+		}
+		/* @rq->elv.icq holds on to io_context until @rq is freed */
+		if (icq)
+			get_io_context(icq->ioc);
 	}
 
 	return rq;
@@ -772,11 +782,14 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
+	struct elevator_type *et;
 	struct io_context *ioc;
+	struct io_cq *icq = NULL;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	bool retried = false;
 	int may_queue;
 retry:
+	et = q->elevator->type;
 	ioc = current->io_context;
 
 	if (unlikely(blk_queue_dead(q)))
@@ -837,17 +850,36 @@ retry:
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
 
+	/*
+	 * Decide whether the new request will be managed by elevator. If
+	 * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
+	 * prevent the current elevator from being destroyed until the new
+	 * request is freed. This guarantees icq's won't be destroyed and
+	 * makes creating new ones safe.
+	 *
+	 * Also, lookup icq while holding queue_lock. If it doesn't exist,
+	 * it will be created after releasing queue_lock.
+	 */
 	if (blk_rq_should_init_elevator(bio) &&
 	    !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
 		rw_flags |= REQ_ELVPRIV;
 		rl->elvpriv++;
+		if (et->icq_cache && ioc)
+			icq = ioc_lookup_icq(ioc, q);
 	}
 
 	if (blk_queue_io_stat(q))
 		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
-	rq = blk_alloc_request(q, rw_flags, gfp_mask);
+	/* create icq if missing */
+	if (unlikely(et->icq_cache && !icq))
+		icq = ioc_create_icq(q, gfp_mask);
+
+	/* rqs are guaranteed to have icq on elv_set_request() if requested */
+	if (likely(!et->icq_cache || icq))
+		rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+
 	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 0910a5584d38..c04d16b02225 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -289,7 +289,6 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 		kmem_cache_free(iocontext_cachep, ioc);
 	task_unlock(task);
 }
-EXPORT_SYMBOL(create_io_context_slowpath);
 
 /**
  * get_task_io_context - get io_context of a task
@@ -362,6 +361,65 @@ out:
 }
 EXPORT_SYMBOL(ioc_lookup_icq);
 
+/**
+ * ioc_create_icq - create and link io_cq
+ * @q: request_queue of interest
+ * @gfp_mask: allocation mask
+ *
+ * Make sure io_cq linking %current->io_context and @q exists. If either
+ * io_context and/or icq don't exist, they will be created using @gfp_mask.
+ *
+ * The caller is responsible for ensuring @ioc won't go away and @q is
+ * alive and will stay alive until this function returns.
+ */
+struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
+{
+	struct elevator_type *et = q->elevator->type;
+	struct io_context *ioc;
+	struct io_cq *icq;
+
+	/* allocate stuff */
+	ioc = create_io_context(current, gfp_mask, q->node);
+	if (!ioc)
+		return NULL;
+
+	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
+				    q->node);
+	if (!icq)
+		return NULL;
+
+	if (radix_tree_preload(gfp_mask) < 0) {
+		kmem_cache_free(et->icq_cache, icq);
+		return NULL;
+	}
+
+	icq->ioc = ioc;
+	icq->q = q;
+	INIT_LIST_HEAD(&icq->q_node);
+	INIT_HLIST_NODE(&icq->ioc_node);
+
+	/* lock both q and ioc and try to link @icq */
+	spin_lock_irq(q->queue_lock);
+	spin_lock(&ioc->lock);
+
+	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
+		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
+		list_add(&icq->q_node, &q->icq_list);
+		if (et->ops.elevator_init_icq_fn)
+			et->ops.elevator_init_icq_fn(icq);
+	} else {
+		kmem_cache_free(et->icq_cache, icq);
+		icq = ioc_lookup_icq(ioc, q);
+		if (!icq)
+			printk(KERN_ERR "cfq: icq link failed!\n");
+	}
+
+	spin_unlock(&ioc->lock);
+	spin_unlock_irq(q->queue_lock);
+	radix_tree_preload_end();
+	return icq;
+}
+
 void ioc_set_changed(struct io_context *ioc, int which)
 {
 	struct io_cq *icq;
diff --git a/block/blk.h b/block/blk.h
index ed4d9bf2ab16..7efd772336de 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -200,6 +200,7 @@ static inline int blk_do_io_stat(struct request *rq)
  */
 void get_io_context(struct io_context *ioc);
 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
+struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
 void ioc_clear_queue(struct request_queue *q);
 
 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 11f49d036845..f3b44c394e6d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2935,117 +2935,6 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	return cfqq;
 }
 
-/**
- * ioc_create_icq - create and link io_cq
- * @q: request_queue of interest
- * @gfp_mask: allocation mask
- *
- * Make sure io_cq linking %current->io_context and @q exists. If either
- * io_context and/or icq don't exist, they will be created using @gfp_mask.
- *
- * The caller is responsible for ensuring @ioc won't go away and @q is
- * alive and will stay alive until this function returns.
- */
-static struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
-{
-	struct elevator_type *et = q->elevator->type;
-	struct io_context *ioc;
-	struct io_cq *icq;
-
-	/* allocate stuff */
-	ioc = create_io_context(current, gfp_mask, q->node);
-	if (!ioc)
-		return NULL;
-
-	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
-				    q->node);
-	if (!icq)
-		return NULL;
-
-	if (radix_tree_preload(gfp_mask) < 0) {
-		kmem_cache_free(et->icq_cache, icq);
-		return NULL;
-	}
-
-	icq->ioc = ioc;
-	icq->q = q;
-	INIT_LIST_HEAD(&icq->q_node);
-	INIT_HLIST_NODE(&icq->ioc_node);
-
-	/* lock both q and ioc and try to link @icq */
-	spin_lock_irq(q->queue_lock);
-	spin_lock(&ioc->lock);
-
-	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
-		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
-		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.elevator_init_icq_fn)
-			et->ops.elevator_init_icq_fn(icq);
-	} else {
-		kmem_cache_free(et->icq_cache, icq);
-		icq = ioc_lookup_icq(ioc, q);
-		if (!icq)
-			printk(KERN_ERR "cfq: icq link failed!\n");
-	}
-
-	spin_unlock(&ioc->lock);
-	spin_unlock_irq(q->queue_lock);
-	radix_tree_preload_end();
-	return icq;
-}
-
-/**
- * cfq_get_cic - acquire cfq_io_cq and bump refcnt on io_context
- * @cfqd: cfqd to setup cic for
- * @gfp_mask: allocation mask
- *
- * Return cfq_io_cq associating @cfqd and %current->io_context and
- * bump refcnt on io_context. If ioc or cic doesn't exist, they're created
- * using @gfp_mask.
- *
- * Must be called under queue_lock which may be released and re-acquired.
- * This function also may sleep depending on @gfp_mask.
- */
-static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
-	struct request_queue *q = cfqd->queue;
-	struct cfq_io_cq *cic = NULL;
-	struct io_context *ioc;
-
-	lockdep_assert_held(q->queue_lock);
-
-	while (true) {
-		/* fast path */
-		ioc = current->io_context;
-		if (likely(ioc)) {
-			cic = cfq_cic_lookup(cfqd, ioc);
-			if (likely(cic))
-				break;
-		}
-
-		/* slow path - unlock, create missing ones and retry */
-		spin_unlock_irq(q->queue_lock);
-		cic = icq_to_cic(ioc_create_icq(q, gfp_mask));
-		spin_lock_irq(q->queue_lock);
-		if (!cic)
-			return NULL;
-	}
-
-	/* bump @ioc's refcnt and handle changed notifications */
-	get_io_context(ioc);
-
-	if (unlikely(cic->icq.changed)) {
-		if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
-			changed_ioprio(cic);
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-		if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
-			changed_cgroup(cic);
-#endif
-	}
-
-	return cic;
-}
-
 static void __cfq_update_io_thinktime(struct cfq_ttime *ttime,
 	unsigned long slice_idle)
 {
@@ -3524,8 +3413,6 @@ static void cfq_put_request(struct request *rq)
 	BUG_ON(!cfqq->allocated[rw]);
 	cfqq->allocated[rw]--;
 
-	put_io_context(RQ_CIC(rq)->icq.ioc, cfqq->cfqd->queue);
-
 	/* Put down rq reference on cfqg */
 	cfq_put_cfqg(RQ_CFQG(rq));
 	rq->elv.priv[0] = NULL;
@@ -3574,7 +3461,7 @@ static int
 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_io_cq *cic;
+	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
@@ -3582,9 +3469,16 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	spin_lock_irq(q->queue_lock);
-	cic = cfq_get_cic(cfqd, gfp_mask);
-	if (!cic)
-		goto queue_fail;
+
+	/* handle changed notifications */
+	if (unlikely(cic->icq.changed)) {
+		if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
+			changed_ioprio(cic);
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+		if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
+			changed_cgroup(cic);
+#endif
+	}
 
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
@@ -3615,17 +3509,10 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
-	rq->elv.icq = &cic->icq;
 	rq->elv.priv[0] = cfqq;
 	rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
-
-queue_fail:
-	cfq_schedule_dispatch(cfqd);
-	spin_unlock_irq(q->queue_lock);
-	cfq_log(cfqd, "set_request fail");
-	return 1;
 }
 
 static void cfq_kick_queue(struct work_struct *work)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index c8f1e67a8ebe..c24f3d7fbf1e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -60,8 +60,8 @@ struct elevator_ops
 	elevator_request_list_fn *elevator_former_req_fn;
 	elevator_request_list_fn *elevator_latter_req_fn;
 
-	elevator_init_icq_fn *elevator_init_icq_fn;
-	elevator_exit_icq_fn *elevator_exit_icq_fn;
+	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
+	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */
 
 	elevator_set_req_fn *elevator_set_req_fn;
 	elevator_put_req_fn *elevator_put_req_fn;
@@ -90,8 +90,8 @@ struct elevator_type
 
 	/* fields provided by elevator implementation */
 	struct elevator_ops ops;
-	size_t icq_size;
-	size_t icq_align;
+	size_t icq_size;	/* see iocontext.h */
+	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index ac390a34c0e7..7e1371c4bccf 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -10,6 +10,65 @@ enum {
 	ICQ_CGROUP_CHANGED,
 };
 
+/*
+ * An io_cq (icq) is association between an io_context (ioc) and a
+ * request_queue (q). This is used by elevators which need to track
+ * information per ioc - q pair.
+ *
+ * Elevator can request use of icq by setting elevator_type->icq_size and
+ * ->icq_align. Both size and align must be larger than that of struct
+ * io_cq and elevator can use the tail area for private information. The
+ * recommended way to do this is defining a struct which contains io_cq as
+ * the first member followed by private members and using its size and
+ * align. For example,
+ *
+ *	struct snail_io_cq {
+ *		struct io_cq	icq;
+ *		int		poke_snail;
+ *		int		feed_snail;
+ *	};
+ *
+ *	struct elevator_type snail_elv_type {
+ *		.ops =		{ ... },
+ *		.icq_size =	sizeof(struct snail_io_cq),
+ *		.icq_align =	__alignof__(struct snail_io_cq),
+ *		...
+ *	};
+ *
+ * If icq_size is set, block core will manage icq's. All requests will
+ * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
+ * is called and be holding a reference to the associated io_context.
+ *
+ * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
+ * called and, on destruction, ->elevator_exit_icq_fn(). Both functions
+ * are called with both the associated io_context and queue locks held.
+ *
+ * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
+ * queue lock but the returned icq is valid only until the queue lock is
+ * released. Elevators can not and should not try to create or destroy
+ * icq's.
+ *
+ * As icq's are linked from both ioc and q, the locking rules are a bit
+ * complex.
+ *
+ * - ioc lock nests inside q lock.
+ *
+ * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
+ *   q->icq_list and icq->q_node by q lock.
+ *
+ * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
+ *   itself is protected by q lock. However, both the indexes and icq
+ *   itself are also RCU managed and lookup can be performed holding only
+ *   the q lock.
+ *
+ * - icq's are not reference counted. They are destroyed when either the
+ *   ioc or q goes away. Each request with icq set holds an extra
+ *   reference to ioc to ensure it stays until the request is completed.
+ *
+ * - Linking and unlinking icq's are performed while holding both ioc and q
+ *   locks. Due to the lock ordering, q exit is simple but ioc exit
+ *   requires reverse-order double lock dance.
+ */
 struct io_cq {
 	struct request_queue	*q;
 	struct io_context	*ioc;