drm/scheduler: rework entity flush, kill and fini

This was buggy: when we also had to wait for entities that were
killed, we would simply deadlock.

Instead, move all the dependency handling into the callbacks so that
it all happens asynchronously.
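
For illustration, here is a minimal userspace sketch of the re-arming
callback pattern the new drm_sched_entity_kill_jobs_cb() uses: instead of
blocking on each dependency with dma_fence_wait(), the callback
re-registers itself on the next unsignaled dependency and returns, so the
chain unwinds asynchronously. The fence type and helpers below are
hypothetical stand-ins for the dma_fence API, not kernel code:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-in for dma_fence; each fence holds one callback. */
    struct fence;
    typedef void (*fence_cb_t)(struct fence *f, void *data);

    struct fence {
    	bool signaled;
    	fence_cb_t cb;
    	void *cb_data;
    };

    /* Mirrors dma_fence_add_callback(): arming fails if already signaled. */
    static bool fence_add_callback(struct fence *f, fence_cb_t cb, void *data)
    {
    	if (f->signaled)
    		return false;
    	f->cb = cb;
    	f->cb_data = data;
    	return true;
    }

    static void fence_signal(struct fence *f)
    {
    	f->signaled = true;
    	if (f->cb)
    		f->cb(f, f->cb_data);
    }

    struct job {
    	struct fence *deps[8];	/* NULL-terminated dependency list */
    	int next_dep;
    };

    /* Re-arming callback: park on the first unsignaled dependency and
     * return; only when no dependencies remain can the job be freed. */
    static void kill_jobs_cb(struct fence *f, void *data)
    {
    	struct job *job = data;
    	struct fence *dep;

    	(void)f;	/* the signaled dependency, already consumed */
    	while ((dep = job->deps[job->next_dep])) {
    		job->next_dep++;
    		if (fence_add_callback(dep, kill_jobs_cb, job))
    			return;	/* parked; resumes when dep signals */
    	}
    	printf("all dependencies met, freeing job\n");
    }

    int main(void)
    {
    	struct fence a = {0}, b = {0};
    	struct job job = { .deps = { &a, &b } };

    	kill_jobs_cb(NULL, &job);	/* kick off the chain */
    	fence_signal(&a);		/* re-arms on b */
    	fence_signal(&b);		/* chain completes, job "freed" */
    	return 0;
    }

No thread ever blocks: each signal just advances the chain one step,
which is why the killed entity can no longer deadlock the flush path.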

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-13-christian.koenig@amd.com
Author: Christian König <christian.koenig@amd.com>
Date:   2022-09-29 14:50:56 +02:00
commit: 2fdb8a8f07
parent: 2cf9886e28


@@ -140,6 +140,73 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
 	return true;
 }
 
+static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
+{
+	struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+	drm_sched_fence_finished(job->s_fence);
+	WARN_ON(job->s_fence->parent);
+	job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question is killed. */
+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+					  struct dma_fence_cb *cb)
+{
+	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+						 finish_cb);
+	int r;
+
+	dma_fence_put(f);
+
+	/* Wait for all dependencies to avoid data corruptions */
+	while (!xa_empty(&job->dependencies)) {
+		f = xa_erase(&job->dependencies, job->last_dependency++);
+		r = dma_fence_add_callback(f, &job->finish_cb,
+					   drm_sched_entity_kill_jobs_cb);
+		if (!r)
+			return;
+
+		dma_fence_put(f);
+	}
+
+	INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
+	schedule_work(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+	struct drm_sched_job *job;
+	struct dma_fence *prev;
+
+	if (!entity->rq)
+		return;
+
+	spin_lock(&entity->rq_lock);
+	entity->stopped = true;
+	drm_sched_rq_remove_entity(entity->rq, entity);
+	spin_unlock(&entity->rq_lock);
+
+	/* Make sure this entity is not used by the scheduler at the moment */
+	wait_for_completion(&entity->entity_idle);
+
+	prev = dma_fence_get(entity->last_scheduled);
+	while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
+		struct drm_sched_fence *s_fence = job->s_fence;
+
+		dma_fence_set_error(&s_fence->finished, -ESRCH);
+
+		dma_fence_get(&s_fence->finished);
+		if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+						    drm_sched_entity_kill_jobs_cb))
+			drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+		prev = &s_fence->finished;
+	}
+	dma_fence_put(prev);
+}
+
 /**
  * drm_sched_entity_flush - Flush a context entity
  *
@@ -180,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 	/* For killed process disable any more IBs enqueue right now */
 	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
 	if ((!last_user || last_user == current->group_leader) &&
-	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
-		spin_lock(&entity->rq_lock);
-		entity->stopped = true;
-		drm_sched_rq_remove_entity(entity->rq, entity);
-		spin_unlock(&entity->rq_lock);
-	}
+	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+		drm_sched_entity_kill(entity);
 
 	return ret;
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);
 
-static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
-{
-	struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
-
-	drm_sched_fence_finished(job->s_fence);
-	WARN_ON(job->s_fence->parent);
-	job->sched->ops->free_job(job);
-}
-
-/* Signal the scheduler finished fence when the entity in question is killed. */
-static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
-					  struct dma_fence_cb *cb)
-{
-	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
-						 finish_cb);
-
-	dma_fence_put(f);
-
-	INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
-	schedule_work(&job->work);
-}
-
-static struct dma_fence *
-drm_sched_job_dependency(struct drm_sched_job *job,
-			 struct drm_sched_entity *entity)
-{
-	if (!xa_empty(&job->dependencies))
-		return xa_erase(&job->dependencies, job->last_dependency++);
-
-	if (job->sched->ops->dependency)
-		return job->sched->ops->dependency(job, entity);
-
-	return NULL;
-}
-
-static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
-{
-	struct drm_sched_job *job;
-	struct dma_fence *f;
-	int r;
-
-	while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
-		struct drm_sched_fence *s_fence = job->s_fence;
-
-		/* Wait for all dependencies to avoid data corruptions */
-		while ((f = drm_sched_job_dependency(job, entity))) {
-			dma_fence_wait(f, false);
-			dma_fence_put(f);
-		}
-
-		drm_sched_fence_scheduled(s_fence);
-		dma_fence_set_error(&s_fence->finished, -ESRCH);
-
-		/*
-		 * When pipe is hanged by older entity, new entity might
-		 * not even have chance to submit it's first job to HW
-		 * and so entity->last_scheduled will remain NULL
-		 */
-		if (!entity->last_scheduled) {
-			drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
-			continue;
-		}
-
-		dma_fence_get(entity->last_scheduled);
-		r = dma_fence_add_callback(entity->last_scheduled,
-					   &job->finish_cb,
-					   drm_sched_entity_kill_jobs_cb);
-		if (r == -ENOENT)
-			drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
-		else if (r)
-			DRM_ERROR("fence add callback failed (%d)\n", r);
-	}
-}
-
 /**
  * drm_sched_entity_fini - Destroy a context entity
  *
@@ -278,35 +267,19 @@ static void drm_sched_entity_kill_jobs(struct drm_sched_entity *entity)
  */
 void drm_sched_entity_fini(struct drm_sched_entity *entity)
 {
-	struct drm_gpu_scheduler *sched = NULL;
-
-	if (entity->rq) {
-		sched = entity->rq->sched;
-		drm_sched_rq_remove_entity(entity->rq, entity);
-	}
-
-	/* Consumption of existing IBs wasn't completed. Forcefully
-	 * remove them here.
-	 */
-	if (spsc_queue_count(&entity->job_queue)) {
-		if (sched) {
-			/*
-			 * Wait for thread to idle to make sure it isn't processing
-			 * this entity.
-			 */
-			wait_for_completion(&entity->entity_idle);
-		}
+	/*
+	 * If consumption of existing IBs wasn't completed. Forcefully remove
+	 * them here. Also makes sure that the scheduler won't touch this entity
+	 * any more.
+	 */
+	drm_sched_entity_kill(entity);
 
-		if (entity->dependency) {
-			dma_fence_remove_callback(entity->dependency,
-						  &entity->cb);
-			dma_fence_put(entity->dependency);
-			entity->dependency = NULL;
-		}
-
-		drm_sched_entity_kill_jobs(entity);
+	if (entity->dependency) {
+		dma_fence_remove_callback(entity->dependency, &entity->cb);
+		dma_fence_put(entity->dependency);
+		entity->dependency = NULL;
 	}
 
 	dma_fence_put(entity->last_scheduled);
 	entity->last_scheduled = NULL;
 }
@@ -416,6 +389,19 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
 	return false;
 }
 
+static struct dma_fence *
+drm_sched_job_dependency(struct drm_sched_job *job,
+			 struct drm_sched_entity *entity)
+{
+	if (!xa_empty(&job->dependencies))
+		return xa_erase(&job->dependencies, job->last_dependency++);
+
+	if (job->sched->ops->dependency)
+		return job->sched->ops->dependency(job, entity);
+
+	return NULL;
+}
+
 struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 {
 	struct drm_sched_job *sched_job;