linux/drivers/gpu/drm/scheduler/sched_main.c
   1/*
   2 * Copyright 2015 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 *
  22 */
  23
  24/**
  25 * DOC: Overview
  26 *
   27 * The GPU scheduler provides entities which allow userspace to push jobs
   28 * into software queues which are then scheduled on a hardware run queue.
   29 * The software queues have a priority among them. The scheduler selects entities
   30 * from the run queue in FIFO order. The scheduler provides dependency handling
   31 * between jobs. The driver is expected to provide callback functions for
   32 * backend operations to the scheduler, such as submitting a job to the hardware
   33 * run queue and returning the dependencies of a job.
  34 *
  35 * The organisation of the scheduler is the following:
  36 *
  37 * 1. Each hw run queue has one scheduler
  38 * 2. Each scheduler has multiple run queues with different priorities
   39 *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
  40 * 3. Each scheduler run queue has a queue of entities to schedule
  41 * 4. Entities themselves maintain a queue of jobs that will be scheduled on
  42 *    the hardware.
  43 *
   44 * The jobs in an entity are always scheduled in the order in which they were pushed.
  45 */
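/*
 * Illustrative driver-side sketch of the hierarchy described above: one
 * scheduler per hardware ring, and entities (software queues) bound to one or
 * more schedulers at a given priority. This would live in driver code, not in
 * this file; the example_entity_setup() name is an assumption for the sketch.
 */
#if 0	/* example only, not compiled into the scheduler */
static int example_entity_setup(struct drm_sched_entity *entity,
				struct drm_gpu_scheduler *ring_sched)
{
	struct drm_gpu_scheduler *sched_list[] = { ring_sched };

	/* Bind the entity to the ring's scheduler at NORMAL priority. */
	return drm_sched_entity_init(entity, DRM_SCHED_PRIORITY_NORMAL,
				     sched_list, ARRAY_SIZE(sched_list), NULL);
}
#endif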
  46
  47#include <linux/kthread.h>
  48#include <linux/wait.h>
  49#include <linux/sched.h>
  50#include <linux/completion.h>
  51#include <linux/dma-resv.h>
  52#include <uapi/linux/sched/types.h>
  53
  54#include <drm/drm_print.h>
  55#include <drm/drm_gem.h>
  56#include <drm/gpu_scheduler.h>
  57#include <drm/spsc_queue.h>
  58
  59#define CREATE_TRACE_POINTS
  60#include "gpu_scheduler_trace.h"
  61
  62#define to_drm_sched_job(sched_job)             \
  63                container_of((sched_job), struct drm_sched_job, queue_node)
  64
  65/**
  66 * drm_sched_rq_init - initialize a given run queue struct
  67 *
  68 * @sched: scheduler instance to associate with this run queue
  69 * @rq: scheduler run queue
  70 *
  71 * Initializes a scheduler runqueue.
  72 */
  73static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
  74                              struct drm_sched_rq *rq)
  75{
  76        spin_lock_init(&rq->lock);
  77        INIT_LIST_HEAD(&rq->entities);
  78        rq->current_entity = NULL;
  79        rq->sched = sched;
  80}
  81
  82/**
  83 * drm_sched_rq_add_entity - add an entity
  84 *
  85 * @rq: scheduler run queue
  86 * @entity: scheduler entity
  87 *
  88 * Adds a scheduler entity to the run queue.
  89 */
  90void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
  91                             struct drm_sched_entity *entity)
  92{
  93        if (!list_empty(&entity->list))
  94                return;
  95        spin_lock(&rq->lock);
  96        atomic_inc(rq->sched->score);
  97        list_add_tail(&entity->list, &rq->entities);
  98        spin_unlock(&rq->lock);
  99}
 100
 101/**
 102 * drm_sched_rq_remove_entity - remove an entity
 103 *
 104 * @rq: scheduler run queue
 105 * @entity: scheduler entity
 106 *
 107 * Removes a scheduler entity from the run queue.
 108 */
 109void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
 110                                struct drm_sched_entity *entity)
 111{
 112        if (list_empty(&entity->list))
 113                return;
 114        spin_lock(&rq->lock);
 115        atomic_dec(rq->sched->score);
 116        list_del_init(&entity->list);
 117        if (rq->current_entity == entity)
 118                rq->current_entity = NULL;
 119        spin_unlock(&rq->lock);
 120}
 121
 122/**
 123 * drm_sched_rq_select_entity - Select an entity which could provide a job to run
 124 *
 125 * @rq: scheduler run queue to check.
 126 *
 127 * Try to find a ready entity, returns NULL if none found.
 128 */
 129static struct drm_sched_entity *
 130drm_sched_rq_select_entity(struct drm_sched_rq *rq)
 131{
 132        struct drm_sched_entity *entity;
 133
 134        spin_lock(&rq->lock);
 135
 136        entity = rq->current_entity;
 137        if (entity) {
 138                list_for_each_entry_continue(entity, &rq->entities, list) {
 139                        if (drm_sched_entity_is_ready(entity)) {
 140                                rq->current_entity = entity;
 141                                reinit_completion(&entity->entity_idle);
 142                                spin_unlock(&rq->lock);
 143                                return entity;
 144                        }
 145                }
 146        }
 147
 148        list_for_each_entry(entity, &rq->entities, list) {
 149
 150                if (drm_sched_entity_is_ready(entity)) {
 151                        rq->current_entity = entity;
 152                        reinit_completion(&entity->entity_idle);
 153                        spin_unlock(&rq->lock);
 154                        return entity;
 155                }
 156
 157                if (entity == rq->current_entity)
 158                        break;
 159        }
 160
 161        spin_unlock(&rq->lock);
 162
 163        return NULL;
 164}
 165
 166/**
 167 * drm_sched_job_done - complete a job
 168 * @s_job: pointer to the job which is done
 169 *
 170 * Finish the job's fence and wake up the worker thread.
 171 */
 172static void drm_sched_job_done(struct drm_sched_job *s_job)
 173{
 174        struct drm_sched_fence *s_fence = s_job->s_fence;
 175        struct drm_gpu_scheduler *sched = s_fence->sched;
 176
 177        atomic_dec(&sched->hw_rq_count);
 178        atomic_dec(sched->score);
 179
 180        trace_drm_sched_process_job(s_fence);
 181
 182        dma_fence_get(&s_fence->finished);
 183        drm_sched_fence_finished(s_fence);
 184        dma_fence_put(&s_fence->finished);
 185        wake_up_interruptible(&sched->wake_up_worker);
 186}
 187
 188/**
 189 * drm_sched_job_done_cb - the callback for a done job
 190 * @f: fence
 191 * @cb: fence callbacks
 192 */
 193static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
 194{
 195        struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
 196
 197        drm_sched_job_done(s_job);
 198}
 199
 200/**
  201 * drm_sched_dependency_optimized - check whether a dependency can be optimized
 202 *
 203 * @fence: the dependency fence
 204 * @entity: the entity which depends on the above fence
 205 *
  206 * Returns true if the dependency can be optimized and false otherwise.
  207 */
  208bool drm_sched_dependency_optimized(struct dma_fence *fence,
 209                                    struct drm_sched_entity *entity)
 210{
 211        struct drm_gpu_scheduler *sched = entity->rq->sched;
 212        struct drm_sched_fence *s_fence;
 213
 214        if (!fence || dma_fence_is_signaled(fence))
 215                return false;
 216        if (fence->context == entity->fence_context)
 217                return true;
 218        s_fence = to_drm_sched_fence(fence);
 219        if (s_fence && s_fence->sched == sched)
 220                return true;
 221
 222        return false;
 223}
 224EXPORT_SYMBOL(drm_sched_dependency_optimized);
 225
 226/**
 227 * drm_sched_start_timeout - start timeout for reset worker
 228 *
 229 * @sched: scheduler instance to start the worker for
 230 *
 231 * Start the timeout for the given scheduler.
 232 */
 233static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 234{
 235        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 236            !list_empty(&sched->pending_list))
 237                queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
 238}
 239
 240/**
 241 * drm_sched_fault - immediately start timeout handler
 242 *
 243 * @sched: scheduler where the timeout handling should be started.
 244 *
 245 * Start timeout handling immediately when the driver detects a hardware fault.
 246 */
 247void drm_sched_fault(struct drm_gpu_scheduler *sched)
 248{
 249        mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
 250}
 251EXPORT_SYMBOL(drm_sched_fault);
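/*
 * Sketch of how a driver might use drm_sched_fault(): when its interrupt
 * handler sees a hardware fault for a ring, it kicks the timeout handler
 * immediately instead of waiting for the TDR timer to expire. The my_ring
 * structure and the fault-status helper are assumptions for the example.
 */
#if 0	/* example only */
static irqreturn_t example_fault_irq(int irq, void *data)
{
	struct my_ring *ring = data;			/* hypothetical driver ring */

	if (my_ring_read_fault_status(ring))		/* hypothetical register read */
		drm_sched_fault(&ring->sched);		/* run timedout_job() right away */

	return IRQ_HANDLED;
}
#endif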
 252
 253/**
 254 * drm_sched_suspend_timeout - Suspend scheduler job timeout
 255 *
 256 * @sched: scheduler instance for which to suspend the timeout
 257 *
 258 * Suspend the delayed work timeout for the scheduler. This is done by
 259 * modifying the delayed work timeout to an arbitrary large value,
 260 * MAX_SCHEDULE_TIMEOUT in this case.
 261 *
 262 * Returns the timeout remaining
 263 *
 264 */
 265unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 266{
 267        unsigned long sched_timeout, now = jiffies;
 268
 269        sched_timeout = sched->work_tdr.timer.expires;
 270
 271        /*
 272         * Modify the timeout to an arbitrarily large value. This also prevents
  273         * the timeout from being restarted when new submissions arrive.
 274         */
 275        if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
 276                        && time_after(sched_timeout, now))
 277                return sched_timeout - now;
 278        else
 279                return sched->timeout;
 280}
 281EXPORT_SYMBOL(drm_sched_suspend_timeout);
 282
 283/**
 284 * drm_sched_resume_timeout - Resume scheduler job timeout
 285 *
 286 * @sched: scheduler instance for which to resume the timeout
 287 * @remaining: remaining timeout
 288 *
 289 * Resume the delayed work timeout for the scheduler.
 290 */
 291void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 292                unsigned long remaining)
 293{
 294        spin_lock(&sched->job_list_lock);
 295
 296        if (list_empty(&sched->pending_list))
 297                cancel_delayed_work(&sched->work_tdr);
 298        else
 299                mod_delayed_work(sched->timeout_wq, &sched->work_tdr, remaining);
 300
 301        spin_unlock(&sched->job_list_lock);
 302}
 303EXPORT_SYMBOL(drm_sched_resume_timeout);
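/*
 * Sketch of the expected pairing of the two helpers above: a driver that
 * temporarily stops feeding the hardware (for example around a power gate)
 * parks the TDR timer and later restores whatever time was left. The
 * my_engine structure and power helpers are placeholders, not a real API.
 */
#if 0	/* example only */
static void example_power_gate(struct my_engine *engine)
{
	unsigned long remaining;

	remaining = drm_sched_suspend_timeout(&engine->sched);
	my_engine_power_gate(engine);		/* hypothetical */

	/* ... later, once the engine can make progress again ... */
	my_engine_power_ungate(engine);		/* hypothetical */
	drm_sched_resume_timeout(&engine->sched, remaining);
}
#endif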
 304
 305static void drm_sched_job_begin(struct drm_sched_job *s_job)
 306{
 307        struct drm_gpu_scheduler *sched = s_job->sched;
 308
 309        spin_lock(&sched->job_list_lock);
 310        list_add_tail(&s_job->list, &sched->pending_list);
 311        drm_sched_start_timeout(sched);
 312        spin_unlock(&sched->job_list_lock);
 313}
 314
 315static void drm_sched_job_timedout(struct work_struct *work)
 316{
 317        struct drm_gpu_scheduler *sched;
 318        struct drm_sched_job *job;
 319        enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;
 320
 321        sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 322
 323        /* Protects against concurrent deletion in drm_sched_get_cleanup_job */
 324        spin_lock(&sched->job_list_lock);
 325        job = list_first_entry_or_null(&sched->pending_list,
 326                                       struct drm_sched_job, list);
 327
 328        if (job) {
 329                /*
  330                 * Remove the bad job so it cannot be freed by a concurrent
  331                 * drm_sched_get_cleanup_job. It will be reinserted after sched->thread
  332                 * is parked, at which point it's safe.
 333                 */
 334                list_del_init(&job->list);
 335                spin_unlock(&sched->job_list_lock);
 336
 337                status = job->sched->ops->timedout_job(job);
 338
 339                /*
 340                 * Guilty job did complete and hence needs to be manually removed
 341                 * See drm_sched_stop doc.
 342                 */
 343                if (sched->free_guilty) {
 344                        job->sched->ops->free_job(job);
 345                        sched->free_guilty = false;
 346                }
 347        } else {
 348                spin_unlock(&sched->job_list_lock);
 349        }
 350
 351        if (status != DRM_GPU_SCHED_STAT_ENODEV) {
 352                spin_lock(&sched->job_list_lock);
 353                drm_sched_start_timeout(sched);
 354                spin_unlock(&sched->job_list_lock);
 355        }
 356}
 357
  358/**
  359 * drm_sched_increase_karma - Update sched_entity guilty flag
  360 *
  361 * @bad: The job guilty of time out
  362 *
  363 * Increment on every hang caused by the 'bad' job. If this exceeds the hang
  364 * limit of the scheduler then the respective sched entity is marked guilty and
  365 * jobs from it will not be scheduled further.
  366 */
 367void drm_sched_increase_karma(struct drm_sched_job *bad)
 368{
 369        drm_sched_increase_karma_ext(bad, 1);
 370}
 371EXPORT_SYMBOL(drm_sched_increase_karma);
 372
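/**
 * drm_sched_reset_karma - Reset the karma of a job
 *
 * @bad: The job whose karma to reset
 *
 * Reset the karma counter of @bad and clear the guilty flag of the entity it
 * was submitted from, so that its jobs are scheduled again.
 */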
 373void drm_sched_reset_karma(struct drm_sched_job *bad)
 374{
 375        drm_sched_increase_karma_ext(bad, 0);
 376}
 377EXPORT_SYMBOL(drm_sched_reset_karma);
 378
 379/**
 380 * drm_sched_stop - stop the scheduler
 381 *
 382 * @sched: scheduler instance
 383 * @bad: job which caused the time out
 384 *
  385 * Stop the scheduler, and also remove and free all completed jobs.
  386 * Note: the bad job will not be freed, as it might be used later, so it is the
  387 * caller's responsibility to release it manually if it is not part of the
  388 * pending list any more.
 389 *
 390 */
 391void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 392{
 393        struct drm_sched_job *s_job, *tmp;
 394
 395        kthread_park(sched->thread);
 396
 397        /*
  398         * Reinsert the bad job here - now it's safe, as
  399         * drm_sched_get_cleanup_job cannot race against us and release the
  400         * bad job at this point - we parked (waited for) any in-progress
  401         * (earlier) cleanups, and drm_sched_get_cleanup_job will not be called
  402         * again until the scheduler thread is unparked.
 403         */
 404        if (bad && bad->sched == sched)
 405                /*
 406                 * Add at the head of the queue to reflect it was the earliest
 407                 * job extracted.
 408                 */
 409                list_add(&bad->list, &sched->pending_list);
 410
 411        /*
  412         * Iterate the job list from later to earlier and either deactivate
  413         * the HW callbacks of the jobs or remove them from the pending list if
  414         * they have already signaled.
  415         * This iteration is thread-safe as the sched thread is stopped.
 416         */
 417        list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
 418                                         list) {
 419                if (s_job->s_fence->parent &&
 420                    dma_fence_remove_callback(s_job->s_fence->parent,
 421                                              &s_job->cb)) {
 422                        atomic_dec(&sched->hw_rq_count);
 423                } else {
 424                        /*
 425                         * remove job from pending_list.
 426                         * Locking here is for concurrent resume timeout
 427                         */
 428                        spin_lock(&sched->job_list_lock);
 429                        list_del_init(&s_job->list);
 430                        spin_unlock(&sched->job_list_lock);
 431
 432                        /*
 433                         * Wait for job's HW fence callback to finish using s_job
 434                         * before releasing it.
 435                         *
  436                         * The job is still alive, so the fence refcount is at least 1.
 437                         */
 438                        dma_fence_wait(&s_job->s_fence->finished, false);
 439
 440                        /*
 441                         * We must keep bad job alive for later use during
 442                         * recovery by some of the drivers but leave a hint
 443                         * that the guilty job must be released.
 444                         */
 445                        if (bad != s_job)
 446                                sched->ops->free_job(s_job);
 447                        else
 448                                sched->free_guilty = true;
 449                }
 450        }
 451
 452        /*
  453         * Stop the pending timer in flight, as we rearm it in drm_sched_start. This
  454         * avoids the pending timeout work in progress firing right away after
  455         * this TDR finishes and before the newly restarted jobs have had a
  456         * chance to complete.
 457         */
 458        cancel_delayed_work(&sched->work_tdr);
 459}
 460
 461EXPORT_SYMBOL(drm_sched_stop);
 462
 463/**
 464 * drm_sched_start - recover jobs after a reset
 465 *
 466 * @sched: scheduler instance
 467 * @full_recovery: proceed with complete sched restart
 468 *
 469 */
 470void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 471{
 472        struct drm_sched_job *s_job, *tmp;
 473        int r;
 474
 475        /*
 476         * Locking the list is not required here as the sched thread is parked
  477         * so no new jobs are being inserted or removed. Also, concurrent
  478         * GPU recoveries can't run in parallel.
 479         */
 480        list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 481                struct dma_fence *fence = s_job->s_fence->parent;
 482
 483                atomic_inc(&sched->hw_rq_count);
 484
 485                if (!full_recovery)
 486                        continue;
 487
 488                if (fence) {
 489                        r = dma_fence_add_callback(fence, &s_job->cb,
 490                                                   drm_sched_job_done_cb);
 491                        if (r == -ENOENT)
 492                                drm_sched_job_done(s_job);
 493                        else if (r)
 494                                DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 495                                          r);
 496                } else
 497                        drm_sched_job_done(s_job);
 498        }
 499
 500        if (full_recovery) {
 501                spin_lock(&sched->job_list_lock);
 502                drm_sched_start_timeout(sched);
 503                spin_unlock(&sched->job_list_lock);
 504        }
 505
 506        kthread_unpark(sched->thread);
 507}
 508EXPORT_SYMBOL(drm_sched_start);
 509
 510/**
 511 * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
 512 *
 513 * @sched: scheduler instance
 514 *
 515 */
 516void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
 517{
 518        drm_sched_resubmit_jobs_ext(sched, INT_MAX);
 519}
 520EXPORT_SYMBOL(drm_sched_resubmit_jobs);
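/*
 * Sketch of the recovery sequence the stop/resubmit/start helpers above are
 * meant for, as typically driven from a &drm_sched_backend_ops.timedout_job
 * callback. The my_hw_reset() call is a placeholder for the driver's actual
 * engine reset; the rest uses only helpers exported from this file.
 */
#if 0	/* example only */
static enum drm_gpu_sched_stat example_timedout_job(struct drm_sched_job *bad)
{
	struct drm_gpu_scheduler *sched = bad->sched;

	/* Park the scheduler thread and pull completed jobs off the pending list. */
	drm_sched_stop(sched, bad);

	/* Account the hang against the offending entity. */
	drm_sched_increase_karma(bad);

	my_hw_reset(sched);			/* hypothetical engine reset */

	/* Push everything still on the pending list back to the hardware. */
	drm_sched_resubmit_jobs(sched);

	/* Re-add the done callbacks, restart the timeout, unpark the thread. */
	drm_sched_start(sched, true);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}
#endif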
 521
 522/**
  523 * drm_sched_resubmit_jobs_ext - helper to relaunch a certain number of jobs from the pending list
 524 *
 525 * @sched: scheduler instance
  526 * @max: maximum number of jobs to relaunch
 527 *
 528 */
 529void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 530{
 531        struct drm_sched_job *s_job, *tmp;
 532        uint64_t guilty_context;
 533        bool found_guilty = false;
 534        struct dma_fence *fence;
 535        int i = 0;
 536
 537        list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
 538                struct drm_sched_fence *s_fence = s_job->s_fence;
 539
 540                if (i >= max)
 541                        break;
 542
 543                if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 544                        found_guilty = true;
 545                        guilty_context = s_job->s_fence->scheduled.context;
 546                }
 547
 548                if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
 549                        dma_fence_set_error(&s_fence->finished, -ECANCELED);
 550
 551                dma_fence_put(s_job->s_fence->parent);
 552                fence = sched->ops->run_job(s_job);
 553                i++;
 554
 555                if (IS_ERR_OR_NULL(fence)) {
 556                        if (IS_ERR(fence))
 557                                dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
 558
 559                        s_job->s_fence->parent = NULL;
 560                } else {
 561                        s_job->s_fence->parent = fence;
 562                }
 563        }
 564}
 565EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
 566
 567/**
 568 * drm_sched_job_init - init a scheduler job
 569 * @job: scheduler job to init
 570 * @entity: scheduler entity to use
 571 * @owner: job owner for debugging
 572 *
 573 * Refer to drm_sched_entity_push_job() documentation
 574 * for locking considerations.
 575 *
  576 * Drivers must make sure to call drm_sched_job_cleanup() if this function
  577 * returns successfully, even when @job is aborted before drm_sched_job_arm().
  578 *
  579 * WARNING: amdgpu abuses &drm_sched.ready to signal when the hardware
  580 * has died, which can mean that there's no valid runqueue for @entity.
  581 * This function returns -ENOENT in this case (which probably should be -EIO as
  582 * a more meaningful return value).
 583 *
 584 * Returns 0 for success, negative error code otherwise.
 585 */
 586int drm_sched_job_init(struct drm_sched_job *job,
 587                       struct drm_sched_entity *entity,
 588                       void *owner)
 589{
 590        drm_sched_entity_select_rq(entity);
 591        if (!entity->rq)
 592                return -ENOENT;
 593
 594        job->entity = entity;
 595        job->s_fence = drm_sched_fence_alloc(entity, owner);
 596        if (!job->s_fence)
 597                return -ENOMEM;
 598
 599        INIT_LIST_HEAD(&job->list);
 600
 601        xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
 602
 603        return 0;
 604}
 605EXPORT_SYMBOL(drm_sched_job_init);
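/*
 * Sketch of the init/arm/push lifecycle described above, including the error
 * unwind with drm_sched_job_cleanup() while the job is not yet armed. The
 * my_job wrapper (embedding a drm_sched_job as "base") and the
 * my_job_collect_deps() helper are assumptions for the example.
 */
#if 0	/* example only */
static int example_submit(struct my_job *mjob, struct drm_sched_entity *entity,
			  void *owner)
{
	int ret;

	ret = drm_sched_job_init(&mjob->base, entity, owner);
	if (ret)
		return ret;

	/* Gather dependencies (hypothetical helper) before committing. */
	ret = my_job_collect_deps(mjob);
	if (ret) {
		drm_sched_job_cleanup(&mjob->base);	/* not armed yet */
		return ret;
	}

	drm_sched_job_arm(&mjob->base);			/* point of no return */
	drm_sched_entity_push_job(&mjob->base);

	return 0;
}
#endif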
 606
 607/**
 608 * drm_sched_job_arm - arm a scheduler job for execution
 609 * @job: scheduler job to arm
 610 *
 611 * This arms a scheduler job for execution. Specifically it initializes the
 612 * &drm_sched_job.s_fence of @job, so that it can be attached to struct dma_resv
 613 * or other places that need to track the completion of this job.
 614 *
 615 * Refer to drm_sched_entity_push_job() documentation for locking
 616 * considerations.
 617 *
 618 * This can only be called if drm_sched_job_init() succeeded.
 619 */
 620void drm_sched_job_arm(struct drm_sched_job *job)
 621{
 622        struct drm_gpu_scheduler *sched;
 623        struct drm_sched_entity *entity = job->entity;
 624
 625        BUG_ON(!entity);
 626
 627        sched = entity->rq->sched;
 628
 629        job->sched = sched;
 630        job->s_priority = entity->rq - sched->sched_rq;
 631        job->id = atomic64_inc_return(&sched->job_id_count);
 632
 633        drm_sched_fence_init(job->s_fence, job->entity);
 634}
 635EXPORT_SYMBOL(drm_sched_job_arm);
 636
 637/**
 638 * drm_sched_job_add_dependency - adds the fence as a job dependency
 639 * @job: scheduler job to add the dependencies to
 640 * @fence: the dma_fence to add to the list of dependencies.
 641 *
 642 * Note that @fence is consumed in both the success and error cases.
 643 *
 644 * Returns:
 645 * 0 on success, or an error on failing to expand the array.
 646 */
 647int drm_sched_job_add_dependency(struct drm_sched_job *job,
 648                                 struct dma_fence *fence)
 649{
 650        struct dma_fence *entry;
 651        unsigned long index;
 652        u32 id = 0;
 653        int ret;
 654
 655        if (!fence)
 656                return 0;
 657
 658        /* Deduplicate if we already depend on a fence from the same context.
 659         * This lets the size of the array of deps scale with the number of
 660         * engines involved, rather than the number of BOs.
 661         */
 662        xa_for_each(&job->dependencies, index, entry) {
 663                if (entry->context != fence->context)
 664                        continue;
 665
 666                if (dma_fence_is_later(fence, entry)) {
 667                        dma_fence_put(entry);
 668                        xa_store(&job->dependencies, index, fence, GFP_KERNEL);
 669                } else {
 670                        dma_fence_put(fence);
 671                }
 672                return 0;
 673        }
 674
 675        ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
 676        if (ret != 0)
 677                dma_fence_put(fence);
 678
 679        return ret;
 680}
 681EXPORT_SYMBOL(drm_sched_job_add_dependency);
 682
 683/**
 684 * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
 685 *   dependencies
 686 * @job: scheduler job to add the dependencies to
 687 * @obj: the gem object to add new dependencies from.
 688 * @write: whether the job might write the object (so we need to depend on
 689 * shared fences in the reservation object).
 690 *
 691 * This should be called after drm_gem_lock_reservations() on your array of
 692 * GEM objects used in the job but before updating the reservations with your
 693 * own fences.
 694 *
 695 * Returns:
 696 * 0 on success, or an error on failing to expand the array.
 697 */
 698int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
 699                                            struct drm_gem_object *obj,
 700                                            bool write)
 701{
 702        struct dma_resv_iter cursor;
 703        struct dma_fence *fence;
 704        int ret;
 705
 706        dma_resv_for_each_fence(&cursor, obj->resv, write, fence) {
 707                /* Make sure to grab an additional ref on the added fence */
 708                dma_fence_get(fence);
 709                ret = drm_sched_job_add_dependency(job, fence);
 710                if (ret) {
 711                        dma_fence_put(fence);
 712                        return ret;
 713                }
 714        }
 715        return 0;
 716}
 717EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
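/*
 * Sketch of a submit path pulling in implicit fences from each GEM object used
 * by the job, after the reservations are locked and before the driver adds its
 * own fences. The bos/nr_bos/written parameters are assumptions for the
 * example; on failure the caller would unwind with drm_sched_job_cleanup().
 */
#if 0	/* example only */
static int example_add_deps(struct drm_sched_job *job,
			    struct drm_gem_object **bos, unsigned int nr_bos,
			    bool written)
{
	unsigned int i;
	int ret;

	for (i = 0; i < nr_bos; i++) {
		ret = drm_sched_job_add_implicit_dependencies(job, bos[i],
							      written);
		if (ret)
			return ret;
	}

	return 0;
}
#endif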
 718
 719
 720/**
 721 * drm_sched_job_cleanup - clean up scheduler job resources
 722 * @job: scheduler job to clean up
 723 *
 724 * Cleans up the resources allocated with drm_sched_job_init().
 725 *
 726 * Drivers should call this from their error unwind code if @job is aborted
 727 * before drm_sched_job_arm() is called.
 728 *
 729 * After that point of no return @job is committed to be executed by the
 730 * scheduler, and this function should be called from the
 731 * &drm_sched_backend_ops.free_job callback.
 732 */
 733void drm_sched_job_cleanup(struct drm_sched_job *job)
 734{
 735        struct dma_fence *fence;
 736        unsigned long index;
 737
 738        if (kref_read(&job->s_fence->finished.refcount)) {
 739                /* drm_sched_job_arm() has been called */
 740                dma_fence_put(&job->s_fence->finished);
 741        } else {
 742                /* aborted job before committing to run it */
 743                drm_sched_fence_free(job->s_fence);
 744        }
 745
 746        job->s_fence = NULL;
 747
 748        xa_for_each(&job->dependencies, index, fence) {
 749                dma_fence_put(fence);
 750        }
 751        xa_destroy(&job->dependencies);
 752
 753}
 754EXPORT_SYMBOL(drm_sched_job_cleanup);
 755
 756/**
 757 * drm_sched_ready - is the scheduler ready
 758 *
 759 * @sched: scheduler instance
 760 *
 761 * Return true if we can push more jobs to the hw, otherwise false.
 762 */
 763static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
 764{
 765        return atomic_read(&sched->hw_rq_count) <
 766                sched->hw_submission_limit;
 767}
 768
 769/**
 770 * drm_sched_wakeup - Wake up the scheduler when it is ready
 771 *
 772 * @sched: scheduler instance
 773 *
 774 */
 775void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
 776{
 777        if (drm_sched_ready(sched))
 778                wake_up_interruptible(&sched->wake_up_worker);
 779}
 780
 781/**
 782 * drm_sched_select_entity - Select next entity to process
 783 *
 784 * @sched: scheduler instance
 785 *
 786 * Returns the entity to process or NULL if none are found.
 787 */
 788static struct drm_sched_entity *
 789drm_sched_select_entity(struct drm_gpu_scheduler *sched)
 790{
 791        struct drm_sched_entity *entity;
 792        int i;
 793
 794        if (!drm_sched_ready(sched))
 795                return NULL;
 796
  797        /* Kernel run queue has higher priority than normal run queue */
 798        for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
 799                entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
 800                if (entity)
 801                        break;
 802        }
 803
 804        return entity;
 805}
 806
 807/**
 808 * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
 809 *
 810 * @sched: scheduler instance
 811 *
  812 * Returns the next finished job from the pending list (if there is one)
  813 * ready to be destroyed.
 814 */
 815static struct drm_sched_job *
 816drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 817{
 818        struct drm_sched_job *job, *next;
 819
 820        spin_lock(&sched->job_list_lock);
 821
 822        job = list_first_entry_or_null(&sched->pending_list,
 823                                       struct drm_sched_job, list);
 824
 825        if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
 826                /* remove job from pending_list */
 827                list_del_init(&job->list);
 828
 829                /* cancel this job's TO timer */
 830                cancel_delayed_work(&sched->work_tdr);
 831                /* make the scheduled timestamp more accurate */
 832                next = list_first_entry_or_null(&sched->pending_list,
 833                                                typeof(*next), list);
 834
 835                if (next) {
 836                        next->s_fence->scheduled.timestamp =
 837                                job->s_fence->finished.timestamp;
 838                        /* start TO timer for next job */
 839                        drm_sched_start_timeout(sched);
 840                }
 841        } else {
 842                job = NULL;
 843        }
 844
 845        spin_unlock(&sched->job_list_lock);
 846
 847        return job;
 848}
 849
 850/**
 851 * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
 852 * @sched_list: list of drm_gpu_schedulers
 853 * @num_sched_list: number of drm_gpu_schedulers in the sched_list
 854 *
  855 * Returns a pointer to the sched with the least load, or NULL if none of the
  856 * drm_gpu_schedulers are ready.
 857 */
 858struct drm_gpu_scheduler *
 859drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 860                     unsigned int num_sched_list)
 861{
 862        struct drm_gpu_scheduler *sched, *picked_sched = NULL;
 863        int i;
 864        unsigned int min_score = UINT_MAX, num_score;
 865
 866        for (i = 0; i < num_sched_list; ++i) {
 867                sched = sched_list[i];
 868
 869                if (!sched->ready) {
 870                        DRM_WARN("scheduler %s is not ready, skipping",
 871                                 sched->name);
 872                        continue;
 873                }
 874
 875                num_score = atomic_read(sched->score);
 876                if (num_score < min_score) {
 877                        min_score = num_score;
 878                        picked_sched = sched;
 879                }
 880        }
 881
 882        return picked_sched;
 883}
 884EXPORT_SYMBOL(drm_sched_pick_best);
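/*
 * Sketch of one way a driver might combine drm_sched_pick_best() with entity
 * creation: pick the least-loaded scheduler among several identical rings and
 * bind the new entity to it. The ring_scheds array is an assumption for the
 * example; alternatively, a driver can pass the whole list to the entity and
 * let the scheduler rebalance between jobs.
 */
#if 0	/* example only */
static int example_entity_on_best_ring(struct drm_sched_entity *entity,
				       struct drm_gpu_scheduler **ring_scheds,
				       unsigned int num_scheds)
{
	struct drm_gpu_scheduler *best;

	best = drm_sched_pick_best(ring_scheds, num_scheds);
	if (!best)
		return -ENODEV;		/* no scheduler is ready */

	return drm_sched_entity_init(entity, DRM_SCHED_PRIORITY_NORMAL,
				     &best, 1, NULL);
}
#endif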
 885
 886/**
 887 * drm_sched_blocked - check if the scheduler is blocked
 888 *
 889 * @sched: scheduler instance
 890 *
 891 * Returns true if blocked, otherwise false.
 892 */
 893static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
 894{
 895        if (kthread_should_park()) {
 896                kthread_parkme();
 897                return true;
 898        }
 899
 900        return false;
 901}
 902
 903/**
 904 * drm_sched_main - main scheduler thread
 905 *
 906 * @param: scheduler instance
 907 *
 908 * Returns 0.
 909 */
 910static int drm_sched_main(void *param)
 911{
 912        struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
 913        int r;
 914
 915        sched_set_fifo_low(current);
 916
 917        while (!kthread_should_stop()) {
 918                struct drm_sched_entity *entity = NULL;
 919                struct drm_sched_fence *s_fence;
 920                struct drm_sched_job *sched_job;
 921                struct dma_fence *fence;
 922                struct drm_sched_job *cleanup_job = NULL;
 923
 924                wait_event_interruptible(sched->wake_up_worker,
 925                                         (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
 926                                         (!drm_sched_blocked(sched) &&
 927                                          (entity = drm_sched_select_entity(sched))) ||
 928                                         kthread_should_stop());
 929
 930                if (cleanup_job)
 931                        sched->ops->free_job(cleanup_job);
 932
 933                if (!entity)
 934                        continue;
 935
 936                sched_job = drm_sched_entity_pop_job(entity);
 937
 938                if (!sched_job) {
 939                        complete(&entity->entity_idle);
 940                        continue;
 941                }
 942
 943                s_fence = sched_job->s_fence;
 944
 945                atomic_inc(&sched->hw_rq_count);
 946                drm_sched_job_begin(sched_job);
 947
 948                trace_drm_run_job(sched_job, entity);
 949                fence = sched->ops->run_job(sched_job);
 950                complete(&entity->entity_idle);
 951                drm_sched_fence_scheduled(s_fence);
 952
 953                if (!IS_ERR_OR_NULL(fence)) {
 954                        s_fence->parent = dma_fence_get(fence);
 955                        r = dma_fence_add_callback(fence, &sched_job->cb,
 956                                                   drm_sched_job_done_cb);
 957                        if (r == -ENOENT)
 958                                drm_sched_job_done(sched_job);
 959                        else if (r)
 960                                DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 961                                          r);
 962                        dma_fence_put(fence);
 963                } else {
 964                        if (IS_ERR(fence))
 965                                dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
 966
 967                        drm_sched_job_done(sched_job);
 968                }
 969
 970                wake_up(&sched->job_scheduled);
 971        }
 972        return 0;
 973}
 974
 975/**
 976 * drm_sched_init - Init a gpu scheduler instance
 977 *
 978 * @sched: scheduler instance
 979 * @ops: backend operations for this scheduler
 980 * @hw_submission: number of hw submissions that can be in flight
 981 * @hang_limit: number of times to allow a job to hang before dropping it
 982 * @timeout: timeout value in jiffies for the scheduler
 983 * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
 984 *              used
 985 * @score: optional score atomic shared with other schedulers
 986 * @name: name used for debugging
 987 *
 988 * Return 0 on success, otherwise error code.
 989 */
 990int drm_sched_init(struct drm_gpu_scheduler *sched,
 991                   const struct drm_sched_backend_ops *ops,
 992                   unsigned hw_submission, unsigned hang_limit,
 993                   long timeout, struct workqueue_struct *timeout_wq,
 994                   atomic_t *score, const char *name, struct device *dev)
 995{
 996        int i, ret;
 997        sched->ops = ops;
 998        sched->hw_submission_limit = hw_submission;
 999        sched->name = name;
1000        sched->timeout = timeout;
1001        sched->timeout_wq = timeout_wq ? : system_wq;
1002        sched->hang_limit = hang_limit;
1003        sched->score = score ? score : &sched->_score;
1004        sched->dev = dev;
1005        for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
1006                drm_sched_rq_init(sched, &sched->sched_rq[i]);
1007
1008        init_waitqueue_head(&sched->wake_up_worker);
1009        init_waitqueue_head(&sched->job_scheduled);
1010        INIT_LIST_HEAD(&sched->pending_list);
1011        spin_lock_init(&sched->job_list_lock);
1012        atomic_set(&sched->hw_rq_count, 0);
1013        INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
1014        atomic_set(&sched->_score, 0);
1015        atomic64_set(&sched->job_id_count, 0);
1016
 1017        /* Each scheduler will run on a separate kernel thread */
1018        sched->thread = kthread_run(drm_sched_main, sched, sched->name);
1019        if (IS_ERR(sched->thread)) {
1020                ret = PTR_ERR(sched->thread);
1021                sched->thread = NULL;
1022                DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
1023                return ret;
1024        }
1025
1026        sched->ready = true;
1027        return 0;
1028}
1029EXPORT_SYMBOL(drm_sched_init);
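/*
 * Sketch of a minimal driver-side setup for one ring: a backend ops table with
 * the three callbacks this file invokes, and a matching drm_sched_init() call.
 * The example_* callbacks, the queue depth, hang limit and 500 ms timeout are
 * assumptions for the example only.
 */
#if 0	/* example only */
static const struct drm_sched_backend_ops example_sched_ops = {
	.run_job	= example_run_job,	/* hands the job to the hw ring */
	.timedout_job	= example_timedout_job,	/* TDR, see the sketch above */
	.free_job	= example_free_job,	/* calls drm_sched_job_cleanup() */
};

static int example_ring_init(struct drm_gpu_scheduler *sched,
			     struct device *dev)
{
	return drm_sched_init(sched, &example_sched_ops,
			      64,			/* hw_submission depth */
			      3,			/* hang_limit */
			      msecs_to_jiffies(500),	/* timeout */
			      NULL,			/* timeout_wq: use system_wq */
			      NULL,			/* score: use sched->_score */
			      "example-ring", dev);
}
#endif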
1030
1031/**
1032 * drm_sched_fini - Destroy a gpu scheduler
1033 *
1034 * @sched: scheduler instance
1035 *
1036 * Tears down and cleans up the scheduler.
1037 */
1038void drm_sched_fini(struct drm_gpu_scheduler *sched)
1039{
1040        struct drm_sched_entity *s_entity;
1041        int i;
1042
1043        if (sched->thread)
1044                kthread_stop(sched->thread);
1045
1046        for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
1047                struct drm_sched_rq *rq = &sched->sched_rq[i];
1048
1049                if (!rq)
1050                        continue;
1051
1052                spin_lock(&rq->lock);
1053                list_for_each_entry(s_entity, &rq->entities, list)
1054                        /*
 1055                         * Prevents reinsertion and marks job_queue as idle;
 1056                         * it will be removed from the rq in drm_sched_entity_fini()
 1057                         * eventually.
1058                         */
1059                        s_entity->stopped = true;
1060                spin_unlock(&rq->lock);
1061
1062        }
1063
1064        /* Wakeup everyone stuck in drm_sched_entity_flush for this scheduler */
1065        wake_up_all(&sched->job_scheduled);
1066
1067        /* Confirm no work left behind accessing device structures */
1068        cancel_delayed_work_sync(&sched->work_tdr);
1069
1070        sched->ready = false;
1071}
1072EXPORT_SYMBOL(drm_sched_fini);
1073
1074/**
1075 * drm_sched_increase_karma_ext - Update sched_entity guilty flag
1076 *
1077 * @bad: The job guilty of time out
 1078 * @type: 1 to increase the karma of @bad, 0 to reset it
1079 *
1080 */
1081void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
1082{
1083        int i;
1084        struct drm_sched_entity *tmp;
1085        struct drm_sched_entity *entity;
1086        struct drm_gpu_scheduler *sched = bad->sched;
1087
 1088        /* don't change @bad's karma if it's from the KERNEL RQ,
 1089         * because sometimes a GPU hang can corrupt kernel jobs (like VM updating
 1090         * jobs), but keep in mind that kernel jobs are always considered good.
1091         */
1092        if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
1093                if (type == 0)
1094                        atomic_set(&bad->karma, 0);
1095                else if (type == 1)
1096                        atomic_inc(&bad->karma);
1097
1098                for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
1099                     i++) {
1100                        struct drm_sched_rq *rq = &sched->sched_rq[i];
1101
1102                        spin_lock(&rq->lock);
1103                        list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
1104                                if (bad->s_fence->scheduled.context ==
1105                                    entity->fence_context) {
1106                                        if (entity->guilty)
1107                                                atomic_set(entity->guilty, type);
1108                                        break;
1109                                }
1110                        }
1111                        spin_unlock(&rq->lock);
1112                        if (&entity->list != &rq->entities)
1113                                break;
1114                }
1115        }
1116}
1117EXPORT_SYMBOL(drm_sched_increase_karma_ext);
1118