linux/drivers/gpu/drm/vc4/vc4_gem.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/sched/signal.h>

#include "uapi/drm/vc4_drm.h"
#include "vc4_drv.h"
#include "vc4_regs.h"
#include "vc4_trace.h"

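/* Re-arms the hangcheck timer to fire roughly 100ms from now.  Called
 * whenever a new job is kicked off, so that vc4_hangcheck_elapsed() can
 * check whether the hardware is still making progress on it.
 */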
static void
vc4_queue_hangcheck(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);

        mod_timer(&vc4->hangcheck.timer,
                  round_jiffies_up(jiffies + msecs_to_jiffies(100)));
}

struct vc4_hang_state {
        struct drm_vc4_get_hang_state user_state;

        u32 bo_count;
        struct drm_gem_object **bo;
};

static void
vc4_free_hang_state(struct drm_device *dev, struct vc4_hang_state *state)
{
        unsigned int i;

        for (i = 0; i < state->user_state.bo_count; i++)
                drm_gem_object_unreference_unlocked(state->bo[i]);

        kfree(state);
}

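/* Returns the hang state captured by vc4_save_hang_state() to userspace
 * and drops the kernel's copy of it.
 *
 * Userspace is expected to call this twice: once with bo_count == 0 to
 * learn how many BO entries are needed, and again with a buffer large
 * enough to hold them.  A rough sketch of that calling convention (an
 * illustrative example, not code from this driver, assuming the usual
 * libdrm drmIoctl() wrapper and with error handling left out):
 *
 *      struct drm_vc4_get_hang_state get = { 0 };
 *      struct drm_vc4_get_hang_state_bo *bos;
 *
 *      drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get);
 *      bos = calloc(get.bo_count, sizeof(*bos));
 *      get.bo = (uintptr_t)bos;
 *      drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get);
 *
 * On success the saved register state and new GEM handles for the hung
 * job's BOs are available for a debugging tool to dump.
 */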
int
vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
                         struct drm_file *file_priv)
{
        struct drm_vc4_get_hang_state *get_state = data;
        struct drm_vc4_get_hang_state_bo *bo_state;
        struct vc4_hang_state *kernel_state;
        struct drm_vc4_get_hang_state *state;
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        unsigned long irqflags;
        u32 i;
        int ret = 0;

        spin_lock_irqsave(&vc4->job_lock, irqflags);
        kernel_state = vc4->hang_state;
        if (!kernel_state) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return -ENOENT;
        }
        state = &kernel_state->user_state;

        /* If the user's array isn't big enough, just return the
         * required array size.
         */
        if (get_state->bo_count < state->bo_count) {
                get_state->bo_count = state->bo_count;
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return 0;
        }

        vc4->hang_state = NULL;
        spin_unlock_irqrestore(&vc4->job_lock, irqflags);

        /* Save the user's BO pointer, so we don't stomp it with the memcpy. */
        state->bo = get_state->bo;
        memcpy(get_state, state, sizeof(*state));

        bo_state = kcalloc(state->bo_count, sizeof(*bo_state), GFP_KERNEL);
        if (!bo_state) {
                ret = -ENOMEM;
                goto err_free;
        }

        for (i = 0; i < state->bo_count; i++) {
                struct vc4_bo *vc4_bo = to_vc4_bo(kernel_state->bo[i]);
                u32 handle;

                ret = drm_gem_handle_create(file_priv, kernel_state->bo[i],
                                            &handle);

                if (ret) {
                        state->bo_count = i;
                        goto err_delete_handle;
                }
                bo_state[i].handle = handle;
                bo_state[i].paddr = vc4_bo->base.paddr;
                bo_state[i].size = vc4_bo->base.base.size;
        }

        if (copy_to_user((void __user *)(uintptr_t)get_state->bo,
                         bo_state,
                         state->bo_count * sizeof(*bo_state)))
                ret = -EFAULT;

err_delete_handle:
        if (ret) {
                for (i = 0; i < state->bo_count; i++)
                        drm_gem_handle_delete(file_priv, bo_state[i].handle);
        }

err_free:
        vc4_free_hang_state(dev, kernel_state);
        kfree(bo_state);

        return ret;
}

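/* Captures the state of the hung job(s) so it can be handed to userspace
 * later: takes a reference on every BO involved in the current bin and
 * render jobs, and snapshots the V3D control-list and debug registers.
 * The result is stashed in vc4->hang_state for vc4_get_hang_state_ioctl()
 * to return.
 */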
static void
vc4_save_hang_state(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        struct drm_vc4_get_hang_state *state;
        struct vc4_hang_state *kernel_state;
        struct vc4_exec_info *exec[2];
        struct vc4_bo *bo;
        unsigned long irqflags;
        unsigned int i, j, unref_list_count, prev_idx;

        kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
        if (!kernel_state)
                return;

        state = &kernel_state->user_state;

        spin_lock_irqsave(&vc4->job_lock, irqflags);
        exec[0] = vc4_first_bin_job(vc4);
        exec[1] = vc4_first_render_job(vc4);
        if (!exec[0] && !exec[1]) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
        }

        /* Get the bos from both binner and renderer into hang state. */
        state->bo_count = 0;
        for (i = 0; i < 2; i++) {
                if (!exec[i])
                        continue;

                unref_list_count = 0;
                list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
                        unref_list_count++;
                state->bo_count += exec[i]->bo_count + unref_list_count;
        }

        kernel_state->bo = kcalloc(state->bo_count,
                                   sizeof(*kernel_state->bo), GFP_ATOMIC);

        if (!kernel_state->bo) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
        }

        prev_idx = 0;
        for (i = 0; i < 2; i++) {
                if (!exec[i])
                        continue;

                for (j = 0; j < exec[i]->bo_count; j++) {
                        drm_gem_object_reference(&exec[i]->bo[j]->base);
                        kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
                }

                list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
                        drm_gem_object_reference(&bo->base.base);
                        kernel_state->bo[j + prev_idx] = &bo->base.base;
                        j++;
                }
                /* j now counts every entry stored for this exec, so advance
                 * the base index by exactly that much; "prev_idx = j + 1"
                 * would leave a hole and overflow the array when both a bin
                 * and a render job are present.
                 */
                prev_idx += j;
        }

        if (exec[0])
                state->start_bin = exec[0]->ct0ca;
        if (exec[1])
                state->start_render = exec[1]->ct1ca;

        spin_unlock_irqrestore(&vc4->job_lock, irqflags);

        state->ct0ca = V3D_READ(V3D_CTNCA(0));
        state->ct0ea = V3D_READ(V3D_CTNEA(0));

        state->ct1ca = V3D_READ(V3D_CTNCA(1));
        state->ct1ea = V3D_READ(V3D_CTNEA(1));

        state->ct0cs = V3D_READ(V3D_CTNCS(0));
        state->ct1cs = V3D_READ(V3D_CTNCS(1));

        state->ct0ra0 = V3D_READ(V3D_CT00RA0);
        state->ct1ra0 = V3D_READ(V3D_CT01RA0);

        state->bpca = V3D_READ(V3D_BPCA);
        state->bpcs = V3D_READ(V3D_BPCS);
        state->bpoa = V3D_READ(V3D_BPOA);
        state->bpos = V3D_READ(V3D_BPOS);

        state->vpmbase = V3D_READ(V3D_VPMBASE);

        state->dbge = V3D_READ(V3D_DBGE);
        state->fdbgo = V3D_READ(V3D_FDBGO);
        state->fdbgb = V3D_READ(V3D_FDBGB);
        state->fdbgr = V3D_READ(V3D_FDBGR);
        state->fdbgs = V3D_READ(V3D_FDBGS);
        state->errstat = V3D_READ(V3D_ERRSTAT);

        spin_lock_irqsave(&vc4->job_lock, irqflags);
        if (vc4->hang_state) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                vc4_free_hang_state(dev, kernel_state);
        } else {
                vc4->hang_state = kernel_state;
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
        }
}

static void
vc4_reset(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);

        DRM_INFO("Resetting GPU.\n");

        mutex_lock(&vc4->power_lock);
        if (vc4->power_refcount) {
                /* Power the device off and back on by dropping the
                 * reference on runtime PM.
                 */
                pm_runtime_put_sync_suspend(&vc4->v3d->pdev->dev);
                pm_runtime_get_sync(&vc4->v3d->pdev->dev);
        }
        mutex_unlock(&vc4->power_lock);

        vc4_irq_reset(dev);

        /* Rearm the hangcheck -- another job might have been waiting
         * for our hung one to get kicked off, and vc4_irq_reset()
         * would have started it.
         */
        vc4_queue_hangcheck(dev);
}

static void
vc4_reset_work(struct work_struct *work)
{
        struct vc4_dev *vc4 =
                container_of(work, struct vc4_dev, hangcheck.reset_work);

        vc4_save_hang_state(vc4->dev);

        vc4_reset(vc4->dev);
}

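/* Timer callback for the hangcheck timer.  If neither the binner nor the
 * renderer has advanced its current-address register since the last check,
 * the GPU is assumed to be hung and a reset is scheduled; resetting can
 * sleep, so it is deferred to hangcheck.reset_work.
 */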
static void
vc4_hangcheck_elapsed(unsigned long data)
{
        struct drm_device *dev = (struct drm_device *)data;
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        uint32_t ct0ca, ct1ca;
        unsigned long irqflags;
        struct vc4_exec_info *bin_exec, *render_exec;

        spin_lock_irqsave(&vc4->job_lock, irqflags);

        bin_exec = vc4_first_bin_job(vc4);
        render_exec = vc4_first_render_job(vc4);

        /* If idle, we can stop watching for hangs. */
        if (!bin_exec && !render_exec) {
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
        }

        ct0ca = V3D_READ(V3D_CTNCA(0));
        ct1ca = V3D_READ(V3D_CTNCA(1));

        /* If we've made any progress in execution, rearm the timer
         * and wait.
         */
        if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
            (render_exec && ct1ca != render_exec->last_ct1ca)) {
                if (bin_exec)
                        bin_exec->last_ct0ca = ct0ca;
                if (render_exec)
                        render_exec->last_ct1ca = ct1ca;
                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                vc4_queue_hangcheck(dev);
                return;
        }

        spin_unlock_irqrestore(&vc4->job_lock, irqflags);

        /* We've gone too long with no progress, reset.  This has to
         * be done from a work struct, since resetting can sleep and
         * this timer hook isn't allowed to.
         */
        schedule_work(&vc4->hangcheck.reset_work);
}

static void
submit_cl(struct drm_device *dev, uint32_t thread, uint32_t start, uint32_t end)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);

        /* Set the current and end address of the control list.
         * Writing the end register is what starts the job.
         */
        V3D_WRITE(V3D_CTNCA(thread), start);
        V3D_WRITE(V3D_CTNEA(thread), end);
}

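/* Blocks until vc4->finished_seqno reaches the requested seqno or the
 * timeout expires.  A timeout_ns of ~0ull means wait forever; returns
 * -ETIME on timeout and -ERESTARTSYS if an interruptible wait was broken
 * by a signal.
 */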
int
vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, uint64_t timeout_ns,
                   bool interruptible)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        int ret = 0;
        unsigned long timeout_expire;
        DEFINE_WAIT(wait);

        if (vc4->finished_seqno >= seqno)
                return 0;

        if (timeout_ns == 0)
                return -ETIME;

        timeout_expire = jiffies + nsecs_to_jiffies(timeout_ns);

        trace_vc4_wait_for_seqno_begin(dev, seqno, timeout_ns);
        for (;;) {
                prepare_to_wait(&vc4->job_wait_queue, &wait,
                                interruptible ? TASK_INTERRUPTIBLE :
                                TASK_UNINTERRUPTIBLE);

                if (interruptible && signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                if (vc4->finished_seqno >= seqno)
                        break;

                if (timeout_ns != ~0ull) {
                        if (time_after_eq(jiffies, timeout_expire)) {
                                ret = -ETIME;
                                break;
                        }
                        schedule_timeout(timeout_expire - jiffies);
                } else {
                        schedule();
                }
        }

        finish_wait(&vc4->job_wait_queue, &wait);
        trace_vc4_wait_for_seqno_end(dev, seqno);

        return ret;
}

static void
vc4_flush_caches(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);

        /* Flush the GPU L2 caches.  These caches sit on top of system
         * L3 (the 128kb or so shared with the CPU), and are
         * non-allocating in the L3.
         */
        V3D_WRITE(V3D_L2CACTL,
                  V3D_L2CACTL_L2CCLR);

        V3D_WRITE(V3D_SLCACTL,
                  VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) |
                  VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC) |
                  VC4_SET_FIELD(0xf, V3D_SLCACTL_UCC) |
                  VC4_SET_FIELD(0xf, V3D_SLCACTL_ICC));
}

/* Sets the registers for the next job to actually be executed in
 * the hardware.
 *
 * The job_lock should be held during this.
 */
void
vc4_submit_next_bin_job(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        struct vc4_exec_info *exec;

again:
        exec = vc4_first_bin_job(vc4);
        if (!exec)
                return;

        vc4_flush_caches(dev);

        /* Either put the job in the binner if it uses the binner, or
         * immediately move it to the to-be-rendered queue.
         */
        if (exec->ct0ca != exec->ct0ea) {
                submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
        } else {
                vc4_move_job_to_render(dev, exec);
                goto again;
        }
}

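/* Kicks off the render job at the head of render_job_list, if any, by
 * writing its render control list to thread 1's registers.
 */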
void
vc4_submit_next_render_job(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        struct vc4_exec_info *exec = vc4_first_render_job(vc4);

        if (!exec)
                return;

        submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
}

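/* Moves a job that has finished binning onto the render queue, and
 * starts it immediately if the render engine was idle.
 */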
void
vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        bool was_empty = list_empty(&vc4->render_job_list);

        list_move_tail(&exec->head, &vc4->render_job_list);
        if (was_empty)
                vc4_submit_next_render_job(dev);
}

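/* Stamps the job's BOs with its seqno and attaches the job's fence to
 * their reservation objects: a shared fence on each BO in the submitted
 * list, and an exclusive fence on the BOs the render job writes.
 */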
static void
vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
{
        struct vc4_bo *bo;
        unsigned i;

        for (i = 0; i < exec->bo_count; i++) {
                bo = to_vc4_bo(&exec->bo[i]->base);
                bo->seqno = seqno;

                reservation_object_add_shared_fence(bo->resv, exec->fence);
        }

        list_for_each_entry(bo, &exec->unref_list, unref_head) {
                bo->seqno = seqno;
        }

        for (i = 0; i < exec->rcl_write_bo_count; i++) {
                bo = to_vc4_bo(&exec->rcl_write_bo[i]->base);
                bo->write_seqno = seqno;

                reservation_object_add_excl_fence(bo->resv, exec->fence);
        }
}

static void
vc4_unlock_bo_reservations(struct drm_device *dev,
                           struct vc4_exec_info *exec,
                           struct ww_acquire_ctx *acquire_ctx)
{
        int i;

        for (i = 0; i < exec->bo_count; i++) {
                struct vc4_bo *bo = to_vc4_bo(&exec->bo[i]->base);

                ww_mutex_unlock(&bo->resv->lock);
        }

        ww_acquire_fini(acquire_ctx);
}

/* Takes the reservation lock on all the BOs being referenced, so that
 * at queue submit time we can update the reservations.
 *
 * We don't lock the RCL, the tile alloc/state BOs, or overflow memory
 * (all of which are on exec->unref_list).  They're entirely private
 * to vc4, so we don't attach dma-buf fences to them.
 */
static int
vc4_lock_bo_reservations(struct drm_device *dev,
                         struct vc4_exec_info *exec,
                         struct ww_acquire_ctx *acquire_ctx)
{
        int contended_lock = -1;
        int i, ret;
        struct vc4_bo *bo;

        ww_acquire_init(acquire_ctx, &reservation_ww_class);

retry:
        if (contended_lock != -1) {
                bo = to_vc4_bo(&exec->bo[contended_lock]->base);
                ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
                                                       acquire_ctx);
                if (ret) {
                        ww_acquire_done(acquire_ctx);
                        return ret;
                }
        }

        for (i = 0; i < exec->bo_count; i++) {
                if (i == contended_lock)
                        continue;

                bo = to_vc4_bo(&exec->bo[i]->base);

                ret = ww_mutex_lock_interruptible(&bo->resv->lock, acquire_ctx);
                if (ret) {
                        int j;

                        for (j = 0; j < i; j++) {
                                bo = to_vc4_bo(&exec->bo[j]->base);
                                ww_mutex_unlock(&bo->resv->lock);
                        }

                        if (contended_lock != -1 && contended_lock >= i) {
                                bo = to_vc4_bo(&exec->bo[contended_lock]->base);

                                ww_mutex_unlock(&bo->resv->lock);
                        }

                        if (ret == -EDEADLK) {
                                contended_lock = i;
                                goto retry;
                        }

                        ww_acquire_done(acquire_ctx);
                        return ret;
                }
        }

        ww_acquire_done(acquire_ctx);

        /* Reserve space for our shared (read-only) fence references,
         * before we commit the CL to the hardware.
         */
        for (i = 0; i < exec->bo_count; i++) {
                bo = to_vc4_bo(&exec->bo[i]->base);

                ret = reservation_object_reserve_shared(bo->resv);
                if (ret) {
                        vc4_unlock_bo_reservations(dev, exec, acquire_ctx);
                        return ret;
                }
        }

        return 0;
}

/* Queues a struct vc4_exec_info for execution.  If no job is
 * currently executing, then submits it.
 *
 * Unlike most GPUs, our hardware only handles one command list at a
 * time.  To queue multiple jobs at once, we'd need to edit the
 * previous command list to have a jump to the new one at the end, and
 * then bump the end address.  That's a change for a later date,
 * though.
 */
static int
vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
                 struct ww_acquire_ctx *acquire_ctx)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        uint64_t seqno;
        unsigned long irqflags;
        struct vc4_fence *fence;

        fence = kzalloc(sizeof(*fence), GFP_KERNEL);
        if (!fence)
                return -ENOMEM;
        fence->dev = dev;

        spin_lock_irqsave(&vc4->job_lock, irqflags);

        seqno = ++vc4->emit_seqno;
        exec->seqno = seqno;

        dma_fence_init(&fence->base, &vc4_fence_ops, &vc4->job_lock,
                       vc4->dma_fence_context, exec->seqno);
        fence->seqno = exec->seqno;
        exec->fence = &fence->base;

        vc4_update_bo_seqnos(exec, seqno);

        vc4_unlock_bo_reservations(dev, exec, acquire_ctx);

        list_add_tail(&exec->head, &vc4->bin_job_list);

        /* If no job was executing, kick ours off.  Otherwise, it'll
         * get started when the previous job's flush done interrupt
         * occurs.
         */
        if (vc4_first_bin_job(vc4) == exec) {
                vc4_submit_next_bin_job(dev);
                vc4_queue_hangcheck(dev);
        }

        spin_unlock_irqrestore(&vc4->job_lock, irqflags);

        return 0;
}

/**
 * vc4_cl_lookup_bos() - Sets up exec->bo[] with the GEM objects
 * referenced by the job.
 * @dev: DRM device
 * @file_priv: DRM file for this fd
 * @exec: V3D job being set up
 *
 * The command validator needs to reference BOs by their index within
 * the submitted job's BO list.  This does the validation of the job's
 * BO list and reference counting for the lifetime of the job.
 *
 * Note that this function doesn't need to unreference the BOs on
 * failure, because that will happen at vc4_complete_exec() time.
 */
static int
vc4_cl_lookup_bos(struct drm_device *dev,
                  struct drm_file *file_priv,
                  struct vc4_exec_info *exec)
{
        struct drm_vc4_submit_cl *args = exec->args;
        uint32_t *handles;
        int ret = 0;
        int i;

        exec->bo_count = args->bo_handle_count;

        if (!exec->bo_count) {
                /* See comment on bo_index for why we have to check
                 * this.
                 */
                DRM_ERROR("Rendering requires BOs to validate\n");
                return -EINVAL;
        }

        exec->bo = kvmalloc_array(exec->bo_count,
                                    sizeof(struct drm_gem_cma_object *),
                                    GFP_KERNEL | __GFP_ZERO);
        if (!exec->bo) {
                DRM_ERROR("Failed to allocate validated BO pointers\n");
                return -ENOMEM;
        }

        handles = kvmalloc_array(exec->bo_count, sizeof(uint32_t), GFP_KERNEL);
        if (!handles) {
                ret = -ENOMEM;
                DRM_ERROR("Failed to allocate incoming GEM handles\n");
                goto fail;
        }

        if (copy_from_user(handles,
                           (void __user *)(uintptr_t)args->bo_handles,
                           exec->bo_count * sizeof(uint32_t))) {
                ret = -EFAULT;
                DRM_ERROR("Failed to copy in GEM handles\n");
                goto fail;
        }

        spin_lock(&file_priv->table_lock);
        for (i = 0; i < exec->bo_count; i++) {
                struct drm_gem_object *bo = idr_find(&file_priv->object_idr,
                                                     handles[i]);
                if (!bo) {
                        DRM_ERROR("Failed to look up GEM BO %d: %d\n",
                                  i, handles[i]);
                        ret = -EINVAL;
                        spin_unlock(&file_priv->table_lock);
                        goto fail;
                }
                drm_gem_object_reference(bo);
                exec->bo[i] = (struct drm_gem_cma_object *)bo;
        }
        spin_unlock(&file_priv->table_lock);

fail:
        kvfree(handles);
        return ret;
}

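/* Copies the user's binner command list, shader records, and uniforms
 * into a temporary buffer, validates them, and writes the validated
 * contents into a new BO that the binner will execute from.  Both the
 * temporary buffer and the exec BO are laid out as bin CL, then shader
 * records, then uniforms; the shader state array lives only in the
 * temporary allocation.
 */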
static int
vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
{
        struct drm_vc4_submit_cl *args = exec->args;
        void *temp = NULL;
        void *bin;
        int ret = 0;
        uint32_t bin_offset = 0;
        uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size,
                                             16);
        uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
        uint32_t exec_size = uniforms_offset + args->uniforms_size;
        uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
                                          args->shader_rec_count);
        struct vc4_bo *bo;

        if (shader_rec_offset < args->bin_cl_size ||
            uniforms_offset < shader_rec_offset ||
            exec_size < uniforms_offset ||
            args->shader_rec_count >= (UINT_MAX /
                                          sizeof(struct vc4_shader_state)) ||
            temp_size < exec_size) {
                DRM_ERROR("overflow in exec arguments\n");
                ret = -EINVAL;
                goto fail;
        }

        /* Allocate space where we'll store the copied in user command lists
         * and shader records.
         *
         * We don't just copy directly into the BOs because we need to
         * read the contents back for validation, and I think the
         * bo->vaddr is uncached access.
         */
        temp = kvmalloc_array(temp_size, 1, GFP_KERNEL);
        if (!temp) {
                DRM_ERROR("Failed to allocate storage for copying "
                          "in bin/render CLs.\n");
                ret = -ENOMEM;
                goto fail;
        }
        bin = temp + bin_offset;
        exec->shader_rec_u = temp + shader_rec_offset;
        exec->uniforms_u = temp + uniforms_offset;
        exec->shader_state = temp + exec_size;
        exec->shader_state_size = args->shader_rec_count;

        if (copy_from_user(bin,
                           (void __user *)(uintptr_t)args->bin_cl,
                           args->bin_cl_size)) {
                ret = -EFAULT;
                goto fail;
        }

        if (copy_from_user(exec->shader_rec_u,
                           (void __user *)(uintptr_t)args->shader_rec,
                           args->shader_rec_size)) {
                ret = -EFAULT;
                goto fail;
        }

        if (copy_from_user(exec->uniforms_u,
                           (void __user *)(uintptr_t)args->uniforms,
                           args->uniforms_size)) {
                ret = -EFAULT;
                goto fail;
        }

        bo = vc4_bo_create(dev, exec_size, true);
        if (IS_ERR(bo)) {
                DRM_ERROR("Couldn't allocate BO for binning\n");
                ret = PTR_ERR(bo);
                goto fail;
        }
        exec->exec_bo = &bo->base;

        list_add_tail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
                      &exec->unref_list);

        exec->ct0ca = exec->exec_bo->paddr + bin_offset;

        exec->bin_u = bin;

        exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
        exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
        exec->shader_rec_size = args->shader_rec_size;

        exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset;
        exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
        exec->uniforms_size = args->uniforms_size;

        ret = vc4_validate_bin_cl(dev,
                                  exec->exec_bo->vaddr + bin_offset,
                                  bin,
                                  exec);
        if (ret)
                goto fail;

        ret = vc4_validate_shader_recs(dev, exec);
        if (ret)
                goto fail;

        /* Block waiting on any previous rendering into the CS's VBO,
         * IB, or textures, so that pixels are actually written by the
         * time we try to read them.
         */
        ret = vc4_wait_for_seqno(dev, exec->bin_dep_seqno, ~0ull, true);

fail:
        kvfree(temp);
        return ret;
}

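/* Final cleanup for a job: signals its fence if the IRQ handler didn't,
 * drops the references on all of its BOs, releases its bin memory slots,
 * and drops the power reference taken at submit time.
 */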
static void
vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        unsigned long irqflags;
        unsigned i;

        /* If we got force-completed because of GPU reset rather than
         * through our IRQ handler, signal the fence now.
         */
        if (exec->fence)
                dma_fence_signal(exec->fence);

        if (exec->bo) {
                for (i = 0; i < exec->bo_count; i++)
                        drm_gem_object_unreference_unlocked(&exec->bo[i]->base);
                kvfree(exec->bo);
        }

        while (!list_empty(&exec->unref_list)) {
                struct vc4_bo *bo = list_first_entry(&exec->unref_list,
                                                     struct vc4_bo, unref_head);
                list_del(&bo->unref_head);
                drm_gem_object_unreference_unlocked(&bo->base.base);
        }

        /* Free up the allocation of any bin slots we used. */
        spin_lock_irqsave(&vc4->job_lock, irqflags);
        vc4->bin_alloc_used &= ~exec->bin_slots;
        spin_unlock_irqrestore(&vc4->job_lock, irqflags);

        mutex_lock(&vc4->power_lock);
        if (--vc4->power_refcount == 0) {
                pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
                pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
        }
        mutex_unlock(&vc4->power_lock);

        kfree(exec);
}

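/* Retires every job on job_done_list, then runs any seqno callbacks whose
 * seqno has now been reached.  The job_lock is dropped around each
 * vc4_complete_exec() call, since freeing a job can sleep.
 */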
void
vc4_job_handle_completed(struct vc4_dev *vc4)
{
        unsigned long irqflags;
        struct vc4_seqno_cb *cb, *cb_temp;

        spin_lock_irqsave(&vc4->job_lock, irqflags);
        while (!list_empty(&vc4->job_done_list)) {
                struct vc4_exec_info *exec =
                        list_first_entry(&vc4->job_done_list,
                                         struct vc4_exec_info, head);
                list_del(&exec->head);

                spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                vc4_complete_exec(vc4->dev, exec);
                spin_lock_irqsave(&vc4->job_lock, irqflags);
        }

        list_for_each_entry_safe(cb, cb_temp, &vc4->seqno_cb_list, work.entry) {
                if (cb->seqno <= vc4->finished_seqno) {
                        list_del_init(&cb->work.entry);
                        schedule_work(&cb->work);
                }
        }

        spin_unlock_irqrestore(&vc4->job_lock, irqflags);
}

static void vc4_seqno_cb_work(struct work_struct *work)
{
        struct vc4_seqno_cb *cb = container_of(work, struct vc4_seqno_cb, work);

        cb->func(cb);
}

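/* Registers a callback to be run (from a workqueue) once the given seqno
 * has been reached.  If the seqno has already passed, the callback is
 * scheduled immediately.
 */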
int vc4_queue_seqno_cb(struct drm_device *dev,
                       struct vc4_seqno_cb *cb, uint64_t seqno,
                       void (*func)(struct vc4_seqno_cb *cb))
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        int ret = 0;
        unsigned long irqflags;

        cb->func = func;
        INIT_WORK(&cb->work, vc4_seqno_cb_work);

        spin_lock_irqsave(&vc4->job_lock, irqflags);
        if (seqno > vc4->finished_seqno) {
                cb->seqno = seqno;
                list_add_tail(&cb->work.entry, &vc4->seqno_cb_list);
        } else {
                schedule_work(&cb->work);
        }
        spin_unlock_irqrestore(&vc4->job_lock, irqflags);

        return ret;
}

/* Scheduled when any job has been completed, this walks the list of
 * jobs that had completed and unrefs their BOs and frees their exec
 * structs.
 */
static void
vc4_job_done_work(struct work_struct *work)
{
        struct vc4_dev *vc4 =
                container_of(work, struct vc4_dev, job_done_work);

        vc4_job_handle_completed(vc4);
}

static int
vc4_wait_for_seqno_ioctl_helper(struct drm_device *dev,
                                uint64_t seqno,
                                uint64_t *timeout_ns)
{
        unsigned long start = jiffies;
        int ret = vc4_wait_for_seqno(dev, seqno, *timeout_ns, true);

        if ((ret == -EINTR || ret == -ERESTARTSYS) && *timeout_ns != ~0ull) {
                uint64_t delta = jiffies_to_nsecs(jiffies - start);

                if (*timeout_ns >= delta)
                        *timeout_ns -= delta;
        }

        return ret;
}

int
vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
                     struct drm_file *file_priv)
{
        struct drm_vc4_wait_seqno *args = data;

        return vc4_wait_for_seqno_ioctl_helper(dev, args->seqno,
                                               &args->timeout_ns);
}

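/* Waits for any rendering that used the given BO to complete, based on
 * the last seqno stamped on it.  A rough sketch of the userspace side
 * (illustrative only, assuming the usual libdrm drmIoctl() wrapper):
 *
 *      struct drm_vc4_wait_bo wait = {
 *              .handle = bo_handle,
 *              .timeout_ns = ~0ull,
 *      };
 *
 *      drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
 */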
int
vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
                  struct drm_file *file_priv)
{
        int ret;
        struct drm_vc4_wait_bo *args = data;
        struct drm_gem_object *gem_obj;
        struct vc4_bo *bo;

        if (args->pad != 0)
                return -EINVAL;

        gem_obj = drm_gem_object_lookup(file_priv, args->handle);
        if (!gem_obj) {
                DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
                return -EINVAL;
        }
        bo = to_vc4_bo(gem_obj);

        ret = vc4_wait_for_seqno_ioctl_helper(dev, bo->seqno,
                                              &args->timeout_ns);

        drm_gem_object_unreference_unlocked(gem_obj);
        return ret;
}

/**
 * vc4_submit_cl_ioctl() - Submits a job (frame) to the VC4.
 * @dev: DRM device
 * @data: ioctl argument
 * @file_priv: DRM file for this fd
 *
 * This is the main entrypoint for userspace to submit a 3D frame to
 * the GPU.  Userspace provides the binner command list (if
 * applicable), and the kernel sets up the render command list to draw
 * to the framebuffer described in the ioctl, using the command lists
 * that the 3D engine's binner will produce.
 */
int
vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
                    struct drm_file *file_priv)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        struct drm_vc4_submit_cl *args = data;
        struct vc4_exec_info *exec;
        struct ww_acquire_ctx acquire_ctx;
        int ret = 0;

        if ((args->flags & ~VC4_SUBMIT_CL_USE_CLEAR_COLOR) != 0) {
                DRM_ERROR("Unknown flags: 0x%02x\n", args->flags);
                return -EINVAL;
        }

        exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
        if (!exec) {
                DRM_ERROR("malloc failure on exec struct\n");
                return -ENOMEM;
        }

        mutex_lock(&vc4->power_lock);
        if (vc4->power_refcount++ == 0) {
                ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
                if (ret < 0) {
                        /* Undo the refcount bump while still holding
                         * power_lock, so a concurrent submit can't see
                         * a stale nonzero count.
                         */
                        vc4->power_refcount--;
                        mutex_unlock(&vc4->power_lock);
                        kfree(exec);
                        return ret;
                }
        }
        mutex_unlock(&vc4->power_lock);

        exec->args = args;
        INIT_LIST_HEAD(&exec->unref_list);

        ret = vc4_cl_lookup_bos(dev, file_priv, exec);
        if (ret)
                goto fail;

        if (exec->args->bin_cl_size != 0) {
                ret = vc4_get_bcl(dev, exec);
                if (ret)
                        goto fail;
        } else {
                exec->ct0ca = 0;
                exec->ct0ea = 0;
        }

        ret = vc4_get_rcl(dev, exec);
        if (ret)
                goto fail;

        ret = vc4_lock_bo_reservations(dev, exec, &acquire_ctx);
        if (ret)
                goto fail;

        /* Clear this out of the struct we'll be putting in the queue,
         * since it's part of our stack.
         */
        exec->args = NULL;

        ret = vc4_queue_submit(dev, exec, &acquire_ctx);
        if (ret)
                goto fail;

        /* Return the seqno for our job. */
        args->seqno = vc4->emit_seqno;

        return 0;

fail:
        vc4_complete_exec(vc4->dev, exec);

        return ret;
}

void
vc4_gem_init(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);

        vc4->dma_fence_context = dma_fence_context_alloc(1);

        INIT_LIST_HEAD(&vc4->bin_job_list);
        INIT_LIST_HEAD(&vc4->render_job_list);
        INIT_LIST_HEAD(&vc4->job_done_list);
        INIT_LIST_HEAD(&vc4->seqno_cb_list);
        spin_lock_init(&vc4->job_lock);

        INIT_WORK(&vc4->hangcheck.reset_work, vc4_reset_work);
        setup_timer(&vc4->hangcheck.timer,
                    vc4_hangcheck_elapsed,
                    (unsigned long)dev);

        INIT_WORK(&vc4->job_done_work, vc4_job_done_work);

        mutex_init(&vc4->power_lock);
}

void
vc4_gem_destroy(struct drm_device *dev)
{
        struct vc4_dev *vc4 = to_vc4_dev(dev);

        /* Waiting for exec to finish would need to be done before
         * unregistering V3D.
         */
        WARN_ON(vc4->emit_seqno != vc4->finished_seqno);

        /* V3D should already have disabled its interrupt and cleared
         * the overflow allocation registers.  Now free the object.
         */
        if (vc4->bin_bo) {
                drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
                vc4->bin_bo = NULL;
        }

        if (vc4->hang_state)
                vc4_free_hang_state(dev, vc4->hang_state);

        vc4_bo_cache_destroy(dev);
}