linux/drivers/gpu/drm/i915/i915_gem.c
   1/*
   2 * Copyright © 2008-2015 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * Authors:
  24 *    Eric Anholt <eric@anholt.net>
  25 *
  26 */
  27
  28#include <drm/drmP.h>
  29#include <drm/drm_vma_manager.h>
  30#include <drm/i915_drm.h>
  31#include "i915_drv.h"
  32#include "i915_gem_clflush.h"
  33#include "i915_vgpu.h"
  34#include "i915_trace.h"
  35#include "intel_drv.h"
  36#include "intel_frontbuffer.h"
  37#include "intel_mocs.h"
  38#include "intel_workarounds.h"
  39#include "i915_gemfs.h"
  40#include <linux/dma-fence-array.h>
  41#include <linux/kthread.h>
  42#include <linux/reservation.h>
  43#include <linux/shmem_fs.h>
  44#include <linux/slab.h>
  45#include <linux/stop_machine.h>
  46#include <linux/swap.h>
  47#include <linux/pci.h>
  48#include <linux/dma-buf.h>
  49
  50static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  51
  52static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  53{
  54        if (obj->cache_dirty)
  55                return false;
  56
  57        if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  58                return true;
  59
  60        return obj->pin_global; /* currently in use by HW, keep flushed */
  61}
  62
  63static int
  64insert_mappable_node(struct i915_ggtt *ggtt,
  65                     struct drm_mm_node *node, u32 size)
  66{
  67        memset(node, 0, sizeof(*node));
  68        return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  69                                           size, 0, I915_COLOR_UNEVICTABLE,
  70                                           0, ggtt->mappable_end,
  71                                           DRM_MM_INSERT_LOW);
  72}
  73
  74static void
  75remove_mappable_node(struct drm_mm_node *node)
  76{
  77        drm_mm_remove_node(node);
  78}
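/*
 * Illustrative sketch (editor-added, not part of the driver): the GTT
 * pread/pwrite slow paths below pair these helpers as insert -> use ->
 * remove, borrowing a single-page slot in the mappable aperture. The
 * helper name here is hypothetical.
 */
#if 0
static int example_with_mappable_node(struct i915_ggtt *ggtt)
{
        struct drm_mm_node node;
        int err;

        err = insert_mappable_node(ggtt, &node, PAGE_SIZE);
        if (err)
                return err;

        /* ... point ggtt->vm.insert_page() at the page of interest ... */

        remove_mappable_node(&node);
        return 0;
}
#endif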
  79
  80/* some bookkeeping */
  81static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  82                                  u64 size)
  83{
  84        spin_lock(&dev_priv->mm.object_stat_lock);
  85        dev_priv->mm.object_count++;
  86        dev_priv->mm.object_memory += size;
  87        spin_unlock(&dev_priv->mm.object_stat_lock);
  88}
  89
  90static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  91                                     u64 size)
  92{
  93        spin_lock(&dev_priv->mm.object_stat_lock);
  94        dev_priv->mm.object_count--;
  95        dev_priv->mm.object_memory -= size;
  96        spin_unlock(&dev_priv->mm.object_stat_lock);
  97}
  98
  99static int
 100i915_gem_wait_for_error(struct i915_gpu_error *error)
 101{
 102        int ret;
 103
 104        might_sleep();
 105
 106        /*
 107         * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 108         * userspace. If it takes that long something really bad is going on and
 109         * we should simply try to bail out and fail as gracefully as possible.
 110         */
 111        ret = wait_event_interruptible_timeout(error->reset_queue,
 112                                               !i915_reset_backoff(error),
 113                                               I915_RESET_TIMEOUT);
 114        if (ret == 0) {
 115                DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 116                return -EIO;
 117        } else if (ret < 0) {
 118                return ret;
 119        } else {
 120                return 0;
 121        }
 122}
 123
 124int i915_mutex_lock_interruptible(struct drm_device *dev)
 125{
 126        struct drm_i915_private *dev_priv = to_i915(dev);
 127        int ret;
 128
 129        ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 130        if (ret)
 131                return ret;
 132
 133        ret = mutex_lock_interruptible(&dev->struct_mutex);
 134        if (ret)
 135                return ret;
 136
 137        return 0;
 138}
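/*
 * Illustrative sketch (editor-added, not part of the driver): the usual
 * ioctl-side pattern is to take struct_mutex through this helper so that a
 * pending GPU reset or a signal aborts with an error instead of blocking
 * indefinitely. The function name is hypothetical.
 */
#if 0
static int example_locked_section(struct drm_device *dev)
{
        int ret;

        ret = i915_mutex_lock_interruptible(dev);
        if (ret)
                return ret;

        /* ... touch GEM state guarded by dev->struct_mutex ... */

        mutex_unlock(&dev->struct_mutex);
        return 0;
}
#endif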
 139
 140static u32 __i915_gem_park(struct drm_i915_private *i915)
 141{
 142        GEM_TRACE("\n");
 143
 144        lockdep_assert_held(&i915->drm.struct_mutex);
 145        GEM_BUG_ON(i915->gt.active_requests);
 146        GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
 147
 148        if (!i915->gt.awake)
 149                return I915_EPOCH_INVALID;
 150
 151        GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
 152
 153        /*
 154         * Be paranoid and flush a concurrent interrupt to make sure
 155         * we don't reactivate any irq tasklets after parking.
 156         *
 157         * FIXME: Note that even though we have waited for execlists to be idle,
 158         * there may still be an in-flight interrupt even though the CSB
 159         * is now empty. synchronize_irq() makes sure that a residual interrupt
 160         * is completed before we continue, but it doesn't prevent the HW from
 161         * raising a spurious interrupt later. To complete the shield we should
 162         * coordinate disabling the CS irq with flushing the interrupts.
 163         */
 164        synchronize_irq(i915->drm.irq);
 165
 166        intel_engines_park(i915);
 167        i915_timelines_park(i915);
 168
 169        i915_pmu_gt_parked(i915);
 170        i915_vma_parked(i915);
 171
 172        i915->gt.awake = false;
 173
 174        if (INTEL_GEN(i915) >= 6)
 175                gen6_rps_idle(i915);
 176
 177        intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
 178
 179        intel_runtime_pm_put(i915);
 180
 181        return i915->gt.epoch;
 182}
 183
 184void i915_gem_park(struct drm_i915_private *i915)
 185{
 186        GEM_TRACE("\n");
 187
 188        lockdep_assert_held(&i915->drm.struct_mutex);
 189        GEM_BUG_ON(i915->gt.active_requests);
 190
 191        if (!i915->gt.awake)
 192                return;
 193
 194        /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 195        mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 196}
 197
 198void i915_gem_unpark(struct drm_i915_private *i915)
 199{
 200        GEM_TRACE("\n");
 201
 202        lockdep_assert_held(&i915->drm.struct_mutex);
 203        GEM_BUG_ON(!i915->gt.active_requests);
 204
 205        if (i915->gt.awake)
 206                return;
 207
 208        intel_runtime_pm_get_noresume(i915);
 209
 210        /*
 211         * It seems that the DMC likes to transition between the DC states a lot
 212         * when there are no connected displays (no active power domains) during
 213         * command submission.
 214         *
 215         * This activity has negative impact on the performance of the chip with
 216         * huge latencies observed in the interrupt handler and elsewhere.
 217         *
 218         * Work around it by grabbing a GT IRQ power domain whilst there is any
 219         * GT activity, preventing any DC state transitions.
 220         */
 221        intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 222
 223        i915->gt.awake = true;
 224        if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 225                i915->gt.epoch = 1;
 226
 227        intel_enable_gt_powersave(i915);
 228        i915_update_gfx_val(i915);
 229        if (INTEL_GEN(i915) >= 6)
 230                gen6_rps_busy(i915);
 231        i915_pmu_gt_unparked(i915);
 232
 233        intel_engines_unpark(i915);
 234
 235        i915_queue_hangcheck(i915);
 236
 237        queue_delayed_work(i915->wq,
 238                           &i915->gt.retire_work,
 239                           round_jiffies_up_relative(HZ));
 240}
 241
 242int
 243i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 244                            struct drm_file *file)
 245{
 246        struct drm_i915_private *dev_priv = to_i915(dev);
 247        struct i915_ggtt *ggtt = &dev_priv->ggtt;
 248        struct drm_i915_gem_get_aperture *args = data;
 249        struct i915_vma *vma;
 250        u64 pinned;
 251
 252        pinned = ggtt->vm.reserved;
 253        mutex_lock(&dev->struct_mutex);
 254        list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
 255                if (i915_vma_is_pinned(vma))
 256                        pinned += vma->node.size;
 257        list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
 258                if (i915_vma_is_pinned(vma))
 259                        pinned += vma->node.size;
 260        mutex_unlock(&dev->struct_mutex);
 261
 262        args->aper_size = ggtt->vm.total;
 263        args->aper_available_size = args->aper_size - pinned;
 264
 265        return 0;
 266}
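/*
 * Illustrative sketch (editor-added, not part of the driver): how userspace
 * reaches this handler. A real program would be built against <stdio.h>,
 * <sys/ioctl.h> and <drm/i915_drm.h> with an open device fd; the names are
 * hypothetical.
 */
#if 0
static void example_query_aperture(int drm_fd)
{
        struct drm_i915_gem_get_aperture aper = {};

        if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aper) == 0)
                printf("GGTT %llu bytes, ~%llu available\n",
                       (unsigned long long)aper.aper_size,
                       (unsigned long long)aper.aper_available_size);
}
#endif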
 267
 268static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 269{
 270        struct address_space *mapping = obj->base.filp->f_mapping;
 271        drm_dma_handle_t *phys;
 272        struct sg_table *st;
 273        struct scatterlist *sg;
 274        char *vaddr;
 275        int i;
 276        int err;
 277
 278        if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 279                return -EINVAL;
 280
   281/* Always aligning to the object size allows a single allocation
 282         * to handle all possible callers, and given typical object sizes,
 283         * the alignment of the buddy allocation will naturally match.
 284         */
 285        phys = drm_pci_alloc(obj->base.dev,
 286                             roundup_pow_of_two(obj->base.size),
 287                             roundup_pow_of_two(obj->base.size));
 288        if (!phys)
 289                return -ENOMEM;
 290
 291        vaddr = phys->vaddr;
 292        for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 293                struct page *page;
 294                char *src;
 295
 296                page = shmem_read_mapping_page(mapping, i);
 297                if (IS_ERR(page)) {
 298                        err = PTR_ERR(page);
 299                        goto err_phys;
 300                }
 301
 302                src = kmap_atomic(page);
 303                memcpy(vaddr, src, PAGE_SIZE);
 304                drm_clflush_virt_range(vaddr, PAGE_SIZE);
 305                kunmap_atomic(src);
 306
 307                put_page(page);
 308                vaddr += PAGE_SIZE;
 309        }
 310
 311        i915_gem_chipset_flush(to_i915(obj->base.dev));
 312
 313        st = kmalloc(sizeof(*st), GFP_KERNEL);
 314        if (!st) {
 315                err = -ENOMEM;
 316                goto err_phys;
 317        }
 318
 319        if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 320                kfree(st);
 321                err = -ENOMEM;
 322                goto err_phys;
 323        }
 324
 325        sg = st->sgl;
 326        sg->offset = 0;
 327        sg->length = obj->base.size;
 328
 329        sg_dma_address(sg) = phys->busaddr;
 330        sg_dma_len(sg) = obj->base.size;
 331
 332        obj->phys_handle = phys;
 333
 334        __i915_gem_object_set_pages(obj, st, sg->length);
 335
 336        return 0;
 337
 338err_phys:
 339        drm_pci_free(obj->base.dev, phys);
 340
 341        return err;
 342}
 343
 344static void __start_cpu_write(struct drm_i915_gem_object *obj)
 345{
 346        obj->read_domains = I915_GEM_DOMAIN_CPU;
 347        obj->write_domain = I915_GEM_DOMAIN_CPU;
 348        if (cpu_write_needs_clflush(obj))
 349                obj->cache_dirty = true;
 350}
 351
 352static void
 353__i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 354                                struct sg_table *pages,
 355                                bool needs_clflush)
 356{
 357        GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 358
 359        if (obj->mm.madv == I915_MADV_DONTNEED)
 360                obj->mm.dirty = false;
 361
 362        if (needs_clflush &&
 363            (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 364            !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 365                drm_clflush_sg(pages);
 366
 367        __start_cpu_write(obj);
 368}
 369
 370static void
 371i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 372                               struct sg_table *pages)
 373{
 374        __i915_gem_object_release_shmem(obj, pages, false);
 375
 376        if (obj->mm.dirty) {
 377                struct address_space *mapping = obj->base.filp->f_mapping;
 378                char *vaddr = obj->phys_handle->vaddr;
 379                int i;
 380
 381                for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 382                        struct page *page;
 383                        char *dst;
 384
 385                        page = shmem_read_mapping_page(mapping, i);
 386                        if (IS_ERR(page))
 387                                continue;
 388
 389                        dst = kmap_atomic(page);
 390                        drm_clflush_virt_range(vaddr, PAGE_SIZE);
 391                        memcpy(dst, vaddr, PAGE_SIZE);
 392                        kunmap_atomic(dst);
 393
 394                        set_page_dirty(page);
 395                        if (obj->mm.madv == I915_MADV_WILLNEED)
 396                                mark_page_accessed(page);
 397                        put_page(page);
 398                        vaddr += PAGE_SIZE;
 399                }
 400                obj->mm.dirty = false;
 401        }
 402
 403        sg_free_table(pages);
 404        kfree(pages);
 405
 406        drm_pci_free(obj->base.dev, obj->phys_handle);
 407}
 408
 409static void
 410i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 411{
 412        i915_gem_object_unpin_pages(obj);
 413}
 414
 415static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 416        .get_pages = i915_gem_object_get_pages_phys,
 417        .put_pages = i915_gem_object_put_pages_phys,
 418        .release = i915_gem_object_release_phys,
 419};
 420
 421static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 422
 423int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 424{
 425        struct i915_vma *vma;
 426        LIST_HEAD(still_in_list);
 427        int ret;
 428
 429        lockdep_assert_held(&obj->base.dev->struct_mutex);
 430
 431        /* Closed vma are removed from the obj->vma_list - but they may
 432         * still have an active binding on the object. To remove those we
 433         * must wait for all rendering to complete to the object (as unbinding
 434         * must anyway), and retire the requests.
 435         */
 436        ret = i915_gem_object_set_to_cpu_domain(obj, false);
 437        if (ret)
 438                return ret;
 439
 440        while ((vma = list_first_entry_or_null(&obj->vma_list,
 441                                               struct i915_vma,
 442                                               obj_link))) {
 443                list_move_tail(&vma->obj_link, &still_in_list);
 444                ret = i915_vma_unbind(vma);
 445                if (ret)
 446                        break;
 447        }
 448        list_splice(&still_in_list, &obj->vma_list);
 449
 450        return ret;
 451}
 452
 453static long
 454i915_gem_object_wait_fence(struct dma_fence *fence,
 455                           unsigned int flags,
 456                           long timeout,
 457                           struct intel_rps_client *rps_client)
 458{
 459        struct i915_request *rq;
 460
 461        BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 462
 463        if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 464                return timeout;
 465
 466        if (!dma_fence_is_i915(fence))
 467                return dma_fence_wait_timeout(fence,
 468                                              flags & I915_WAIT_INTERRUPTIBLE,
 469                                              timeout);
 470
 471        rq = to_request(fence);
 472        if (i915_request_completed(rq))
 473                goto out;
 474
 475        /*
 476         * This client is about to stall waiting for the GPU. In many cases
 477         * this is undesirable and limits the throughput of the system, as
 478         * many clients cannot continue processing user input/output whilst
 479         * blocked. RPS autotuning may take tens of milliseconds to respond
 480         * to the GPU load and thus incurs additional latency for the client.
 481         * We can circumvent that by promoting the GPU frequency to maximum
 482         * before we wait. This makes the GPU throttle up much more quickly
 483         * (good for benchmarks and user experience, e.g. window animations),
 484         * but at a cost of spending more power processing the workload
 485         * (bad for battery). Not all clients even want their results
 486         * immediately and for them we should just let the GPU select its own
 487         * frequency to maximise efficiency. To prevent a single client from
 488         * forcing the clocks too high for the whole system, we only allow
 489         * each client to waitboost once in a busy period.
 490         */
 491        if (rps_client && !i915_request_started(rq)) {
 492                if (INTEL_GEN(rq->i915) >= 6)
 493                        gen6_rps_boost(rq, rps_client);
 494        }
 495
 496        timeout = i915_request_wait(rq, flags, timeout);
 497
 498out:
 499        if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 500                i915_request_retire_upto(rq);
 501
 502        return timeout;
 503}
 504
 505static long
 506i915_gem_object_wait_reservation(struct reservation_object *resv,
 507                                 unsigned int flags,
 508                                 long timeout,
 509                                 struct intel_rps_client *rps_client)
 510{
 511        unsigned int seq = __read_seqcount_begin(&resv->seq);
 512        struct dma_fence *excl;
 513        bool prune_fences = false;
 514
 515        if (flags & I915_WAIT_ALL) {
 516                struct dma_fence **shared;
 517                unsigned int count, i;
 518                int ret;
 519
 520                ret = reservation_object_get_fences_rcu(resv,
 521                                                        &excl, &count, &shared);
 522                if (ret)
 523                        return ret;
 524
 525                for (i = 0; i < count; i++) {
 526                        timeout = i915_gem_object_wait_fence(shared[i],
 527                                                             flags, timeout,
 528                                                             rps_client);
 529                        if (timeout < 0)
 530                                break;
 531
 532                        dma_fence_put(shared[i]);
 533                }
 534
 535                for (; i < count; i++)
 536                        dma_fence_put(shared[i]);
 537                kfree(shared);
 538
 539                /*
 540                 * If both shared fences and an exclusive fence exist,
 541                 * then by construction the shared fences must be later
 542                 * than the exclusive fence. If we successfully wait for
 543                 * all the shared fences, we know that the exclusive fence
  544                 * must also be signaled. If all the shared fences are
 545                 * signaled, we can prune the array and recover the
 546                 * floating references on the fences/requests.
 547                 */
 548                prune_fences = count && timeout >= 0;
 549        } else {
 550                excl = reservation_object_get_excl_rcu(resv);
 551        }
 552
 553        if (excl && timeout >= 0)
 554                timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 555                                                     rps_client);
 556
 557        dma_fence_put(excl);
 558
 559        /*
 560         * Opportunistically prune the fences iff we know they have *all* been
 561         * signaled and that the reservation object has not been changed (i.e.
 562         * no new fences have been added).
 563         */
 564        if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 565                if (reservation_object_trylock(resv)) {
 566                        if (!__read_seqcount_retry(&resv->seq, seq))
 567                                reservation_object_add_excl_fence(resv, NULL);
 568                        reservation_object_unlock(resv);
 569                }
 570        }
 571
 572        return timeout;
 573}
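/*
 * Illustrative sketch (editor-added, not part of the driver): the
 * opportunistic prune above is the usual seqcount sample/validate pattern -
 * sample the counter, do the lockless work, and only commit if the counter
 * has not moved. A generic retry form of the same pattern looks like this
 * (function name hypothetical):
 */
#if 0
static void example_snapshot(struct reservation_object *resv)
{
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&resv->seq);
                /* ... take a lockless snapshot of the fence pointers ... */
        } while (read_seqcount_retry(&resv->seq, seq));
}
#endif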
 574
 575static void __fence_set_priority(struct dma_fence *fence,
 576                                 const struct i915_sched_attr *attr)
 577{
 578        struct i915_request *rq;
 579        struct intel_engine_cs *engine;
 580
 581        if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 582                return;
 583
 584        rq = to_request(fence);
 585        engine = rq->engine;
 586
 587        local_bh_disable();
 588        rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 589        if (engine->schedule)
 590                engine->schedule(rq, attr);
 591        rcu_read_unlock();
 592        local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 593}
 594
 595static void fence_set_priority(struct dma_fence *fence,
 596                               const struct i915_sched_attr *attr)
 597{
 598        /* Recurse once into a fence-array */
 599        if (dma_fence_is_array(fence)) {
 600                struct dma_fence_array *array = to_dma_fence_array(fence);
 601                int i;
 602
 603                for (i = 0; i < array->num_fences; i++)
 604                        __fence_set_priority(array->fences[i], attr);
 605        } else {
 606                __fence_set_priority(fence, attr);
 607        }
 608}
 609
 610int
 611i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 612                              unsigned int flags,
 613                              const struct i915_sched_attr *attr)
 614{
 615        struct dma_fence *excl;
 616
 617        if (flags & I915_WAIT_ALL) {
 618                struct dma_fence **shared;
 619                unsigned int count, i;
 620                int ret;
 621
 622                ret = reservation_object_get_fences_rcu(obj->resv,
 623                                                        &excl, &count, &shared);
 624                if (ret)
 625                        return ret;
 626
 627                for (i = 0; i < count; i++) {
 628                        fence_set_priority(shared[i], attr);
 629                        dma_fence_put(shared[i]);
 630                }
 631
 632                kfree(shared);
 633        } else {
 634                excl = reservation_object_get_excl_rcu(obj->resv);
 635        }
 636
 637        if (excl) {
 638                fence_set_priority(excl, attr);
 639                dma_fence_put(excl);
 640        }
 641        return 0;
 642}
 643
 644/**
 645 * Waits for rendering to the object to be completed
 646 * @obj: i915 gem object
 647 * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 648 * @timeout: how long to wait
 649 * @rps_client: client (user process) to charge for any waitboosting
 650 */
 651int
 652i915_gem_object_wait(struct drm_i915_gem_object *obj,
 653                     unsigned int flags,
 654                     long timeout,
 655                     struct intel_rps_client *rps_client)
 656{
 657        might_sleep();
 658#if IS_ENABLED(CONFIG_LOCKDEP)
 659        GEM_BUG_ON(debug_locks &&
 660                   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
 661                   !!(flags & I915_WAIT_LOCKED));
 662#endif
 663        GEM_BUG_ON(timeout < 0);
 664
 665        timeout = i915_gem_object_wait_reservation(obj->resv,
 666                                                   flags, timeout,
 667                                                   rps_client);
 668        return timeout < 0 ? timeout : 0;
 669}
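/*
 * Illustrative sketch (editor-added, not part of the driver): a typical
 * locked caller waits for all outstanding rendering before touching the
 * backing pages, much as the prepare_shmem_* helpers below do. The wrapper
 * name is hypothetical.
 */
#if 0
static int example_wait_before_cpu_access(struct drm_i915_gem_object *obj)
{
        return i915_gem_object_wait(obj,
                                    I915_WAIT_INTERRUPTIBLE |
                                    I915_WAIT_LOCKED |
                                    I915_WAIT_ALL,
                                    MAX_SCHEDULE_TIMEOUT,
                                    NULL);
}
#endif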
 670
 671static struct intel_rps_client *to_rps_client(struct drm_file *file)
 672{
 673        struct drm_i915_file_private *fpriv = file->driver_priv;
 674
 675        return &fpriv->rps_client;
 676}
 677
 678static int
 679i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 680                     struct drm_i915_gem_pwrite *args,
 681                     struct drm_file *file)
 682{
 683        void *vaddr = obj->phys_handle->vaddr + args->offset;
 684        char __user *user_data = u64_to_user_ptr(args->data_ptr);
 685
 686        /* We manually control the domain here and pretend that it
 687         * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 688         */
 689        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 690        if (copy_from_user(vaddr, user_data, args->size))
 691                return -EFAULT;
 692
 693        drm_clflush_virt_range(vaddr, args->size);
 694        i915_gem_chipset_flush(to_i915(obj->base.dev));
 695
 696        intel_fb_obj_flush(obj, ORIGIN_CPU);
 697        return 0;
 698}
 699
 700void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 701{
 702        return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 703}
 704
 705void i915_gem_object_free(struct drm_i915_gem_object *obj)
 706{
 707        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 708        kmem_cache_free(dev_priv->objects, obj);
 709}
 710
 711static int
 712i915_gem_create(struct drm_file *file,
 713                struct drm_i915_private *dev_priv,
 714                uint64_t size,
 715                uint32_t *handle_p)
 716{
 717        struct drm_i915_gem_object *obj;
 718        int ret;
 719        u32 handle;
 720
 721        size = roundup(size, PAGE_SIZE);
 722        if (size == 0)
 723                return -EINVAL;
 724
 725        /* Allocate the new object */
 726        obj = i915_gem_object_create(dev_priv, size);
 727        if (IS_ERR(obj))
 728                return PTR_ERR(obj);
 729
 730        ret = drm_gem_handle_create(file, &obj->base, &handle);
 731        /* drop reference from allocate - handle holds it now */
 732        i915_gem_object_put(obj);
 733        if (ret)
 734                return ret;
 735
 736        *handle_p = handle;
 737        return 0;
 738}
 739
 740int
 741i915_gem_dumb_create(struct drm_file *file,
 742                     struct drm_device *dev,
 743                     struct drm_mode_create_dumb *args)
 744{
 745        /* have to work out size/pitch and return them */
 746        args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 747        args->size = args->pitch * args->height;
 748        return i915_gem_create(file, to_i915(dev),
 749                               args->size, &args->handle);
 750}
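/*
 * Worked example (editor-added, illustrative): for a 1920x1080 buffer at
 * bpp = 32, DIV_ROUND_UP(32, 8) = 4 bytes per pixel, so pitch =
 * ALIGN(1920 * 4, 64) = 7680 bytes and size = 7680 * 1080 = 8294400 bytes;
 * i915_gem_create() then rounds the size up to whole pages (a no-op here,
 * since 8294400 is already a multiple of 4096).
 */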
 751
 752static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 753{
 754        return !(obj->cache_level == I915_CACHE_NONE ||
 755                 obj->cache_level == I915_CACHE_WT);
 756}
 757
 758/**
 759 * Creates a new mm object and returns a handle to it.
 760 * @dev: drm device pointer
 761 * @data: ioctl data blob
 762 * @file: drm file pointer
 763 */
 764int
 765i915_gem_create_ioctl(struct drm_device *dev, void *data,
 766                      struct drm_file *file)
 767{
 768        struct drm_i915_private *dev_priv = to_i915(dev);
 769        struct drm_i915_gem_create *args = data;
 770
 771        i915_gem_flush_free_objects(dev_priv);
 772
 773        return i915_gem_create(file, dev_priv,
 774                               args->size, &args->handle);
 775}
 776
 777static inline enum fb_op_origin
 778fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 779{
 780        return (domain == I915_GEM_DOMAIN_GTT ?
 781                obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 782}
 783
 784void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 785{
 786        /*
 787         * No actual flushing is required for the GTT write domain for reads
 788         * from the GTT domain. Writes to it "immediately" go to main memory
 789         * as far as we know, so there's no chipset flush. It also doesn't
 790         * land in the GPU render cache.
 791         *
 792         * However, we do have to enforce the order so that all writes through
 793         * the GTT land before any writes to the device, such as updates to
 794         * the GATT itself.
 795         *
 796         * We also have to wait a bit for the writes to land from the GTT.
 797         * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 798         * timing. This issue has only been observed when switching quickly
 799         * between GTT writes and CPU reads from inside the kernel on recent hw,
 800         * and it appears to only affect discrete GTT blocks (i.e. on LLC
 801         * system agents we cannot reproduce this behaviour, until Cannonlake
 802         * that was!).
 803         */
 804
 805        wmb();
 806
 807        if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
 808                return;
 809
 810        i915_gem_chipset_flush(dev_priv);
 811
 812        intel_runtime_pm_get(dev_priv);
 813        spin_lock_irq(&dev_priv->uncore.lock);
 814
 815        POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 816
 817        spin_unlock_irq(&dev_priv->uncore.lock);
 818        intel_runtime_pm_put(dev_priv);
 819}
 820
 821static void
 822flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 823{
 824        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 825        struct i915_vma *vma;
 826
 827        if (!(obj->write_domain & flush_domains))
 828                return;
 829
 830        switch (obj->write_domain) {
 831        case I915_GEM_DOMAIN_GTT:
 832                i915_gem_flush_ggtt_writes(dev_priv);
 833
 834                intel_fb_obj_flush(obj,
 835                                   fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 836
 837                for_each_ggtt_vma(vma, obj) {
 838                        if (vma->iomap)
 839                                continue;
 840
 841                        i915_vma_unset_ggtt_write(vma);
 842                }
 843                break;
 844
 845        case I915_GEM_DOMAIN_WC:
 846                wmb();
 847                break;
 848
 849        case I915_GEM_DOMAIN_CPU:
 850                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 851                break;
 852
 853        case I915_GEM_DOMAIN_RENDER:
 854                if (gpu_write_needs_clflush(obj))
 855                        obj->cache_dirty = true;
 856                break;
 857        }
 858
 859        obj->write_domain = 0;
 860}
 861
 862static inline int
 863__copy_to_user_swizzled(char __user *cpu_vaddr,
 864                        const char *gpu_vaddr, int gpu_offset,
 865                        int length)
 866{
 867        int ret, cpu_offset = 0;
 868
 869        while (length > 0) {
 870                int cacheline_end = ALIGN(gpu_offset + 1, 64);
 871                int this_length = min(cacheline_end - gpu_offset, length);
 872                int swizzled_gpu_offset = gpu_offset ^ 64;
 873
 874                ret = __copy_to_user(cpu_vaddr + cpu_offset,
 875                                     gpu_vaddr + swizzled_gpu_offset,
 876                                     this_length);
 877                if (ret)
 878                        return ret + length;
 879
 880                cpu_offset += this_length;
 881                gpu_offset += this_length;
 882                length -= this_length;
 883        }
 884
 885        return 0;
 886}
 887
 888static inline int
 889__copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
 890                          const char __user *cpu_vaddr,
 891                          int length)
 892{
 893        int ret, cpu_offset = 0;
 894
 895        while (length > 0) {
 896                int cacheline_end = ALIGN(gpu_offset + 1, 64);
 897                int this_length = min(cacheline_end - gpu_offset, length);
 898                int swizzled_gpu_offset = gpu_offset ^ 64;
 899
 900                ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
 901                                       cpu_vaddr + cpu_offset,
 902                                       this_length);
 903                if (ret)
 904                        return ret + length;
 905
 906                cpu_offset += this_length;
 907                gpu_offset += this_length;
 908                length -= this_length;
 909        }
 910
 911        return 0;
 912}
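/*
 * Worked example (editor-added, illustrative): the "gpu_offset ^ 64" swizzle
 * swaps the two 64-byte halves of every 128-byte block, so data addressed at
 * offsets 0..63 is actually copied to/from 64..127 and vice versa, 128..191
 * maps to 192..255, and so on. The ALIGN(gpu_offset + 1, 64) bound keeps each
 * chunk within a single cacheline so the XOR is recomputed per line.
 */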
 913
 914/*
 915 * Pins the specified object's pages and synchronizes the object with
 916 * GPU accesses. Sets needs_clflush to non-zero if the caller should
 917 * flush the object from the CPU cache.
 918 */
 919int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 920                                    unsigned int *needs_clflush)
 921{
 922        int ret;
 923
 924        lockdep_assert_held(&obj->base.dev->struct_mutex);
 925
 926        *needs_clflush = 0;
 927        if (!i915_gem_object_has_struct_page(obj))
 928                return -ENODEV;
 929
 930        ret = i915_gem_object_wait(obj,
 931                                   I915_WAIT_INTERRUPTIBLE |
 932                                   I915_WAIT_LOCKED,
 933                                   MAX_SCHEDULE_TIMEOUT,
 934                                   NULL);
 935        if (ret)
 936                return ret;
 937
 938        ret = i915_gem_object_pin_pages(obj);
 939        if (ret)
 940                return ret;
 941
 942        if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 943            !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 944                ret = i915_gem_object_set_to_cpu_domain(obj, false);
 945                if (ret)
 946                        goto err_unpin;
 947                else
 948                        goto out;
 949        }
 950
 951        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 952
 953        /* If we're not in the cpu read domain, set ourself into the gtt
 954         * read domain and manually flush cachelines (if required). This
 955         * optimizes for the case when the gpu will dirty the data
 956         * anyway again before the next pread happens.
 957         */
 958        if (!obj->cache_dirty &&
 959            !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 960                *needs_clflush = CLFLUSH_BEFORE;
 961
 962out:
 963        /* return with the pages pinned */
 964        return 0;
 965
 966err_unpin:
 967        i915_gem_object_unpin_pages(obj);
 968        return ret;
 969}
 970
 971int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 972                                     unsigned int *needs_clflush)
 973{
 974        int ret;
 975
 976        lockdep_assert_held(&obj->base.dev->struct_mutex);
 977
 978        *needs_clflush = 0;
 979        if (!i915_gem_object_has_struct_page(obj))
 980                return -ENODEV;
 981
 982        ret = i915_gem_object_wait(obj,
 983                                   I915_WAIT_INTERRUPTIBLE |
 984                                   I915_WAIT_LOCKED |
 985                                   I915_WAIT_ALL,
 986                                   MAX_SCHEDULE_TIMEOUT,
 987                                   NULL);
 988        if (ret)
 989                return ret;
 990
 991        ret = i915_gem_object_pin_pages(obj);
 992        if (ret)
 993                return ret;
 994
 995        if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 996            !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 997                ret = i915_gem_object_set_to_cpu_domain(obj, true);
 998                if (ret)
 999                        goto err_unpin;
1000                else
1001                        goto out;
1002        }
1003
1004        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
1005
1006        /* If we're not in the cpu write domain, set ourself into the
1007         * gtt write domain and manually flush cachelines (as required).
1008         * This optimizes for the case when the gpu will use the data
1009         * right away and we therefore have to clflush anyway.
1010         */
1011        if (!obj->cache_dirty) {
1012                *needs_clflush |= CLFLUSH_AFTER;
1013
1014                /*
1015                 * Same trick applies to invalidate partially written
1016                 * cachelines read before writing.
1017                 */
1018                if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
1019                        *needs_clflush |= CLFLUSH_BEFORE;
1020        }
1021
1022out:
1023        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1024        obj->mm.dirty = true;
1025        /* return with the pages pinned */
1026        return 0;
1027
1028err_unpin:
1029        i915_gem_object_unpin_pages(obj);
1030        return ret;
1031}
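/*
 * Illustrative sketch (editor-added, not part of the driver): the shmem
 * pread/pwrite paths below bracket their page loops with these helpers -
 * prepare (pins the pages and reports the clflush requirements), access the
 * pages, then finish. struct_mutex must be held around prepare; the wrapper
 * name is hypothetical.
 */
#if 0
static int example_cpu_write(struct drm_i915_gem_object *obj)
{
        unsigned int needs_clflush;
        int ret;

        ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
        if (ret)
                return ret;

        /* ... write each page, honouring CLFLUSH_BEFORE / CLFLUSH_AFTER ... */

        i915_gem_obj_finish_shmem_access(obj);
        return 0;
}
#endif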
1032
1033static void
1034shmem_clflush_swizzled_range(char *addr, unsigned long length,
1035                             bool swizzled)
1036{
1037        if (unlikely(swizzled)) {
1038                unsigned long start = (unsigned long) addr;
1039                unsigned long end = (unsigned long) addr + length;
1040
1041                /* For swizzling simply ensure that we always flush both
1042                 * channels. Lame, but simple and it works. Swizzled
1043                 * pwrite/pread is far from a hotpath - current userspace
1044                 * doesn't use it at all. */
1045                start = round_down(start, 128);
1046                end = round_up(end, 128);
1047
1048                drm_clflush_virt_range((void *)start, end - start);
1049        } else {
1050                drm_clflush_virt_range(addr, length);
1051        }
1052
1053}
1054
 1055/* The only difference from the fast-path function is that this can handle
 1056 * bit17 swizzling and uses non-atomic copy and kmap functions. */
1057static int
1058shmem_pread_slow(struct page *page, int offset, int length,
1059                 char __user *user_data,
1060                 bool page_do_bit17_swizzling, bool needs_clflush)
1061{
1062        char *vaddr;
1063        int ret;
1064
1065        vaddr = kmap(page);
1066        if (needs_clflush)
1067                shmem_clflush_swizzled_range(vaddr + offset, length,
1068                                             page_do_bit17_swizzling);
1069
1070        if (page_do_bit17_swizzling)
1071                ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
1072        else
1073                ret = __copy_to_user(user_data, vaddr + offset, length);
1074        kunmap(page);
1075
 1076        return ret ? -EFAULT : 0;
1077}
1078
1079static int
1080shmem_pread(struct page *page, int offset, int length, char __user *user_data,
1081            bool page_do_bit17_swizzling, bool needs_clflush)
1082{
1083        int ret;
1084
1085        ret = -ENODEV;
1086        if (!page_do_bit17_swizzling) {
1087                char *vaddr = kmap_atomic(page);
1088
1089                if (needs_clflush)
1090                        drm_clflush_virt_range(vaddr + offset, length);
1091                ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
1092                kunmap_atomic(vaddr);
1093        }
1094        if (ret == 0)
1095                return 0;
1096
1097        return shmem_pread_slow(page, offset, length, user_data,
1098                                page_do_bit17_swizzling, needs_clflush);
1099}
1100
1101static int
1102i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1103                     struct drm_i915_gem_pread *args)
1104{
1105        char __user *user_data;
1106        u64 remain;
1107        unsigned int obj_do_bit17_swizzling;
1108        unsigned int needs_clflush;
1109        unsigned int idx, offset;
1110        int ret;
1111
1112        obj_do_bit17_swizzling = 0;
1113        if (i915_gem_object_needs_bit17_swizzle(obj))
1114                obj_do_bit17_swizzling = BIT(17);
1115
1116        ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1117        if (ret)
1118                return ret;
1119
1120        ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1121        mutex_unlock(&obj->base.dev->struct_mutex);
1122        if (ret)
1123                return ret;
1124
1125        remain = args->size;
1126        user_data = u64_to_user_ptr(args->data_ptr);
1127        offset = offset_in_page(args->offset);
1128        for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1129                struct page *page = i915_gem_object_get_page(obj, idx);
1130                unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1131
1132                ret = shmem_pread(page, offset, length, user_data,
1133                                  page_to_phys(page) & obj_do_bit17_swizzling,
1134                                  needs_clflush);
1135                if (ret)
1136                        break;
1137
1138                remain -= length;
1139                user_data += length;
1140                offset = 0;
1141        }
1142
1143        i915_gem_obj_finish_shmem_access(obj);
1144        return ret;
1145}
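/*
 * Worked example (editor-added, illustrative): for args->offset = 5000 and
 * args->size = 8192 with 4 KiB pages, the loop starts at idx = 1 with an
 * intra-page offset of 904 and copies 3192 bytes, then 4096 bytes from
 * idx = 2, and the final 904 bytes from idx = 3 - three pages in total,
 * with offset reset to 0 after the first iteration.
 */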
1146
1147static inline bool
1148gtt_user_read(struct io_mapping *mapping,
1149              loff_t base, int offset,
1150              char __user *user_data, int length)
1151{
1152        void __iomem *vaddr;
1153        unsigned long unwritten;
1154
1155        /* We can use the cpu mem copy function because this is X86. */
1156        vaddr = io_mapping_map_atomic_wc(mapping, base);
1157        unwritten = __copy_to_user_inatomic(user_data,
1158                                            (void __force *)vaddr + offset,
1159                                            length);
1160        io_mapping_unmap_atomic(vaddr);
1161        if (unwritten) {
1162                vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1163                unwritten = copy_to_user(user_data,
1164                                         (void __force *)vaddr + offset,
1165                                         length);
1166                io_mapping_unmap(vaddr);
1167        }
1168        return unwritten;
1169}
1170
1171static int
1172i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1173                   const struct drm_i915_gem_pread *args)
1174{
1175        struct drm_i915_private *i915 = to_i915(obj->base.dev);
1176        struct i915_ggtt *ggtt = &i915->ggtt;
1177        struct drm_mm_node node;
1178        struct i915_vma *vma;
1179        void __user *user_data;
1180        u64 remain, offset;
1181        int ret;
1182
1183        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1184        if (ret)
1185                return ret;
1186
1187        intel_runtime_pm_get(i915);
1188        vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1189                                       PIN_MAPPABLE |
1190                                       PIN_NONFAULT |
1191                                       PIN_NONBLOCK);
1192        if (!IS_ERR(vma)) {
1193                node.start = i915_ggtt_offset(vma);
1194                node.allocated = false;
1195                ret = i915_vma_put_fence(vma);
1196                if (ret) {
1197                        i915_vma_unpin(vma);
1198                        vma = ERR_PTR(ret);
1199                }
1200        }
1201        if (IS_ERR(vma)) {
1202                ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1203                if (ret)
1204                        goto out_unlock;
1205                GEM_BUG_ON(!node.allocated);
1206        }
1207
1208        ret = i915_gem_object_set_to_gtt_domain(obj, false);
1209        if (ret)
1210                goto out_unpin;
1211
1212        mutex_unlock(&i915->drm.struct_mutex);
1213
1214        user_data = u64_to_user_ptr(args->data_ptr);
1215        remain = args->size;
1216        offset = args->offset;
1217
1218        while (remain > 0) {
1219                /* Operation in this page
1220                 *
1221                 * page_base = page offset within aperture
1222                 * page_offset = offset within page
1223                 * page_length = bytes to copy for this page
1224                 */
1225                u32 page_base = node.start;
1226                unsigned page_offset = offset_in_page(offset);
1227                unsigned page_length = PAGE_SIZE - page_offset;
1228                page_length = remain < page_length ? remain : page_length;
1229                if (node.allocated) {
1230                        wmb();
1231                        ggtt->vm.insert_page(&ggtt->vm,
1232                                             i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1233                                             node.start, I915_CACHE_NONE, 0);
1234                        wmb();
1235                } else {
1236                        page_base += offset & PAGE_MASK;
1237                }
1238
1239                if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1240                                  user_data, page_length)) {
1241                        ret = -EFAULT;
1242                        break;
1243                }
1244
1245                remain -= page_length;
1246                user_data += page_length;
1247                offset += page_length;
1248        }
1249
1250        mutex_lock(&i915->drm.struct_mutex);
1251out_unpin:
1252        if (node.allocated) {
1253                wmb();
1254                ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1255                remove_mappable_node(&node);
1256        } else {
1257                i915_vma_unpin(vma);
1258        }
1259out_unlock:
1260        intel_runtime_pm_put(i915);
1261        mutex_unlock(&i915->drm.struct_mutex);
1262
1263        return ret;
1264}
1265
1266/**
1267 * Reads data from the object referenced by handle.
1268 * @dev: drm device pointer
1269 * @data: ioctl data blob
1270 * @file: drm file pointer
1271 *
1272 * On error, the contents of *data are undefined.
1273 */
1274int
1275i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1276                     struct drm_file *file)
1277{
1278        struct drm_i915_gem_pread *args = data;
1279        struct drm_i915_gem_object *obj;
1280        int ret;
1281
1282        if (args->size == 0)
1283                return 0;
1284
1285        if (!access_ok(VERIFY_WRITE,
1286                       u64_to_user_ptr(args->data_ptr),
1287                       args->size))
1288                return -EFAULT;
1289
1290        obj = i915_gem_object_lookup(file, args->handle);
1291        if (!obj)
1292                return -ENOENT;
1293
1294        /* Bounds check source.  */
1295        if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1296                ret = -EINVAL;
1297                goto out;
1298        }
1299
1300        trace_i915_gem_object_pread(obj, args->offset, args->size);
1301
1302        ret = i915_gem_object_wait(obj,
1303                                   I915_WAIT_INTERRUPTIBLE,
1304                                   MAX_SCHEDULE_TIMEOUT,
1305                                   to_rps_client(file));
1306        if (ret)
1307                goto out;
1308
1309        ret = i915_gem_object_pin_pages(obj);
1310        if (ret)
1311                goto out;
1312
1313        ret = i915_gem_shmem_pread(obj, args);
1314        if (ret == -EFAULT || ret == -ENODEV)
1315                ret = i915_gem_gtt_pread(obj, args);
1316
1317        i915_gem_object_unpin_pages(obj);
1318out:
1319        i915_gem_object_put(obj);
1320        return ret;
1321}
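/*
 * Illustrative sketch (editor-added, not part of the driver): the userspace
 * side of this ioctl. A real program would be built against <sys/ioctl.h>,
 * <stdint.h> and <drm/i915_drm.h> with an open device fd and a valid GEM
 * handle; the names are hypothetical.
 */
#if 0
static int example_read_bo(int drm_fd, __u32 bo_handle)
{
        char buf[4096];
        struct drm_i915_gem_pread pread = {
                .handle = bo_handle,
                .offset = 0,
                .size = sizeof(buf),
                .data_ptr = (__u64)(uintptr_t)buf,
        };

        return ioctl(drm_fd, DRM_IOCTL_I915_GEM_PREAD, &pread);
}
#endif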
1322
1323/* This is the fast write path which cannot handle
1324 * page faults in the source data
1325 */
1326
1327static inline bool
1328ggtt_write(struct io_mapping *mapping,
1329           loff_t base, int offset,
1330           char __user *user_data, int length)
1331{
1332        void __iomem *vaddr;
1333        unsigned long unwritten;
1334
1335        /* We can use the cpu mem copy function because this is X86. */
1336        vaddr = io_mapping_map_atomic_wc(mapping, base);
1337        unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1338                                                      user_data, length);
1339        io_mapping_unmap_atomic(vaddr);
1340        if (unwritten) {
1341                vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1342                unwritten = copy_from_user((void __force *)vaddr + offset,
1343                                           user_data, length);
1344                io_mapping_unmap(vaddr);
1345        }
1346
1347        return unwritten;
1348}
1349
1350/**
1351 * This is the fast pwrite path, where we copy the data directly from the
1352 * user into the GTT, uncached.
1353 * @obj: i915 GEM object
1354 * @args: pwrite arguments structure
1355 */
1356static int
1357i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1358                         const struct drm_i915_gem_pwrite *args)
1359{
1360        struct drm_i915_private *i915 = to_i915(obj->base.dev);
1361        struct i915_ggtt *ggtt = &i915->ggtt;
1362        struct drm_mm_node node;
1363        struct i915_vma *vma;
1364        u64 remain, offset;
1365        void __user *user_data;
1366        int ret;
1367
1368        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1369        if (ret)
1370                return ret;
1371
1372        if (i915_gem_object_has_struct_page(obj)) {
1373                /*
1374                 * Avoid waking the device up if we can fallback, as
1375                 * waking/resuming is very slow (worst-case 10-100 ms
1376                 * depending on PCI sleeps and our own resume time).
1377                 * This easily dwarfs any performance advantage from
1378                 * using the cache bypass of indirect GGTT access.
1379                 */
1380                if (!intel_runtime_pm_get_if_in_use(i915)) {
1381                        ret = -EFAULT;
1382                        goto out_unlock;
1383                }
1384        } else {
1385                /* No backing pages, no fallback, we must force GGTT access */
1386                intel_runtime_pm_get(i915);
1387        }
1388
1389        vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1390                                       PIN_MAPPABLE |
1391                                       PIN_NONFAULT |
1392                                       PIN_NONBLOCK);
1393        if (!IS_ERR(vma)) {
1394                node.start = i915_ggtt_offset(vma);
1395                node.allocated = false;
1396                ret = i915_vma_put_fence(vma);
1397                if (ret) {
1398                        i915_vma_unpin(vma);
1399                        vma = ERR_PTR(ret);
1400                }
1401        }
1402        if (IS_ERR(vma)) {
1403                ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1404                if (ret)
1405                        goto out_rpm;
1406                GEM_BUG_ON(!node.allocated);
1407        }
1408
1409        ret = i915_gem_object_set_to_gtt_domain(obj, true);
1410        if (ret)
1411                goto out_unpin;
1412
1413        mutex_unlock(&i915->drm.struct_mutex);
1414
1415        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1416
1417        user_data = u64_to_user_ptr(args->data_ptr);
1418        offset = args->offset;
1419        remain = args->size;
1420        while (remain) {
1421                /* Operation in this page
1422                 *
1423                 * page_base = page offset within aperture
1424                 * page_offset = offset within page
1425                 * page_length = bytes to copy for this page
1426                 */
1427                u32 page_base = node.start;
1428                unsigned int page_offset = offset_in_page(offset);
1429                unsigned int page_length = PAGE_SIZE - page_offset;
1430                page_length = remain < page_length ? remain : page_length;
1431                if (node.allocated) {
1432                        wmb(); /* flush the write before we modify the GGTT */
1433                        ggtt->vm.insert_page(&ggtt->vm,
1434                                             i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1435                                             node.start, I915_CACHE_NONE, 0);
1436                        wmb(); /* flush modifications to the GGTT (insert_page) */
1437                } else {
1438                        page_base += offset & PAGE_MASK;
1439                }
1440                /* If we get a fault while copying data, then (presumably) our
1441                 * source page isn't available.  Return the error and we'll
1442                 * retry in the slow path.
1443                 * If the object is non-shmem backed, we retry again with the
1444                 * path that handles page fault.
1445                 */
1446                if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1447                               user_data, page_length)) {
1448                        ret = -EFAULT;
1449                        break;
1450                }
1451
1452                remain -= page_length;
1453                user_data += page_length;
1454                offset += page_length;
1455        }
1456        intel_fb_obj_flush(obj, ORIGIN_CPU);
1457
1458        mutex_lock(&i915->drm.struct_mutex);
1459out_unpin:
1460        if (node.allocated) {
1461                wmb();
1462                ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1463                remove_mappable_node(&node);
1464        } else {
1465                i915_vma_unpin(vma);
1466        }
1467out_rpm:
1468        intel_runtime_pm_put(i915);
1469out_unlock:
1470        mutex_unlock(&i915->drm.struct_mutex);
1471        return ret;
1472}
1473
1474static int
1475shmem_pwrite_slow(struct page *page, int offset, int length,
1476                  char __user *user_data,
1477                  bool page_do_bit17_swizzling,
1478                  bool needs_clflush_before,
1479                  bool needs_clflush_after)
1480{
1481        char *vaddr;
1482        int ret;
1483
1484        vaddr = kmap(page);
1485        if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
1486                shmem_clflush_swizzled_range(vaddr + offset, length,
1487                                             page_do_bit17_swizzling);
1488        if (page_do_bit17_swizzling)
1489                ret = __copy_from_user_swizzled(vaddr, offset, user_data,
1490                                                length);
1491        else
1492                ret = __copy_from_user(vaddr + offset, user_data, length);
1493        if (needs_clflush_after)
1494                shmem_clflush_swizzled_range(vaddr + offset, length,
1495                                             page_do_bit17_swizzling);
1496        kunmap(page);
1497
1498        return ret ? -EFAULT : 0;
1499}
1500
1501/* Per-page copy function for the shmem pwrite fastpath.
1502 * Flushes invalid cachelines before writing to the target if
1503 * needs_clflush_before is set and flushes out any written cachelines after
 1504 * writing if needs_clflush_after is set.
1505 */
1506static int
1507shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1508             bool page_do_bit17_swizzling,
1509             bool needs_clflush_before,
1510             bool needs_clflush_after)
1511{
1512        int ret;
1513
1514        ret = -ENODEV;
1515        if (!page_do_bit17_swizzling) {
1516                char *vaddr = kmap_atomic(page);
1517
1518                if (needs_clflush_before)
1519                        drm_clflush_virt_range(vaddr + offset, len);
1520                ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
1521                if (needs_clflush_after)
1522                        drm_clflush_virt_range(vaddr + offset, len);
1523
1524                kunmap_atomic(vaddr);
1525        }
1526        if (ret == 0)
1527                return ret;
1528
1529        return shmem_pwrite_slow(page, offset, len, user_data,
1530                                 page_do_bit17_swizzling,
1531                                 needs_clflush_before,
1532                                 needs_clflush_after);
1533}
1534
1535static int
1536i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1537                      const struct drm_i915_gem_pwrite *args)
1538{
1539        struct drm_i915_private *i915 = to_i915(obj->base.dev);
1540        void __user *user_data;
1541        u64 remain;
1542        unsigned int obj_do_bit17_swizzling;
1543        unsigned int partial_cacheline_write;
1544        unsigned int needs_clflush;
1545        unsigned int offset, idx;
1546        int ret;
1547
1548        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1549        if (ret)
1550                return ret;
1551
1552        ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1553        mutex_unlock(&i915->drm.struct_mutex);
1554        if (ret)
1555                return ret;
1556
1557        obj_do_bit17_swizzling = 0;
1558        if (i915_gem_object_needs_bit17_swizzle(obj))
1559                obj_do_bit17_swizzling = BIT(17);
1560
1561        /* If we don't overwrite a cacheline completely we need to be
1562         * careful to have up-to-date data by first clflushing. Don't
1563         * overcomplicate things and flush the entire write.
1564         */
1565        partial_cacheline_write = 0;
1566        if (needs_clflush & CLFLUSH_BEFORE)
1567                partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
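        /*
         * Worked example (assuming a 64 byte cacheline): a chunk at page
         * offset 0x20 of length 0x40 gives (0x20 | 0x40) & 63 != 0, so it
         * does not cover whole cachelines and is clflushed before writing;
         * a chunk at offset 0 of length 0x80 gives 0 and needs no flush.
         */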
1568
1569        user_data = u64_to_user_ptr(args->data_ptr);
1570        remain = args->size;
1571        offset = offset_in_page(args->offset);
1572        for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1573                struct page *page = i915_gem_object_get_page(obj, idx);
1574                unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1575
1576                ret = shmem_pwrite(page, offset, length, user_data,
1577                                   page_to_phys(page) & obj_do_bit17_swizzling,
1578                                   (offset | length) & partial_cacheline_write,
1579                                   needs_clflush & CLFLUSH_AFTER);
1580                if (ret)
1581                        break;
1582
1583                remain -= length;
1584                user_data += length;
1585                offset = 0;
1586        }
1587
1588        intel_fb_obj_flush(obj, ORIGIN_CPU);
1589        i915_gem_obj_finish_shmem_access(obj);
1590        return ret;
1591}
1592
1593/**
1594 * Writes data to the object referenced by handle.
1595 * @dev: drm device
1596 * @data: ioctl data blob
1597 * @file: drm file
1598 *
1599 * On error, the contents of the buffer that were to be modified are undefined.
1600 */
1601int
1602i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1603                      struct drm_file *file)
1604{
1605        struct drm_i915_gem_pwrite *args = data;
1606        struct drm_i915_gem_object *obj;
1607        int ret;
1608
1609        if (args->size == 0)
1610                return 0;
1611
1612        if (!access_ok(VERIFY_READ,
1613                       u64_to_user_ptr(args->data_ptr),
1614                       args->size))
1615                return -EFAULT;
1616
1617        obj = i915_gem_object_lookup(file, args->handle);
1618        if (!obj)
1619                return -ENOENT;
1620
1621        /* Bounds check destination. */
1622        if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1623                ret = -EINVAL;
1624                goto err;
1625        }
1626
1627        /* Writes not allowed into this read-only object */
1628        if (i915_gem_object_is_readonly(obj)) {
1629                ret = -EINVAL;
1630                goto err;
1631        }
1632
1633        trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1634
1635        ret = -ENODEV;
1636        if (obj->ops->pwrite)
1637                ret = obj->ops->pwrite(obj, args);
1638        if (ret != -ENODEV)
1639                goto err;
1640
1641        ret = i915_gem_object_wait(obj,
1642                                   I915_WAIT_INTERRUPTIBLE |
1643                                   I915_WAIT_ALL,
1644                                   MAX_SCHEDULE_TIMEOUT,
1645                                   to_rps_client(file));
1646        if (ret)
1647                goto err;
1648
1649        ret = i915_gem_object_pin_pages(obj);
1650        if (ret)
1651                goto err;
1652
1653        ret = -EFAULT;
1654        /* We can only do the GTT pwrite on untiled buffers, as otherwise
1655         * it would end up going through the fenced access, and we'll get
1656         * different detiling behavior between reading and writing.
1657         * pread/pwrite currently are reading and writing from the CPU
1658         * perspective, requiring manual detiling by the client.
1659         */
1660        if (!i915_gem_object_has_struct_page(obj) ||
1661            cpu_write_needs_clflush(obj))
1662                /* Note that the gtt paths might fail with non-page-backed user
1663                 * pointers (e.g. gtt mappings when moving data between
1664                 * textures). Fall back to the shmem path in that case.
1665                 */
1666                ret = i915_gem_gtt_pwrite_fast(obj, args);
1667
1668        if (ret == -EFAULT || ret == -ENOSPC) {
1669                if (obj->phys_handle)
1670                        ret = i915_gem_phys_pwrite(obj, args, file);
1671                else
1672                        ret = i915_gem_shmem_pwrite(obj, args);
1673        }
1674
1675        i915_gem_object_unpin_pages(obj);
1676err:
1677        i915_gem_object_put(obj);
1678        return ret;
1679}
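
/*
 * For reference, a minimal userspace invocation of this ioctl through libdrm
 * might look like the sketch below. It is illustrative only: bo_handle, fd,
 * data and data_len are assumptions, and error handling is elided.
 *
 *	struct drm_i915_gem_pwrite pwrite = {
 *		.handle = bo_handle,
 *		.offset = 0,
 *		.size = data_len,
 *		.data_ptr = (uintptr_t)data,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite))
 *		perror("DRM_IOCTL_I915_GEM_PWRITE");
 */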
1680
1681static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1682{
1683        struct drm_i915_private *i915;
1684        struct list_head *list;
1685        struct i915_vma *vma;
1686
1687        GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1688
1689        for_each_ggtt_vma(vma, obj) {
1690                if (i915_vma_is_active(vma))
1691                        continue;
1692
1693                if (!drm_mm_node_allocated(&vma->node))
1694                        continue;
1695
1696                list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
1697        }
1698
1699        i915 = to_i915(obj->base.dev);
1700        spin_lock(&i915->mm.obj_lock);
1701        list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1702        list_move_tail(&obj->mm.link, list);
1703        spin_unlock(&i915->mm.obj_lock);
1704}
1705
1706/**
1707 * Called when user space prepares to use an object with the CPU, either
1708 * through the mmap ioctl's mapping or a GTT mapping.
1709 * @dev: drm device
1710 * @data: ioctl data blob
1711 * @file: drm file
1712 */
1713int
1714i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1715                          struct drm_file *file)
1716{
1717        struct drm_i915_gem_set_domain *args = data;
1718        struct drm_i915_gem_object *obj;
1719        uint32_t read_domains = args->read_domains;
1720        uint32_t write_domain = args->write_domain;
1721        int err;
1722
1723        /* Only handle setting domains to types used by the CPU. */
1724        if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1725                return -EINVAL;
1726
1727        /* Having something in the write domain implies it's in the read
1728         * domain, and only that read domain.  Enforce that in the request.
1729         */
1730        if (write_domain != 0 && read_domains != write_domain)
1731                return -EINVAL;
1732
1733        obj = i915_gem_object_lookup(file, args->handle);
1734        if (!obj)
1735                return -ENOENT;
1736
1737        /* Try to flush the object off the GPU without holding the lock.
1738         * We will repeat the flush holding the lock in the normal manner
1739         * to catch cases where we are gazumped.
1740         */
1741        err = i915_gem_object_wait(obj,
1742                                   I915_WAIT_INTERRUPTIBLE |
1743                                   (write_domain ? I915_WAIT_ALL : 0),
1744                                   MAX_SCHEDULE_TIMEOUT,
1745                                   to_rps_client(file));
1746        if (err)
1747                goto out;
1748
1749        /*
1750         * Proxy objects do not control access to the backing storage, ergo
1751         * they cannot be used as a means to manipulate the cache domain
1752         * tracking for that backing storage. The proxy object is always
1753         * considered to be outside of any cache domain.
1754         */
1755        if (i915_gem_object_is_proxy(obj)) {
1756                err = -ENXIO;
1757                goto out;
1758        }
1759
1760        /*
1761         * Flush and acquire obj->pages so that we are coherent through
1762         * direct access in memory with previous cached writes through
1763         * shmemfs and that our cache domain tracking remains valid.
1764         * For example, if the obj->filp was moved to swap without us
1765         * being notified and releasing the pages, we would mistakenly
1766         * continue to assume that the obj remained out of the CPU cached
1767         * domain.
1768         */
1769        err = i915_gem_object_pin_pages(obj);
1770        if (err)
1771                goto out;
1772
1773        err = i915_mutex_lock_interruptible(dev);
1774        if (err)
1775                goto out_unpin;
1776
1777        if (read_domains & I915_GEM_DOMAIN_WC)
1778                err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1779        else if (read_domains & I915_GEM_DOMAIN_GTT)
1780                err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1781        else
1782                err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1783
1784        /* And bump the LRU for this access */
1785        i915_gem_object_bump_inactive_ggtt(obj);
1786
1787        mutex_unlock(&dev->struct_mutex);
1788
1789        if (write_domain != 0)
1790                intel_fb_obj_invalidate(obj,
1791                                        fb_write_origin(obj, write_domain));
1792
1793out_unpin:
1794        i915_gem_object_unpin_pages(obj);
1795out:
1796        i915_gem_object_put(obj);
1797        return err;
1798}
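
/*
 * A non-authoritative sketch of the matching userspace call, moving an
 * object into the CPU domain for a CPU write before touching a CPU mmap of
 * it (bo_handle and fd are assumed to exist, errors are ignored):
 *
 *	struct drm_i915_gem_set_domain sd = {
 *		.handle = bo_handle,
 *		.read_domains = I915_GEM_DOMAIN_CPU,
 *		.write_domain = I915_GEM_DOMAIN_CPU,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
 */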
1799
1800/**
1801 * Called when user space has done writes to this buffer
1802 * @dev: drm device
1803 * @data: ioctl data blob
1804 * @file: drm file
1805 */
1806int
1807i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1808                         struct drm_file *file)
1809{
1810        struct drm_i915_gem_sw_finish *args = data;
1811        struct drm_i915_gem_object *obj;
1812
1813        obj = i915_gem_object_lookup(file, args->handle);
1814        if (!obj)
1815                return -ENOENT;
1816
1817        /*
1818         * Proxy objects are barred from CPU access, so there is no
1819         * need to ban sw_finish as it is a nop.
1820         */
1821
1822        /* Pinned buffers may be scanout, so flush the cache */
1823        i915_gem_object_flush_if_display(obj);
1824        i915_gem_object_put(obj);
1825
1826        return 0;
1827}
1828
1829/**
1830 * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1831 *                       it is mapped to.
1832 * @dev: drm device
1833 * @data: ioctl data blob
1834 * @file: drm file
1835 *
1836 * While the mapping holds a reference on the contents of the object, it doesn't
1837 * imply a ref on the object itself.
1838 *
1839 * IMPORTANT:
1840 *
1841 * DRM driver writers who look at this function as an example for how to do GEM
1842 * mmap support, please don't implement mmap support like here. The modern way
1843 * to implement DRM mmap support is with an mmap offset ioctl (like
1844 * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1845 * That way debug tooling like valgrind will understand what's going on; hiding
1846 * the mmap call in a driver private ioctl will break that. The i915 driver only
1847 * does cpu mmaps this way because we didn't know better.
1848 */
1849int
1850i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1851                    struct drm_file *file)
1852{
1853        struct drm_i915_gem_mmap *args = data;
1854        struct drm_i915_gem_object *obj;
1855        unsigned long addr;
1856
1857        if (args->flags & ~(I915_MMAP_WC))
1858                return -EINVAL;
1859
1860        if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1861                return -ENODEV;
1862
1863        obj = i915_gem_object_lookup(file, args->handle);
1864        if (!obj)
1865                return -ENOENT;
1866
1867        /* prime objects have no backing filp to GEM mmap
1868         * pages from.
1869         */
1870        if (!obj->base.filp) {
1871                i915_gem_object_put(obj);
1872                return -ENXIO;
1873        }
1874
1875        addr = vm_mmap(obj->base.filp, 0, args->size,
1876                       PROT_READ | PROT_WRITE, MAP_SHARED,
1877                       args->offset);
1878        if (args->flags & I915_MMAP_WC) {
1879                struct mm_struct *mm = current->mm;
1880                struct vm_area_struct *vma;
1881
1882                if (down_write_killable(&mm->mmap_sem)) {
1883                        i915_gem_object_put(obj);
1884                        return -EINTR;
1885                }
1886                vma = find_vma(mm, addr);
1887                if (vma)
1888                        vma->vm_page_prot =
1889                                pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1890                else
1891                        addr = -ENOMEM;
1892                up_write(&mm->mmap_sem);
1893
1894                /* This may race, but that's ok, it only gets set */
1895                WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1896        }
1897        i915_gem_object_put(obj);
1898        if (IS_ERR((void *)addr))
1899                return addr;
1900
1901        args->addr_ptr = (uint64_t) addr;
1902
1903        return 0;
1904}
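
/*
 * Illustrative userspace usage of this legacy CPU mmap ioctl, requesting a
 * write-combining mapping where the hardware allows it (bo_handle, bo_size
 * and fd are assumptions; error handling is elided):
 *
 *	struct drm_i915_gem_mmap arg = {
 *		.handle = bo_handle,
 *		.size = bo_size,
 *		.flags = I915_MMAP_WC,
 *	};
 *	void *ptr = NULL;
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg) == 0)
 *		ptr = (void *)(uintptr_t)arg.addr_ptr;
 */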
1905
1906static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1907{
1908        return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1909}
1910
1911/**
1912 * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1913 *
1914 * A history of the GTT mmap interface:
1915 *
1916 * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1917 *     be aligned and suitable for fencing, and still fit into the available
1918 *     mappable space left by the pinned display objects. A classic problem
1919 *     was what we called the page-fault-of-doom, where we would ping-pong between
1920 *     two objects that could not fit inside the GTT and so the memcpy
1921 *     would page one object in at the expense of the other between every
1922 *     single byte.
1923 *
1924 * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1925 *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1926 *     object is too large for the available space (or simply too large
1927 *     for the mappable aperture!), a view is created instead and faulted
1928 *     into userspace. (This view is aligned and sized appropriately for
1929 *     fenced access.)
1930 *
1931 * 2 - Recognise WC as a separate cache domain so that we can flush the
1932 *     delayed writes via GTT before performing direct access via WC.
1933 *
1934 * Restrictions:
1935 *
1936 *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1937 *    hangs on some architectures, corruption on others. An attempt to service
1938 *    a GTT page fault from a snoopable object will generate a SIGBUS.
1939 *
1940 *  * the object must be able to fit into RAM (physical memory, though not
1941 *    limited to the mappable aperture).
1942 *
1943 *
1944 * Caveats:
1945 *
1946 *  * a new GTT page fault will synchronize rendering from the GPU and flush
1947 *    all data to system memory. Subsequent access will not be synchronized.
1948 *
1949 *  * all mappings are revoked on runtime device suspend.
1950 *
1951 *  * there are only 8, 16 or 32 fence registers to share between all users
1952 *    (older machines require a fence register for display and blitter access
1953 *    as well). Contention of the fence registers will cause the previous users
1954 *    to be unmapped and any new access will generate new page faults.
1955 *
1956 *  * running out of memory while servicing a fault may generate a SIGBUS,
1957 *    rather than the expected SIGSEGV.
1958 */
1959int i915_gem_mmap_gtt_version(void)
1960{
1961        return 2;
1962}
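
/*
 * Userspace can query this feature level with I915_PARAM_MMAP_GTT_VERSION;
 * a rough sketch (fd is assumed, errors ignored):
 *
 *	int gtt_version = 0;
 *	struct drm_i915_getparam gp = {
 *		.param = I915_PARAM_MMAP_GTT_VERSION,
 *		.value = &gtt_version,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
 */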
1963
1964static inline struct i915_ggtt_view
1965compute_partial_view(const struct drm_i915_gem_object *obj,
1966                     pgoff_t page_offset,
1967                     unsigned int chunk)
1968{
1969        struct i915_ggtt_view view;
1970
1971        if (i915_gem_object_is_tiled(obj))
1972                chunk = roundup(chunk, tile_row_pages(obj));
1973
1974        view.type = I915_GGTT_VIEW_PARTIAL;
1975        view.partial.offset = rounddown(page_offset, chunk);
1976        view.partial.size =
1977                min_t(unsigned int, chunk,
1978                      (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1979
1980        /* If the partial covers the entire object, just create a normal VMA. */
1981        if (chunk >= obj->base.size >> PAGE_SHIFT)
1982                view.type = I915_GGTT_VIEW_NORMAL;
1983
1984        return view;
1985}
1986
1987/**
1988 * i915_gem_fault - fault a page into the GTT
1989 * @vmf: fault info
1990 *
1991 * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1992 * from userspace.  The fault handler takes care of binding the object to
1993 * the GTT (if needed), allocating and programming a fence register (again,
1994 * only if needed based on whether the old reg is still valid or the object
1995 * is tiled) and inserting a new PTE into the faulting process.
1996 *
1997 * Note that the faulting process may involve evicting existing objects
1998 * from the GTT and/or fence registers to make room.  So performance may
1999 * suffer if the GTT working set is large or there are few fence registers
2000 * left.
2001 *
2002 * The current feature set supported by i915_gem_fault() and thus GTT mmaps
2003 * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
2004 */
2005vm_fault_t i915_gem_fault(struct vm_fault *vmf)
2006{
2007#define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
2008        struct vm_area_struct *area = vmf->vma;
2009        struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
2010        struct drm_device *dev = obj->base.dev;
2011        struct drm_i915_private *dev_priv = to_i915(dev);
2012        struct i915_ggtt *ggtt = &dev_priv->ggtt;
2013        bool write = area->vm_flags & VM_WRITE;
2014        struct i915_vma *vma;
2015        pgoff_t page_offset;
2016        int ret;
2017
2018        /* Sanity check that we allow writing into this object */
2019        if (i915_gem_object_is_readonly(obj) && write)
2020                return VM_FAULT_SIGBUS;
2021
2022        /* We don't use vmf->pgoff since that has the fake offset */
2023        page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
2024
2025        trace_i915_gem_object_fault(obj, page_offset, true, write);
2026
2027        /* Try to flush the object off the GPU first without holding the lock.
2028         * Upon acquiring the lock, we will perform our sanity checks and then
2029         * repeat the flush holding the lock in the normal manner to catch cases
2030         * where we are gazumped.
2031         */
2032        ret = i915_gem_object_wait(obj,
2033                                   I915_WAIT_INTERRUPTIBLE,
2034                                   MAX_SCHEDULE_TIMEOUT,
2035                                   NULL);
2036        if (ret)
2037                goto err;
2038
2039        ret = i915_gem_object_pin_pages(obj);
2040        if (ret)
2041                goto err;
2042
2043        intel_runtime_pm_get(dev_priv);
2044
2045        ret = i915_mutex_lock_interruptible(dev);
2046        if (ret)
2047                goto err_rpm;
2048
2049        /* Access to snoopable pages through the GTT is incoherent. */
2050        if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
2051                ret = -EFAULT;
2052                goto err_unlock;
2053        }
2054
2055
2056        /* Now pin it into the GTT as needed */
2057        vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
2058                                       PIN_MAPPABLE |
2059                                       PIN_NONBLOCK |
2060                                       PIN_NONFAULT);
2061        if (IS_ERR(vma)) {
2062                /* Use a partial view if it is bigger than available space */
2063                struct i915_ggtt_view view =
2064                        compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
2065                unsigned int flags;
2066
2067                flags = PIN_MAPPABLE;
2068                if (view.type == I915_GGTT_VIEW_NORMAL)
2069                        flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
2070
2071                /*
2072                 * Userspace is now writing through an untracked VMA, abandon
2073                 * all hope that the hardware is able to track future writes.
2074                 */
2075                obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
2076
2077                vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2078                if (IS_ERR(vma) && !view.type) {
2079                        flags = PIN_MAPPABLE;
2080                        view.type = I915_GGTT_VIEW_PARTIAL;
2081                        vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
2082                }
2083        }
2084        if (IS_ERR(vma)) {
2085                ret = PTR_ERR(vma);
2086                goto err_unlock;
2087        }
2088
2089        ret = i915_gem_object_set_to_gtt_domain(obj, write);
2090        if (ret)
2091                goto err_unpin;
2092
2093        ret = i915_vma_pin_fence(vma);
2094        if (ret)
2095                goto err_unpin;
2096
2097        /* Finally, remap it using the new GTT offset */
2098        ret = remap_io_mapping(area,
2099                               area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
2100                               (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
2101                               min_t(u64, vma->size, area->vm_end - area->vm_start),
2102                               &ggtt->iomap);
2103        if (ret)
2104                goto err_fence;
2105
2106        /* Mark as being mmapped into userspace for later revocation */
2107        assert_rpm_wakelock_held(dev_priv);
2108        if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
2109                list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
2110        GEM_BUG_ON(!obj->userfault_count);
2111
2112        i915_vma_set_ggtt_write(vma);
2113
2114err_fence:
2115        i915_vma_unpin_fence(vma);
2116err_unpin:
2117        __i915_vma_unpin(vma);
2118err_unlock:
2119        mutex_unlock(&dev->struct_mutex);
2120err_rpm:
2121        intel_runtime_pm_put(dev_priv);
2122        i915_gem_object_unpin_pages(obj);
2123err:
2124        switch (ret) {
2125        case -EIO:
2126                /*
2127                 * We eat errors when the gpu is terminally wedged to avoid
2128                 * userspace unduly crashing (gl has no provisions for mmaps to
2129                 * fail). But any other -EIO isn't ours (e.g. swap in failure)
2130                 * and so needs to be reported.
2131                 */
2132                if (!i915_terminally_wedged(&dev_priv->gpu_error))
2133                        return VM_FAULT_SIGBUS;
2134                /* else: fall through */
2135        case -EAGAIN:
2136                /*
2137                 * EAGAIN means the gpu is hung and we'll wait for the error
2138                 * handler to reset everything when re-faulting in
2139                 * i915_mutex_lock_interruptible.
2140                 */
2141        case 0:
2142        case -ERESTARTSYS:
2143        case -EINTR:
2144        case -EBUSY:
2145                /*
2146                 * EBUSY is ok: this just means that another thread
2147                 * already did the job.
2148                 */
2149                return VM_FAULT_NOPAGE;
2150        case -ENOMEM:
2151                return VM_FAULT_OOM;
2152        case -ENOSPC:
2153        case -EFAULT:
2154                return VM_FAULT_SIGBUS;
2155        default:
2156                WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2157                return VM_FAULT_SIGBUS;
2158        }
2159}
2160
2161static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2162{
2163        struct i915_vma *vma;
2164
2165        GEM_BUG_ON(!obj->userfault_count);
2166
2167        obj->userfault_count = 0;
2168        list_del(&obj->userfault_link);
2169        drm_vma_node_unmap(&obj->base.vma_node,
2170                           obj->base.dev->anon_inode->i_mapping);
2171
2172        for_each_ggtt_vma(vma, obj)
2173                i915_vma_unset_userfault(vma);
2174}
2175
2176/**
2177 * i915_gem_release_mmap - remove physical page mappings
2178 * @obj: obj in question
2179 *
2180 * Preserve the reservation of the mmapping with the DRM core code, but
2181 * relinquish ownership of the pages back to the system.
2182 *
2183 * It is vital that we remove the page mapping if we have mapped a tiled
2184 * object through the GTT and then lose the fence register due to
2185 * resource pressure. Similarly if the object has been moved out of the
2186 * aperture, then pages mapped into userspace must be revoked. Removing the
2187 * mapping will then trigger a page fault on the next user access, allowing
2188 * fixup by i915_gem_fault().
2189 */
2190void
2191i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2192{
2193        struct drm_i915_private *i915 = to_i915(obj->base.dev);
2194
2195        /* Serialisation between user GTT access and our code depends upon
2196         * revoking the CPU's PTE whilst the mutex is held. The next user
2197         * pagefault then has to wait until we release the mutex.
2198         *
2199         * Note that RPM complicates this somewhat by adding an additional
2200         * requirement that operations to the GGTT be made holding the RPM
2201         * wakeref.
2202         */
2203        lockdep_assert_held(&i915->drm.struct_mutex);
2204        intel_runtime_pm_get(i915);
2205
2206        if (!obj->userfault_count)
2207                goto out;
2208
2209        __i915_gem_object_release_mmap(obj);
2210
2211        /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2212         * memory transactions from userspace before we return. The TLB
2213         * flushing implied by changing the PTEs above *should* be
2214         * sufficient, an extra barrier here just provides us with a bit
2215         * of paranoid documentation about our requirement to serialise
2216         * memory writes before touching registers / GSM.
2217         */
2218        wmb();
2219
2220out:
2221        intel_runtime_pm_put(i915);
2222}
2223
2224void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2225{
2226        struct drm_i915_gem_object *obj, *on;
2227        int i;
2228
2229        /*
2230         * Only called during RPM suspend. All users of the userfault_list
2231         * must be holding an RPM wakeref to ensure that this can not
2232         * run concurrently with themselves (and use the struct_mutex for
2233         * protection between themselves).
2234         */
2235
2236        list_for_each_entry_safe(obj, on,
2237                                 &dev_priv->mm.userfault_list, userfault_link)
2238                __i915_gem_object_release_mmap(obj);
2239
2240        /* The fences will be lost when the device powers down. If any were
2241         * in use by hardware (i.e. they are pinned), we should not be powering
2242         * down! All other fences will be reacquired by the user upon waking.
2243         */
2244        for (i = 0; i < dev_priv->num_fence_regs; i++) {
2245                struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2246
2247                /* Ideally we want to assert that the fence register is not
2248                 * live at this point (i.e. that no piece of code will be
2249                 * trying to write through fence + GTT, as that not only violates
2250                 * our tracking of activity and associated locking/barriers,
2251                 * but also is illegal given that the hw is powered down).
2252                 *
2253                 * Previously we used reg->pin_count as a "liveness" indicator.
2254                 * That is not sufficient, and we need a more fine-grained
2255                 * tool if we want to have a sanity check here.
2256                 */
2257
2258                if (!reg->vma)
2259                        continue;
2260
2261                GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2262                reg->dirty = true;
2263        }
2264}
2265
2266static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2267{
2268        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2269        int err;
2270
2271        err = drm_gem_create_mmap_offset(&obj->base);
2272        if (likely(!err))
2273                return 0;
2274
2275        /* Attempt to reap some mmap space from dead objects */
2276        do {
2277                err = i915_gem_wait_for_idle(dev_priv,
2278                                             I915_WAIT_INTERRUPTIBLE,
2279                                             MAX_SCHEDULE_TIMEOUT);
2280                if (err)
2281                        break;
2282
2283                i915_gem_drain_freed_objects(dev_priv);
2284                err = drm_gem_create_mmap_offset(&obj->base);
2285                if (!err)
2286                        break;
2287
2288        } while (flush_delayed_work(&dev_priv->gt.retire_work));
2289
2290        return err;
2291}
2292
2293static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2294{
2295        drm_gem_free_mmap_offset(&obj->base);
2296}
2297
2298int
2299i915_gem_mmap_gtt(struct drm_file *file,
2300                  struct drm_device *dev,
2301                  uint32_t handle,
2302                  uint64_t *offset)
2303{
2304        struct drm_i915_gem_object *obj;
2305        int ret;
2306
2307        obj = i915_gem_object_lookup(file, handle);
2308        if (!obj)
2309                return -ENOENT;
2310
2311        ret = i915_gem_object_create_mmap_offset(obj);
2312        if (ret == 0)
2313                *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2314
2315        i915_gem_object_put(obj);
2316        return ret;
2317}
2318
2319/**
2320 * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2321 * @dev: DRM device
2322 * @data: GTT mapping ioctl data
2323 * @file: GEM object info
2324 *
2325 * Simply returns the fake offset to userspace so it can mmap it.
2326 * The mmap call will end up in drm_gem_mmap(), which will set things
2327 * up so we can get faults in the handler above.
2328 *
2329 * The fault handler will take care of binding the object into the GTT
2330 * (since it may have been evicted to make room for something), allocating
2331 * a fence register, and mapping the appropriate aperture address into
2332 * userspace.
2333 */
2334int
2335i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2336                        struct drm_file *file)
2337{
2338        struct drm_i915_gem_mmap_gtt *args = data;
2339
2340        return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2341}
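
/*
 * The full GTT mmap flow from userspace is therefore two steps: fetch the
 * fake offset with this ioctl, then mmap the DRM fd at that offset. A rough,
 * non-authoritative sketch (bo_handle, bo_size and fd are assumed, error
 * handling elided):
 *
 *	struct drm_i915_gem_mmap_gtt arg = { .handle = bo_handle };
 *	void *ptr;
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg);
 *	ptr = mmap(NULL, bo_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		   fd, arg.offset);
 */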
2342
2343/* Immediately discard the backing storage */
2344static void
2345i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2346{
2347        i915_gem_object_free_mmap_offset(obj);
2348
2349        if (obj->base.filp == NULL)
2350                return;
2351
2352        /* Our goal here is to return as much of the memory as
2353         * possible back to the system as we are called from OOM.
2354         * To do this we must instruct the shmfs to drop all of its
2355         * backing pages, *now*.
2356         */
2357        shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2358        obj->mm.madv = __I915_MADV_PURGED;
2359        obj->mm.pages = ERR_PTR(-EFAULT);
2360}
2361
2362/* Try to discard unwanted pages */
2363void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2364{
2365        struct address_space *mapping;
2366
2367        lockdep_assert_held(&obj->mm.lock);
2368        GEM_BUG_ON(i915_gem_object_has_pages(obj));
2369
2370        switch (obj->mm.madv) {
2371        case I915_MADV_DONTNEED:
2372                i915_gem_object_truncate(obj);
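                /* fall through */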
2373        case __I915_MADV_PURGED:
2374                return;
2375        }
2376
2377        if (obj->base.filp == NULL)
2378                return;
2379
2380        mapping = obj->base.filp->f_mapping;
2381        invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2382}
2383
2384static void
2385i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2386                              struct sg_table *pages)
2387{
2388        struct sgt_iter sgt_iter;
2389        struct page *page;
2390
2391        __i915_gem_object_release_shmem(obj, pages, true);
2392
2393        i915_gem_gtt_finish_pages(obj, pages);
2394
2395        if (i915_gem_object_needs_bit17_swizzle(obj))
2396                i915_gem_object_save_bit_17_swizzle(obj, pages);
2397
2398        for_each_sgt_page(page, sgt_iter, pages) {
2399                if (obj->mm.dirty)
2400                        set_page_dirty(page);
2401
2402                if (obj->mm.madv == I915_MADV_WILLNEED)
2403                        mark_page_accessed(page);
2404
2405                put_page(page);
2406        }
2407        obj->mm.dirty = false;
2408
2409        sg_free_table(pages);
2410        kfree(pages);
2411}
2412
2413static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2414{
2415        struct radix_tree_iter iter;
2416        void __rcu **slot;
2417
2418        rcu_read_lock();
2419        radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2420                radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2421        rcu_read_unlock();
2422}
2423
2424static struct sg_table *
2425__i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2426{
2427        struct drm_i915_private *i915 = to_i915(obj->base.dev);
2428        struct sg_table *pages;
2429
2430        pages = fetch_and_zero(&obj->mm.pages);
2431        if (!pages)
2432                return NULL;
2433
2434        spin_lock(&i915->mm.obj_lock);
2435        list_del(&obj->mm.link);
2436        spin_unlock(&i915->mm.obj_lock);
2437
2438        if (obj->mm.mapping) {
2439                void *ptr;
2440
2441                ptr = page_mask_bits(obj->mm.mapping);
2442                if (is_vmalloc_addr(ptr))
2443                        vunmap(ptr);
2444                else
2445                        kunmap(kmap_to_page(ptr));
2446
2447                obj->mm.mapping = NULL;
2448        }
2449
2450        __i915_gem_object_reset_page_iter(obj);
2451        obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2452
2453        return pages;
2454}
2455
2456void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2457                                 enum i915_mm_subclass subclass)
2458{
2459        struct sg_table *pages;
2460
2461        if (i915_gem_object_has_pinned_pages(obj))
2462                return;
2463
2464        GEM_BUG_ON(obj->bind_count);
2465        if (!i915_gem_object_has_pages(obj))
2466                return;
2467
2468        /* May be called by shrinker from within get_pages() (on another bo) */
2469        mutex_lock_nested(&obj->mm.lock, subclass);
2470        if (unlikely(atomic_read(&obj->mm.pages_pin_count)))
2471                goto unlock;
2472
2473        /*
2474         * ->put_pages might need to allocate memory for the bit17 swizzle
2475         * array, hence protect them from being reaped by removing them from gtt
2476         * lists early.
2477         */
2478        pages = __i915_gem_object_unset_pages(obj);
2479        if (!IS_ERR(pages))
2480                obj->ops->put_pages(obj, pages);
2481
2482unlock:
2483        mutex_unlock(&obj->mm.lock);
2484}
2485
2486static bool i915_sg_trim(struct sg_table *orig_st)
2487{
2488        struct sg_table new_st;
2489        struct scatterlist *sg, *new_sg;
2490        unsigned int i;
2491
2492        if (orig_st->nents == orig_st->orig_nents)
2493                return false;
2494
2495        if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2496                return false;
2497
2498        new_sg = new_st.sgl;
2499        for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2500                sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2501                sg_dma_address(new_sg) = sg_dma_address(sg);
2502                sg_dma_len(new_sg) = sg_dma_len(sg);
2503
2504                new_sg = sg_next(new_sg);
2505        }
2506        GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2507
2508        sg_free_table(orig_st);
2509
2510        *orig_st = new_st;
2511        return true;
2512}
2513
2514static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2515{
2516        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2517        const unsigned long page_count = obj->base.size / PAGE_SIZE;
2518        unsigned long i;
2519        struct address_space *mapping;
2520        struct sg_table *st;
2521        struct scatterlist *sg;
2522        struct sgt_iter sgt_iter;
2523        struct page *page;
2524        unsigned long last_pfn = 0;     /* suppress gcc warning */
2525        unsigned int max_segment = i915_sg_segment_size();
2526        unsigned int sg_page_sizes;
2527        gfp_t noreclaim;
2528        int ret;
2529
2530        /*
2531         * Assert that the object is not currently in any GPU domain. As it
2532         * wasn't in the GTT, there shouldn't be any way it could have been in
2533         * a GPU cache
2534         */
2535        GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2536        GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2537
2538        /*
2539         * If there's no chance of allocating enough pages for the whole
2540         * object, bail early.
2541         */
2542        if (page_count > totalram_pages)
2543                return -ENOMEM;
2544
2545        st = kmalloc(sizeof(*st), GFP_KERNEL);
2546        if (st == NULL)
2547                return -ENOMEM;
2548
2549rebuild_st:
2550        if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2551                kfree(st);
2552                return -ENOMEM;
2553        }
2554
2555        /*
2556         * Get the list of pages out of our struct file.  They'll be pinned
2557         * at this point until we release them.
2558         *
2559         * Fail silently without starting the shrinker
2560         */
2561        mapping = obj->base.filp->f_mapping;
2562        noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2563        noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2564
2565        sg = st->sgl;
2566        st->nents = 0;
2567        sg_page_sizes = 0;
2568        for (i = 0; i < page_count; i++) {
2569                const unsigned int shrink[] = {
2570                        I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2571                        0,
2572                }, *s = shrink;
2573                gfp_t gfp = noreclaim;
2574
2575                do {
2576                        page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2577                        if (likely(!IS_ERR(page)))
2578                                break;
2579
2580                        if (!*s) {
2581                                ret = PTR_ERR(page);
2582                                goto err_sg;
2583                        }
2584
2585                        i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2586                        cond_resched();
2587
2588                        /*
2589                         * We've tried hard to allocate the memory by reaping
2590                         * our own buffer, now let the real VM do its job and
2591                         * go down in flames if truly OOM.
2592                         *
2593                         * However, since graphics tend to be disposable,
2594                         * defer the oom here by reporting the ENOMEM back
2595                         * to userspace.
2596                         */
2597                        if (!*s) {
2598                                /* reclaim and warn, but no oom */
2599                                gfp = mapping_gfp_mask(mapping);
2600
2601                                /*
2602                                 * Our bo are always dirty and so we require
2603                                 * kswapd to reclaim our pages (direct reclaim
2604                                 * does not effectively begin pageout of our
2605                                 * buffers on its own). However, direct reclaim
2606                                 * only waits for kswapd when under allocation
2607                                 * congestion. So as a result __GFP_RECLAIM is
2608                                 * unreliable and fails to actually reclaim our
2609                                 * dirty pages -- unless you try over and over
2610                                 * again with !__GFP_NORETRY. However, we still
2611                                 * want to fail this allocation rather than
2612                                 * trigger the out-of-memory killer and for
2613                                 * this we want __GFP_RETRY_MAYFAIL.
2614                                 */
2615                                gfp |= __GFP_RETRY_MAYFAIL;
2616                        }
2617                } while (1);
2618
2619                if (!i ||
2620                    sg->length >= max_segment ||
2621                    page_to_pfn(page) != last_pfn + 1) {
2622                        if (i) {
2623                                sg_page_sizes |= sg->length;
2624                                sg = sg_next(sg);
2625                        }
2626                        st->nents++;
2627                        sg_set_page(sg, page, PAGE_SIZE, 0);
2628                } else {
2629                        sg->length += PAGE_SIZE;
2630                }
2631                last_pfn = page_to_pfn(page);
2632
2633                /* Check that the i965g/gm workaround works. */
2634                WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2635        }
2636        if (sg) { /* loop terminated early; short sg table */
2637                sg_page_sizes |= sg->length;
2638                sg_mark_end(sg);
2639        }
2640
2641        /* Trim unused sg entries to avoid wasting memory. */
2642        i915_sg_trim(st);
2643
2644        ret = i915_gem_gtt_prepare_pages(obj, st);
2645        if (ret) {
2646                /*
2647                 * DMA remapping failed? One possible cause is that
2648                 * it could not reserve enough large entries, asking
2649                 * for PAGE_SIZE chunks instead may be helpful.
2650                 */
2651                if (max_segment > PAGE_SIZE) {
2652                        for_each_sgt_page(page, sgt_iter, st)
2653                                put_page(page);
2654                        sg_free_table(st);
2655
2656                        max_segment = PAGE_SIZE;
2657                        goto rebuild_st;
2658                } else {
2659                        dev_warn(&dev_priv->drm.pdev->dev,
2660                                 "Failed to DMA remap %lu pages\n",
2661                                 page_count);
2662                        goto err_pages;
2663                }
2664        }
2665
2666        if (i915_gem_object_needs_bit17_swizzle(obj))
2667                i915_gem_object_do_bit_17_swizzle(obj, st);
2668
2669        __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2670
2671        return 0;
2672
2673err_sg:
2674        sg_mark_end(sg);
2675err_pages:
2676        for_each_sgt_page(page, sgt_iter, st)
2677                put_page(page);
2678        sg_free_table(st);
2679        kfree(st);
2680
2681        /*
2682         * shmemfs first checks if there is enough memory to allocate the page
2683         * and reports ENOSPC should there be insufficient memory, along with the usual
2684         * ENOMEM for a genuine allocation failure.
2685         *
2686         * We use ENOSPC in our driver to mean that we have run out of aperture
2687         * space and so want to translate the error from shmemfs back to our
2688         * usual understanding of ENOMEM.
2689         */
2690        if (ret == -ENOSPC)
2691                ret = -ENOMEM;
2692
2693        return ret;
2694}
2695
2696void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2697                                 struct sg_table *pages,
2698                                 unsigned int sg_page_sizes)
2699{
2700        struct drm_i915_private *i915 = to_i915(obj->base.dev);
2701        unsigned long supported = INTEL_INFO(i915)->page_sizes;
2702        int i;
2703
2704        lockdep_assert_held(&obj->mm.lock);
2705
2706        obj->mm.get_page.sg_pos = pages->sgl;
2707        obj->mm.get_page.sg_idx = 0;
2708
2709        obj->mm.pages = pages;
2710
2711        if (i915_gem_object_is_tiled(obj) &&
2712            i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2713                GEM_BUG_ON(obj->mm.quirked);
2714                __i915_gem_object_pin_pages(obj);
2715                obj->mm.quirked = true;
2716        }
2717
2718        GEM_BUG_ON(!sg_page_sizes);
2719        obj->mm.page_sizes.phys = sg_page_sizes;
2720
2721        /*
2722         * Calculate the supported page-sizes which fit into the given
2723         * sg_page_sizes. This will give us the page-sizes which we may be able
2724         * to use opportunistically when later inserting into the GTT. For
2725         * example if phys=2G, then in theory we should be able to use 1G, 2M,
2726         * 64K or 4K pages, although in practice this will depend on a number of
2727         * other factors.
2728         */
2729        obj->mm.page_sizes.sg = 0;
2730        for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2731                if (obj->mm.page_sizes.phys & ~0u << i)
2732                        obj->mm.page_sizes.sg |= BIT(i);
2733        }
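        /*
         * Worked example: with phys = 2M | 64K and a platform supporting
         * 4K, 64K and 2M pages, each supported size has some chunk of at
         * least that size present in phys, so sg ends up as 4K | 64K | 2M.
         */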
2734        GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2735
2736        spin_lock(&i915->mm.obj_lock);
2737        list_add(&obj->mm.link, &i915->mm.unbound_list);
2738        spin_unlock(&i915->mm.obj_lock);
2739}
2740
2741static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2742{
2743        int err;
2744
2745        if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2746                DRM_DEBUG("Attempting to obtain a purgeable object\n");
2747                return -EFAULT;
2748        }
2749
2750        err = obj->ops->get_pages(obj);
2751        GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2752
2753        return err;
2754}
2755
2756/* Ensure that the associated pages are gathered from the backing storage
2757 * and pinned into our object. i915_gem_object_pin_pages() may be called
2758 * multiple times before they are released by a single call to
2759 * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2760 * either as a result of memory pressure (reaping pages under the shrinker)
2761 * or as the object is itself released.
2762 */
2763int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2764{
2765        int err;
2766
2767        err = mutex_lock_interruptible(&obj->mm.lock);
2768        if (err)
2769                return err;
2770
2771        if (unlikely(!i915_gem_object_has_pages(obj))) {
2772                GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2773
2774                err = ____i915_gem_object_get_pages(obj);
2775                if (err)
2776                        goto unlock;
2777
2778                smp_mb__before_atomic();
2779        }
2780        atomic_inc(&obj->mm.pages_pin_count);
2781
2782unlock:
2783        mutex_unlock(&obj->mm.lock);
2784        return err;
2785}
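
/*
 * A typical in-kernel caller pairs the pin with an unpin once it has
 * finished with the backing store, e.g. (sketch only, error handling
 * abbreviated):
 *
 *	err = i915_gem_object_pin_pages(obj);
 *	if (err)
 *		return err;
 *	page = i915_gem_object_get_page(obj, 0);
 *	... use the page ...
 *	i915_gem_object_unpin_pages(obj);
 */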
2786
2787/* The 'mapping' part of i915_gem_object_pin_map() below */
2788static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2789                                 enum i915_map_type type)
2790{
2791        unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2792        struct sg_table *sgt = obj->mm.pages;
2793        struct sgt_iter sgt_iter;
2794        struct page *page;
2795        struct page *stack_pages[32];
2796        struct page **pages = stack_pages;
2797        unsigned long i = 0;
2798        pgprot_t pgprot;
2799        void *addr;
2800
2801        /* A single page can always be kmapped */
2802        if (n_pages == 1 && type == I915_MAP_WB)
2803                return kmap(sg_page(sgt->sgl));
2804
2805        if (n_pages > ARRAY_SIZE(stack_pages)) {
2806                /* Too big for stack -- allocate temporary array instead */
2807                pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2808                if (!pages)
2809                        return NULL;
2810        }
2811
2812        for_each_sgt_page(page, sgt_iter, sgt)
2813                pages[i++] = page;
2814
2815        /* Check that we have the expected number of pages */
2816        GEM_BUG_ON(i != n_pages);
2817
2818        switch (type) {
2819        default:
2820                MISSING_CASE(type);
2821                /* fallthrough to use PAGE_KERNEL anyway */
2822        case I915_MAP_WB:
2823                pgprot = PAGE_KERNEL;
2824                break;
2825        case I915_MAP_WC:
2826                pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2827                break;
2828        }
2829        addr = vmap(pages, n_pages, 0, pgprot);
2830
2831        if (pages != stack_pages)
2832                kvfree(pages);
2833
2834        return addr;
2835}
2836
2837/* get, pin, and map the pages of the object into kernel space */
2838void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2839                              enum i915_map_type type)
2840{
2841        enum i915_map_type has_type;
2842        bool pinned;
2843        void *ptr;
2844        int ret;
2845
2846        if (unlikely(!i915_gem_object_has_struct_page(obj)))
2847                return ERR_PTR(-ENXIO);
2848
2849        ret = mutex_lock_interruptible(&obj->mm.lock);
2850        if (ret)
2851                return ERR_PTR(ret);
2852
2853        pinned = !(type & I915_MAP_OVERRIDE);
2854        type &= ~I915_MAP_OVERRIDE;
2855
2856        if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2857                if (unlikely(!i915_gem_object_has_pages(obj))) {
2858                        GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2859
2860                        ret = ____i915_gem_object_get_pages(obj);
2861                        if (ret)
2862                                goto err_unlock;
2863
2864                        smp_mb__before_atomic();
2865                }
2866                atomic_inc(&obj->mm.pages_pin_count);
2867                pinned = false;
2868        }
2869        GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2870
2871        ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2872        if (ptr && has_type != type) {
2873                if (pinned) {
2874                        ret = -EBUSY;
2875                        goto err_unpin;
2876                }
2877
2878                if (is_vmalloc_addr(ptr))
2879                        vunmap(ptr);
2880                else
2881                        kunmap(kmap_to_page(ptr));
2882
2883                ptr = obj->mm.mapping = NULL;
2884        }
2885
2886        if (!ptr) {
2887                ptr = i915_gem_object_map(obj, type);
2888                if (!ptr) {
2889                        ret = -ENOMEM;
2890                        goto err_unpin;
2891                }
2892
2893                obj->mm.mapping = page_pack_bits(ptr, type);
2894        }
2895
2896out_unlock:
2897        mutex_unlock(&obj->mm.lock);
2898        return ptr;
2899
2900err_unpin:
2901        atomic_dec(&obj->mm.pages_pin_count);
2902err_unlock:
2903        ptr = ERR_PTR(ret);
2904        goto out_unlock;
2905}
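
/*
 * Expected calling pattern, as a hedged example (the WB mapping type and
 * the memset are purely illustrative):
 *
 *	void *vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 *
 *	if (IS_ERR(vaddr))
 *		return PTR_ERR(vaddr);
 *	memset(vaddr, 0, obj->base.size);
 *	i915_gem_object_unpin_map(obj);
 */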
2906
2907static int
2908i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2909                           const struct drm_i915_gem_pwrite *arg)
2910{
2911        struct address_space *mapping = obj->base.filp->f_mapping;
2912        char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2913        u64 remain, offset;
2914        unsigned int pg;
2915
2916        /* Before we instantiate/pin the backing store for our use, we
2917         * can prepopulate the shmemfs filp efficiently using a write into
2918         * the pagecache. We avoid the penalty of instantiating all the
2919         * pages, important if the user is just writing to a few and never
2920         * uses the object on the GPU, and using a direct write into shmemfs
2921         * allows it to avoid the cost of retrieving a page (either swapin
2922         * or clearing-before-use) before it is overwritten.
2923         */
2924        if (i915_gem_object_has_pages(obj))
2925                return -ENODEV;
2926
2927        if (obj->mm.madv != I915_MADV_WILLNEED)
2928                return -EFAULT;
2929
2930        /* Before the pages are instantiated the object is treated as being
2931         * in the CPU domain. The pages will be clflushed as required before
2932         * use, and we can freely write into the pages directly. If userspace
2933         * races pwrite with any other operation, corruption will ensue -
2934         * that is userspace's prerogative!
2935         */
2936
2937        remain = arg->size;
2938        offset = arg->offset;
2939        pg = offset_in_page(offset);
2940
2941        do {
2942                unsigned int len, unwritten;
2943                struct page *page;
2944                void *data, *vaddr;
2945                int err;
2946
2947                len = PAGE_SIZE - pg;
2948                if (len > remain)
2949                        len = remain;
2950
2951                err = pagecache_write_begin(obj->base.filp, mapping,
2952                                            offset, len, 0,
2953                                            &page, &data);
2954                if (err < 0)
2955                        return err;
2956
2957                vaddr = kmap(page);
2958                unwritten = copy_from_user(vaddr + pg, user_data, len);
2959                kunmap(page);
2960
2961                err = pagecache_write_end(obj->base.filp, mapping,
2962                                          offset, len, len - unwritten,
2963                                          page, data);
2964                if (err < 0)
2965                        return err;
2966
2967                if (unwritten)
2968                        return -EFAULT;
2969
2970                remain -= len;
2971                user_data += len;
2972                offset += len;
2973                pg = 0;
2974        } while (remain);
2975
2976        return 0;
2977}
2978
2979static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
2980                                        const struct i915_gem_context *ctx)
2981{
2982        unsigned int score;
2983        unsigned long prev_hang;
2984
2985        if (i915_gem_context_is_banned(ctx))
2986                score = I915_CLIENT_SCORE_CONTEXT_BAN;
2987        else
2988                score = 0;
2989
2990        prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
2991        if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
2992                score += I915_CLIENT_SCORE_HANG_FAST;
2993
2994        if (score) {
2995                atomic_add(score, &file_priv->ban_score);
2996
2997                DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
2998                                 ctx->name, score,
2999                                 atomic_read(&file_priv->ban_score));
3000        }
3001}
3002
3003static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
3004{
3005        unsigned int score;
3006        bool banned, bannable;
3007
3008        atomic_inc(&ctx->guilty_count);
3009
3010        bannable = i915_gem_context_is_bannable(ctx);
3011        score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
3012        banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
3013
3014        /* Cool contexts don't accumulate client ban score */
3015        if (!bannable)
3016                return;
3017
3018        if (banned) {
3019                DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
3020                                 ctx->name, atomic_read(&ctx->guilty_count),
3021                                 score);
3022                i915_gem_context_set_banned(ctx);
3023        }
3024
3025        if (!IS_ERR_OR_NULL(ctx->file_priv))
3026                i915_gem_client_mark_guilty(ctx->file_priv, ctx);
3027}
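
/*
 * Worked example of the scoring above (the numeric values are assumptions
 * for illustration only; the real constants are defined elsewhere in the
 * driver): with CONTEXT_SCORE_GUILTY = 10 and CONTEXT_SCORE_BAN_THRESHOLD
 * = 40, a bannable context is banned on its fourth guilty hang, at which
 * point i915_gem_client_mark_guilty() above also charges the client
 * I915_CLIENT_SCORE_CONTEXT_BAN towards its own ban score.
 */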
3028
3029static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
3030{
3031        atomic_inc(&ctx->active_count);
3032}
3033
3034struct i915_request *
3035i915_gem_find_active_request(struct intel_engine_cs *engine)
3036{
3037        struct i915_request *request, *active = NULL;
3038        unsigned long flags;
3039
3040        /*
3041         * We are called by the error capture, reset and to dump engine
3042         * state at random points in time. In particular, note that none of these
3043         * callers is strictly ordered with an interrupt. After a hang, the GPU is dead
3044         * and we assume that no more writes can happen (we waited long enough
3045         * for all writes that were in flight to be flushed) - adding an
3046         * extra delay for a recent interrupt is pointless. Hence, we do
3047         * not need an engine->irq_seqno_barrier() before the seqno reads.
3048         * At all other times, we must assume the GPU is still running, but
3049         * we only care about the snapshot of this moment.
3050         */
3051        spin_lock_irqsave(&engine->timeline.lock, flags);
3052        list_for_each_entry(request, &engine->timeline.requests, link) {
3053                if (__i915_request_completed(request, request->global_seqno))
3054                        continue;
3055
3056                active = request;
3057                break;
3058        }
3059        spin_unlock_irqrestore(&engine->timeline.lock, flags);
3060
3061        return active;
3062}
3063
3064/*
3065 * Ensure the irq handler finishes, and will not run again.
3066 * Also return the active request so that we only search for it once.
3067 */
3068struct i915_request *
3069i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
3070{
3071        struct i915_request *request;
3072
3073        /*
3074         * During the reset sequence, we must prevent the engine from
3075         * entering RC6. As the context state is undefined until we restart
3076         * the engine, if it does enter RC6 during the reset, the state
3077         * written to the powercontext is undefined and so we may lose
3078         * GPU state upon resume, i.e. fail to restart after a reset.
3079         */
3080        intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
3081
3082        request = engine->reset.prepare(engine);
3083        if (request && request->fence.error == -EIO)
3084                request = ERR_PTR(-EIO); /* Previous reset failed! */
3085
3086        return request;
3087}
3088
3089int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
3090{
3091        struct intel_engine_cs *engine;
3092        struct i915_request *request;
3093        enum intel_engine_id id;
3094        int err = 0;
3095
3096        for_each_engine(engine, dev_priv, id) {
3097                request = i915_gem_reset_prepare_engine(engine);
3098                if (IS_ERR(request)) {
3099                        err = PTR_ERR(request);
3100                        continue;
3101                }
3102
3103                engine->hangcheck.active_request = request;
3104        }
3105
3106        i915_gem_revoke_fences(dev_priv);
3107        intel_uc_sanitize(dev_priv);
3108
3109        return err;
3110}
3111
3112static void engine_skip_context(struct i915_request *request)
3113{
3114        struct intel_engine_cs *engine = request->engine;
3115        struct i915_gem_context *hung_ctx = request->gem_context;
3116        struct i915_timeline *timeline = request->timeline;
3117        unsigned long flags;
3118
3119        GEM_BUG_ON(timeline == &engine->timeline);
3120
3121        spin_lock_irqsave(&engine->timeline.lock, flags);
3122        spin_lock(&timeline->lock);
3123
3124        list_for_each_entry_continue(request, &engine->timeline.requests, link)
3125                if (request->gem_context == hung_ctx)
3126                        i915_request_skip(request, -EIO);
3127
3128        list_for_each_entry(request, &timeline->requests, link)
3129                i915_request_skip(request, -EIO);
3130
3131        spin_unlock(&timeline->lock);
3132        spin_unlock_irqrestore(&engine->timeline.lock, flags);
3133}
3134
3135/* Returns the request if it was guilty of the hang */
3136static struct i915_request *
3137i915_gem_reset_request(struct intel_engine_cs *engine,
3138                       struct i915_request *request,
3139                       bool stalled)
3140{
3141        /* The guilty request will get skipped on a hung engine.
3142         *
3143         * Users of client default contexts do not rely on logical
3144         * state preserved between batches so it is safe to execute
3145         * queued requests following the hang. Non-default contexts
3146         * rely on preserved state, so skipping a batch loses the
3147         * evolution of the state and it needs to be considered corrupted.
3148         * Executing more queued batches on top of corrupted state is
3149         * risky. But we take the risk by trying to advance through
3150         * the queued requests in order to make the client behaviour
3151         * more predictable around resets, by not throwing away a random
3152         * number of batches it has prepared for execution. Sophisticated
3153         * clients can use gem_reset_stats_ioctl and dma fence status
3154         * (exported via sync_file info ioctl on explicit fences) to observe
3155         * when they lose the context state and should rebuild accordingly.
3156         *
3157         * The context ban, and ultimately the client ban, mechanisms are safety
3158         * valves if client submission ends up resulting in nothing more than
3159         * subsequent hangs.
3160         */
3161
3162        if (i915_request_completed(request)) {
3163                GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
3164                          engine->name, request->global_seqno,
3165                          request->fence.context, request->fence.seqno,
3166                          intel_engine_get_seqno(engine));
3167                stalled = false;
3168        }
3169
3170        if (stalled) {
3171                i915_gem_context_mark_guilty(request->gem_context);
3172                i915_request_skip(request, -EIO);
3173
3174                /* If this context is now banned, skip all pending requests. */
3175                if (i915_gem_context_is_banned(request->gem_context))
3176                        engine_skip_context(request);
3177        } else {
3178                /*
3179                 * Since this is not the hung engine, it may have advanced
3180                 * since the hang declaration. Double check by refinding
3181                 * the active request at the time of the reset.
3182                 */
3183                request = i915_gem_find_active_request(engine);
3184                if (request) {
3185                        unsigned long flags;
3186
3187                        i915_gem_context_mark_innocent(request->gem_context);
3188                        dma_fence_set_error(&request->fence, -EAGAIN);
3189
3190                        /* Rewind the engine to replay the incomplete rq */
3191                        spin_lock_irqsave(&engine->timeline.lock, flags);
3192                        request = list_prev_entry(request, link);
3193                        if (&request->link == &engine->timeline.requests)
3194                                request = NULL;
3195                        spin_unlock_irqrestore(&engine->timeline.lock, flags);
3196                }
3197        }
3198
3199        return request;
3200}
3201
3202void i915_gem_reset_engine(struct intel_engine_cs *engine,
3203                           struct i915_request *request,
3204                           bool stalled)
3205{
3206        /*
3207         * Make sure this write is visible before we re-enable the interrupt
3208         * handlers on another CPU, as tasklet_enable() resolves to just
3209         * a compiler barrier which is insufficient for our purpose here.
3210         */
3211        smp_store_mb(engine->irq_posted, 0);
3212
3213        if (request)
3214                request = i915_gem_reset_request(engine, request, stalled);
3215
3216        /* Setup the CS to resume from the breadcrumb of the hung request */
3217        engine->reset.reset(engine, request);
3218}
3219
3220void i915_gem_reset(struct drm_i915_private *dev_priv,
3221                    unsigned int stalled_mask)
3222{
3223        struct intel_engine_cs *engine;
3224        enum intel_engine_id id;
3225
3226        lockdep_assert_held(&dev_priv->drm.struct_mutex);
3227
3228        i915_retire_requests(dev_priv);
3229
3230        for_each_engine(engine, dev_priv, id) {
3231                struct intel_context *ce;
3232
3233                i915_gem_reset_engine(engine,
3234                                      engine->hangcheck.active_request,
3235                                      stalled_mask & ENGINE_MASK(id));
3236                ce = fetch_and_zero(&engine->last_retired_context);
3237                if (ce)
3238                        intel_context_unpin(ce);
3239
3240                /*
3241                 * Ostensibly, we always want a context loaded for powersaving,
3242                 * so if the engine is idle after the reset, send a request
3243                 * to load our scratch kernel_context.
3244                 *
3245                 * More mysteriously, if we leave the engine idle after a reset,
3246                 * the next userspace batch may hang, with what appears to be
3247                 * an incoherent read by the CS (presumably stale TLB). An
3248                 * empty request appears sufficient to paper over the glitch.
3249                 */
3250                if (intel_engine_is_idle(engine)) {
3251                        struct i915_request *rq;
3252
3253                        rq = i915_request_alloc(engine,
3254                                                dev_priv->kernel_context);
3255                        if (!IS_ERR(rq))
3256                                i915_request_add(rq);
3257                }
3258        }
3259
3260        i915_gem_restore_fences(dev_priv);
3261}
3262
3263void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
3264{
3265        engine->reset.finish(engine);
3266
3267        intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
3268}
3269
3270void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
3271{
3272        struct intel_engine_cs *engine;
3273        enum intel_engine_id id;
3274
3275        lockdep_assert_held(&dev_priv->drm.struct_mutex);
3276
3277        for_each_engine(engine, dev_priv, id) {
3278                engine->hangcheck.active_request = NULL;
3279                i915_gem_reset_finish_engine(engine);
3280        }
3281}
3282
3283static void nop_submit_request(struct i915_request *request)
3284{
3285        GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3286                  request->engine->name,
3287                  request->fence.context, request->fence.seqno);
3288        dma_fence_set_error(&request->fence, -EIO);
3289
3290        i915_request_submit(request);
3291}
3292
3293static void nop_complete_submit_request(struct i915_request *request)
3294{
3295        unsigned long flags;
3296
3297        GEM_TRACE("%s fence %llx:%d -> -EIO\n",
3298                  request->engine->name,
3299                  request->fence.context, request->fence.seqno);
3300        dma_fence_set_error(&request->fence, -EIO);
3301
3302        spin_lock_irqsave(&request->engine->timeline.lock, flags);
3303        __i915_request_submit(request);
3304        intel_engine_init_global_seqno(request->engine, request->global_seqno);
3305        spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
3306}
3307
3308void i915_gem_set_wedged(struct drm_i915_private *i915)
3309{
3310        struct intel_engine_cs *engine;
3311        enum intel_engine_id id;
3312
3313        GEM_TRACE("start\n");
3314
3315        if (GEM_SHOW_DEBUG()) {
3316                struct drm_printer p = drm_debug_printer(__func__);
3317
3318                for_each_engine(engine, i915, id)
3319                        intel_engine_dump(engine, &p, "%s\n", engine->name);
3320        }
3321
3322        if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
3323                goto out;
3324
3325        /*
3326         * First, stop submission to hw, but do not yet complete requests by
3327         * rolling the global seqno forward (since this would complete requests
3328         * for which we haven't set the fence error to EIO yet).
3329         */
3330        for_each_engine(engine, i915, id) {
3331                i915_gem_reset_prepare_engine(engine);
3332
3333                engine->submit_request = nop_submit_request;
3334                engine->schedule = NULL;
3335        }
3336        i915->caps.scheduler = 0;
3337
3338        /* Even if the GPU reset fails, it should still stop the engines */
3339        if (INTEL_GEN(i915) >= 5)
3340                intel_gpu_reset(i915, ALL_ENGINES);
3341
3342        /*
3343         * Make sure no one is running the old callback before we proceed with
3344         * cancelling requests and resetting the completion tracking. Otherwise
3345         * we might submit a request to the hardware which never completes.
3346         */
3347        synchronize_rcu();
3348
3349        for_each_engine(engine, i915, id) {
3350                /* Mark all executing requests as skipped */
3351                engine->cancel_requests(engine);
3352
3353                /*
3354                 * Only once we've force-cancelled all in-flight requests can we
3355                 * start to complete all requests.
3356                 */
3357                engine->submit_request = nop_complete_submit_request;
3358        }
3359
3360        /*
3361         * Make sure no request can slip through without getting completed by
3362         * either this call here to intel_engine_init_global_seqno, or the one
3363         * in nop_complete_submit_request.
3364         */
3365        synchronize_rcu();
3366
3367        for_each_engine(engine, i915, id) {
3368                unsigned long flags;
3369
3370                /*
3371                 * Mark all pending requests as complete so that any concurrent
3372                 * (lockless) lookup doesn't try and wait upon the request as we
3373                 * reset it.
3374                 */
3375                spin_lock_irqsave(&engine->timeline.lock, flags);
3376                intel_engine_init_global_seqno(engine,
3377                                               intel_engine_last_submit(engine));
3378                spin_unlock_irqrestore(&engine->timeline.lock, flags);
3379
3380                i915_gem_reset_finish_engine(engine);
3381        }
3382
3383out:
3384        GEM_TRACE("end\n");
3385
3386        wake_up_all(&i915->gpu_error.reset_queue);
3387}
3388
3389bool i915_gem_unset_wedged(struct drm_i915_private *i915)
3390{
3391        struct i915_timeline *tl;
3392
3393        lockdep_assert_held(&i915->drm.struct_mutex);
3394        if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
3395                return true;
3396
3397        GEM_TRACE("start\n");
3398
3399        /*
3400         * Before unwedging, make sure that all pending operations
3401         * are flushed and errored out - we may have requests waiting upon
3402         * third party fences. We marked all inflight requests as EIO, and
3403         * every execbuf since then has returned EIO; for consistency we want all
3404         * the currently pending requests to also be marked as EIO, which
3405         * is done inside our nop_submit_request - and so we must wait.
3406         *
3407         * No more can be submitted until we reset the wedged bit.
3408         */
3409        list_for_each_entry(tl, &i915->gt.timelines, link) {
3410                struct i915_request *rq;
3411
3412                rq = i915_gem_active_peek(&tl->last_request,
3413                                          &i915->drm.struct_mutex);
3414                if (!rq)
3415                        continue;
3416
3417                /*
3418                 * We can't use our normal waiter as we want to
3419                 * avoid recursively trying to handle the current
3420                 * reset. The basic dma_fence_default_wait() installs
3421                 * a callback for dma_fence_signal(), which is
3422                 * triggered by our nop handler (indirectly: the
3423                 * callback enables the signaler thread, which is
3424                 * woken by nop_submit_request() advancing the seqno;
3425                 * once the seqno passes the fence, the signaler
3426                 * signals the fence and wakes us up).
3427                 */
3428                if (dma_fence_default_wait(&rq->fence, true,
3429                                           MAX_SCHEDULE_TIMEOUT) < 0)
3430                        return false;
3431        }
3432        i915_retire_requests(i915);
3433        GEM_BUG_ON(i915->gt.active_requests);
3434
3435        if (!intel_gpu_reset(i915, ALL_ENGINES))
3436                intel_engines_sanitize(i915);
3437
3438        /*
3439         * Undo nop_submit_request. We prevent all new i915 requests from
3440         * being queued (by disallowing execbuf whilst wedged) so having
3441         * waited for all active requests above, we know the system is idle
3442         * and do not have to worry about a thread being inside
3443         * engine->submit_request() as we swap over. So unlike installing
3444         * the nop_submit_request on reset, we can do this from normal
3445         * context and do not require stop_machine().
3446         */
3447        intel_engines_reset_default_submission(i915);
3448        i915_gem_contexts_lost(i915);
3449
3450        GEM_TRACE("end\n");
3451
3452        smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
3453        clear_bit(I915_WEDGED, &i915->gpu_error.flags);
3454
3455        return true;
3456}
3457
3458static void
3459i915_gem_retire_work_handler(struct work_struct *work)
3460{
3461        struct drm_i915_private *dev_priv =
3462                container_of(work, typeof(*dev_priv), gt.retire_work.work);
3463        struct drm_device *dev = &dev_priv->drm;
3464
3465        /* Come back later if the device is busy... */
3466        if (mutex_trylock(&dev->struct_mutex)) {
3467                i915_retire_requests(dev_priv);
3468                mutex_unlock(&dev->struct_mutex);
3469        }
3470
3471        /*
3472         * Keep the retire handler running until we are finally idle.
3473         * We do not need to do this test under locking as in the worst-case
3474         * we queue the retire worker once too often.
3475         */
3476        if (READ_ONCE(dev_priv->gt.awake))
3477                queue_delayed_work(dev_priv->wq,
3478                                   &dev_priv->gt.retire_work,
3479                                   round_jiffies_up_relative(HZ));
3480}
3481
3482static void shrink_caches(struct drm_i915_private *i915)
3483{
3484        /*
3485         * kmem_cache_shrink() discards empty slabs and reorders partially
3486         * filled slabs to prioritise allocating from the mostly full slabs,
3487         * with the aim of reducing fragmentation.
3488         */
3489        kmem_cache_shrink(i915->priorities);
3490        kmem_cache_shrink(i915->dependencies);
3491        kmem_cache_shrink(i915->requests);
3492        kmem_cache_shrink(i915->luts);
3493        kmem_cache_shrink(i915->vmas);
3494        kmem_cache_shrink(i915->objects);
3495}
3496
3497struct sleep_rcu_work {
3498        union {
3499                struct rcu_head rcu;
3500                struct work_struct work;
3501        };
3502        struct drm_i915_private *i915;
3503        unsigned int epoch;
3504};
3505
3506static inline bool
3507same_epoch(struct drm_i915_private *i915, unsigned int epoch)
3508{
3509        /*
3510         * There is a small chance that the epoch wrapped since we started
3511         * sleeping. If we assume that epoch is at least a u32, then it will
3512         * take at least 2^32 * 100ms for it to wrap, or over 13 years.
3513         */
3514        return epoch == READ_ONCE(i915->gt.epoch);
3515}
3516
3517static void __sleep_work(struct work_struct *work)
3518{
3519        struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3520        struct drm_i915_private *i915 = s->i915;
3521        unsigned int epoch = s->epoch;
3522
3523        kfree(s);
3524        if (same_epoch(i915, epoch))
3525                shrink_caches(i915);
3526}
3527
3528static void __sleep_rcu(struct rcu_head *rcu)
3529{
3530        struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3531        struct drm_i915_private *i915 = s->i915;
3532
3533        if (same_epoch(i915, s->epoch)) {
3534                INIT_WORK(&s->work, __sleep_work);
3535                queue_work(i915->wq, &s->work);
3536        } else {
3537                kfree(s);
3538        }
3539}
3540
3541static inline bool
3542new_requests_since_last_retire(const struct drm_i915_private *i915)
3543{
3544        return (READ_ONCE(i915->gt.active_requests) ||
3545                work_pending(&i915->gt.idle_work.work));
3546}
3547
3548static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3549{
3550        struct intel_engine_cs *engine;
3551        enum intel_engine_id id;
3552
3553        if (i915_terminally_wedged(&i915->gpu_error))
3554                return;
3555
3556        GEM_BUG_ON(i915->gt.active_requests);
3557        for_each_engine(engine, i915, id) {
3558                GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
3559                GEM_BUG_ON(engine->last_retired_context !=
3560                           to_intel_context(i915->kernel_context, engine));
3561        }
3562}
3563
3564static void
3565i915_gem_idle_work_handler(struct work_struct *work)
3566{
3567        struct drm_i915_private *dev_priv =
3568                container_of(work, typeof(*dev_priv), gt.idle_work.work);
3569        unsigned int epoch = I915_EPOCH_INVALID;
3570        bool rearm_hangcheck;
3571
3572        if (!READ_ONCE(dev_priv->gt.awake))
3573                return;
3574
3575        if (READ_ONCE(dev_priv->gt.active_requests))
3576                return;
3577
3578        /*
3579         * Flush out the last user context, leaving only the pinned
3580         * kernel context resident. When we are idling on the kernel_context,
3581         * no more new requests (with a context switch) are emitted and we
3582         * can finally rest. A consequence is that the idle work handler is
3583         * always called at least twice before idling (and if the system is
3584         * idle that implies a round trip through the retire worker).
3585         */
3586        mutex_lock(&dev_priv->drm.struct_mutex);
3587        i915_gem_switch_to_kernel_context(dev_priv);
3588        mutex_unlock(&dev_priv->drm.struct_mutex);
3589
3590        GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3591                  READ_ONCE(dev_priv->gt.active_requests));
3592
3593        /*
3594         * Wait for the last execlists context to complete, but bail out in case a
3595         * new request is submitted. As we don't trust the hardware, we
3596         * continue on if the wait times out. This is necessary to allow
3597         * the machine to suspend even if the hardware dies, and we will
3598         * try to recover in resume (after depriving the hardware of power,
3599         * it may be in a better mood).
3600         */
3601        __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3602                   intel_engines_are_idle(dev_priv),
3603                   I915_IDLE_ENGINES_TIMEOUT * 1000,
3604                   10, 500);
3605
3606        rearm_hangcheck =
3607                cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3608
3609        if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3610                /* Currently busy, come back later */
3611                mod_delayed_work(dev_priv->wq,
3612                                 &dev_priv->gt.idle_work,
3613                                 msecs_to_jiffies(50));
3614                goto out_rearm;
3615        }
3616
3617        /*
3618         * New request retired after this work handler started, extend active
3619         * period until next instance of the work.
3620         */
3621        if (new_requests_since_last_retire(dev_priv))
3622                goto out_unlock;
3623
3624        epoch = __i915_gem_park(dev_priv);
3625
3626        assert_kernel_context_is_current(dev_priv);
3627
3628        rearm_hangcheck = false;
3629out_unlock:
3630        mutex_unlock(&dev_priv->drm.struct_mutex);
3631
3632out_rearm:
3633        if (rearm_hangcheck) {
3634                GEM_BUG_ON(!dev_priv->gt.awake);
3635                i915_queue_hangcheck(dev_priv);
3636        }
3637
3638        /*
3639         * When we are idle, it is an opportune time to reap our caches.
3640         * However, we have many objects that utilise RCU and the ordered
3641         * i915->wq that this work is executing on. To try and flush any
3642         * pending frees now we are idle, we first wait for an RCU grace
3643         * period, and then queue a task (that will run last on the wq) to
3644         * shrink and re-optimize the caches.
3645         */
3646        if (same_epoch(dev_priv, epoch)) {
3647                struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3648                if (s) {
3649                        s->i915 = dev_priv;
3650                        s->epoch = epoch;
3651                        call_rcu(&s->rcu, __sleep_rcu);
3652                }
3653        }
3654}
3655
3656void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3657{
3658        struct drm_i915_private *i915 = to_i915(gem->dev);
3659        struct drm_i915_gem_object *obj = to_intel_bo(gem);
3660        struct drm_i915_file_private *fpriv = file->driver_priv;
3661        struct i915_lut_handle *lut, *ln;
3662
3663        mutex_lock(&i915->drm.struct_mutex);
3664
3665        list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3666                struct i915_gem_context *ctx = lut->ctx;
3667                struct i915_vma *vma;
3668
3669                GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3670                if (ctx->file_priv != fpriv)
3671                        continue;
3672
3673                vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3674                GEM_BUG_ON(vma->obj != obj);
3675
3676                /* We allow the process to have multiple handles to the same
3677                 * vma, in the same fd namespace, by virtue of flink/open.
3678                 */
3679                GEM_BUG_ON(!vma->open_count);
3680                if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3681                        i915_vma_close(vma);
3682
3683                list_del(&lut->obj_link);
3684                list_del(&lut->ctx_link);
3685
3686                kmem_cache_free(i915->luts, lut);
3687                __i915_gem_object_release_unless_active(obj);
3688        }
3689
3690        mutex_unlock(&i915->drm.struct_mutex);
3691}
3692
3693static unsigned long to_wait_timeout(s64 timeout_ns)
3694{
3695        if (timeout_ns < 0)
3696                return MAX_SCHEDULE_TIMEOUT;
3697
3698        if (timeout_ns == 0)
3699                return 0;
3700
3701        return nsecs_to_jiffies_timeout(timeout_ns);
3702}
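
/*
 * For example (illustrative): a userspace timeout_ns of -1 waits
 * indefinitely (MAX_SCHEDULE_TIMEOUT), 0 merely polls for completion, and a
 * positive value is converted to a jiffy count, rounded so that we never
 * wait for less than the caller requested.
 */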
3703
3704/**
3705 * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3706 * @dev: drm device pointer
3707 * @data: ioctl data blob
3708 * @file: drm file pointer
3709 *
3710 * Returns 0 if successful, else an error is returned with the remaining time in
3711 * the timeout parameter.
3712 *  -ETIME: object is still busy after timeout
3713 *  -ERESTARTSYS: signal interrupted the wait
3714 *  -ENOENT: object doesn't exist
3715 * Also possible, but rare:
3716 *  -EAGAIN: incomplete, restart syscall
3717 *  -ENOMEM: damn
3718 *  -ENODEV: Internal IRQ fail
3719 *  -E?: The add request failed
3720 *
3721 * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3722 * non-zero timeout parameter the wait ioctl will wait for the given number of
3723 * nanoseconds on an object becoming unbusy. Since the wait itself does so
3724 * without holding struct_mutex the object may become re-busied before this
3725 * function completes. A similar but shorter * race condition exists in the busy
3726 * ioctl
3727 */
3728int
3729i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3730{
3731        struct drm_i915_gem_wait *args = data;
3732        struct drm_i915_gem_object *obj;
3733        ktime_t start;
3734        long ret;
3735
3736        if (args->flags != 0)
3737                return -EINVAL;
3738
3739        obj = i915_gem_object_lookup(file, args->bo_handle);
3740        if (!obj)
3741                return -ENOENT;
3742
3743        start = ktime_get();
3744
3745        ret = i915_gem_object_wait(obj,
3746                                   I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
3747                                   to_wait_timeout(args->timeout_ns),
3748                                   to_rps_client(file));
3749
3750        if (args->timeout_ns > 0) {
3751                args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3752                if (args->timeout_ns < 0)
3753                        args->timeout_ns = 0;
3754
3755                /*
3756                 * Apparently ktime isn't accurate enough and occasionally has a
3757                 * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3758                 * things up to make the test happy. We allow up to 1 jiffy.
3759                 *
3760                 * This is a regression from the timespec->ktime conversion.
3761                 */
3762                if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3763                        args->timeout_ns = 0;
3764
3765                /* Asked to wait beyond the jiffie/scheduler precision? */
3766                if (ret == -ETIME && args->timeout_ns)
3767                        ret = -EAGAIN;
3768        }
3769
3770        i915_gem_object_put(obj);
3771        return ret;
3772}
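
/*
 * Userspace sketch (illustrative only, not part of the driver): polling a
 * buffer with DRM_IOCTL_I915_GEM_WAIT via libdrm's drmIoctl(), where fd and
 * handle are assumed to be an open i915 fd and a valid GEM handle:
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = handle,
 *		.flags = 0,
 *		.timeout_ns = 0,	// zero timeout == just query busyness
 *	};
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait) == -1 &&
 *	    errno == ETIME)
 *		;	// object is still busy
 */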
3773
3774static long wait_for_timeline(struct i915_timeline *tl,
3775                              unsigned int flags, long timeout)
3776{
3777        struct i915_request *rq;
3778
3779        rq = i915_gem_active_get_unlocked(&tl->last_request);
3780        if (!rq)
3781                return timeout;
3782
3783        /*
3784         * "Race-to-idle".
3785         *
3786         * Switching to the kernel context is often used as a synchronous
3787         * step prior to idling, e.g. in suspend for flushing all
3788         * current operations to memory before sleeping. These we
3789         * want to complete as quickly as possible to avoid prolonged
3790         * stalls, so allow the gpu to boost to maximum clocks.
3791         */
3792        if (flags & I915_WAIT_FOR_IDLE_BOOST)
3793                gen6_rps_boost(rq, NULL);
3794
3795        timeout = i915_request_wait(rq, flags, timeout);
3796        i915_request_put(rq);
3797
3798        return timeout;
3799}
3800
3801static int wait_for_engines(struct drm_i915_private *i915)
3802{
3803        if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3804                dev_err(i915->drm.dev,
3805                        "Failed to idle engines, declaring wedged!\n");
3806                GEM_TRACE_DUMP();
3807                i915_gem_set_wedged(i915);
3808                return -EIO;
3809        }
3810
3811        return 0;
3812}
3813
3814int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3815                           unsigned int flags, long timeout)
3816{
3817        GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3818                  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3819                  timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3820
3821        /* If the device is asleep, we have no requests outstanding */
3822        if (!READ_ONCE(i915->gt.awake))
3823                return 0;
3824
3825        if (flags & I915_WAIT_LOCKED) {
3826                struct i915_timeline *tl;
3827                int err;
3828
3829                lockdep_assert_held(&i915->drm.struct_mutex);
3830
3831                list_for_each_entry(tl, &i915->gt.timelines, link) {
3832                        timeout = wait_for_timeline(tl, flags, timeout);
3833                        if (timeout < 0)
3834                                return timeout;
3835                }
3836                if (GEM_SHOW_DEBUG() && !timeout) {
3837                        /* Presume that timeout was non-zero to begin with! */
3838                        dev_warn(&i915->drm.pdev->dev,
3839                                 "Missed idle-completion interrupt!\n");
3840                        GEM_TRACE_DUMP();
3841                }
3842
3843                err = wait_for_engines(i915);
3844                if (err)
3845                        return err;
3846
3847                i915_retire_requests(i915);
3848                GEM_BUG_ON(i915->gt.active_requests);
3849        } else {
3850                struct intel_engine_cs *engine;
3851                enum intel_engine_id id;
3852
3853                for_each_engine(engine, i915, id) {
3854                        struct i915_timeline *tl = &engine->timeline;
3855
3856                        timeout = wait_for_timeline(tl, flags, timeout);
3857                        if (timeout < 0)
3858                                return timeout;
3859                }
3860        }
3861
3862        return 0;
3863}
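
/*
 * Typical invocation sketch (illustrative): a caller that already holds
 * struct_mutex, e.g. on the suspend path, waits with the lock held and lets
 * the wait boost clocks to race to idle:
 *
 *	err = i915_gem_wait_for_idle(i915,
 *				     I915_WAIT_INTERRUPTIBLE |
 *				     I915_WAIT_LOCKED |
 *				     I915_WAIT_FOR_IDLE_BOOST,
 *				     MAX_SCHEDULE_TIMEOUT);
 */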
3864
3865static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3866{
3867        /*
3868         * We manually flush the CPU domain so that we can override and
3869         * force the flush for the display, and perform it asynchronously.
3870         */
3871        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3872        if (obj->cache_dirty)
3873                i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3874        obj->write_domain = 0;
3875}
3876
3877void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3878{
3879        if (!READ_ONCE(obj->pin_global))
3880                return;
3881
3882        mutex_lock(&obj->base.dev->struct_mutex);
3883        __i915_gem_object_flush_for_display(obj);
3884        mutex_unlock(&obj->base.dev->struct_mutex);
3885}
3886
3887/**
3888 * Moves a single object to the WC read, and possibly write domain.
3889 * @obj: object to act on
3890 * @write: ask for write access or read only
3891 *
3892 * This function returns when the move is complete, including waiting on
3893 * flushes to occur.
3894 */
3895int
3896i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3897{
3898        int ret;
3899
3900        lockdep_assert_held(&obj->base.dev->struct_mutex);
3901
3902        ret = i915_gem_object_wait(obj,
3903                                   I915_WAIT_INTERRUPTIBLE |
3904                                   I915_WAIT_LOCKED |
3905                                   (write ? I915_WAIT_ALL : 0),
3906                                   MAX_SCHEDULE_TIMEOUT,
3907                                   NULL);
3908        if (ret)
3909                return ret;
3910
3911        if (obj->write_domain == I915_GEM_DOMAIN_WC)
3912                return 0;
3913
3914        /* Flush and acquire obj->pages so that we are coherent through
3915         * direct access in memory with previous cached writes through
3916         * shmemfs and that our cache domain tracking remains valid.
3917         * For example, if the obj->filp was moved to swap without us
3918         * being notified and releasing the pages, we would mistakenly
3919         * continue to assume that the obj remained out of the CPU cached
3920         * domain.
3921         */
3922        ret = i915_gem_object_pin_pages(obj);
3923        if (ret)
3924                return ret;
3925
3926        flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3927
3928        /* Serialise direct access to this object with the barriers for
3929         * coherent writes from the GPU, by effectively invalidating the
3930         * WC domain upon first access.
3931         */
3932        if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3933                mb();
3934
3935        /* It should now be out of any other write domains, and we can update
3936         * the domain values for our changes.
3937         */
3938        GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3939        obj->read_domains |= I915_GEM_DOMAIN_WC;
3940        if (write) {
3941                obj->read_domains = I915_GEM_DOMAIN_WC;
3942                obj->write_domain = I915_GEM_DOMAIN_WC;
3943                obj->mm.dirty = true;
3944        }
3945
3946        i915_gem_object_unpin_pages(obj);
3947        return 0;
3948}
3949
3950/**
3951 * Moves a single object to the GTT read, and possibly write domain.
3952 * @obj: object to act on
3953 * @write: ask for write access or read only
3954 *
3955 * This function returns when the move is complete, including waiting on
3956 * flushes to occur.
3957 */
3958int
3959i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3960{
3961        int ret;
3962
3963        lockdep_assert_held(&obj->base.dev->struct_mutex);
3964
3965        ret = i915_gem_object_wait(obj,
3966                                   I915_WAIT_INTERRUPTIBLE |
3967                                   I915_WAIT_LOCKED |
3968                                   (write ? I915_WAIT_ALL : 0),
3969                                   MAX_SCHEDULE_TIMEOUT,
3970                                   NULL);
3971        if (ret)
3972                return ret;
3973
3974        if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3975                return 0;
3976
3977        /* Flush and acquire obj->pages so that we are coherent through
3978         * direct access in memory with previous cached writes through
3979         * shmemfs and that our cache domain tracking remains valid.
3980         * For example, if the obj->filp was moved to swap without us
3981         * being notified and releasing the pages, we would mistakenly
3982         * continue to assume that the obj remained out of the CPU cached
3983         * domain.
3984         */
3985        ret = i915_gem_object_pin_pages(obj);
3986        if (ret)
3987                return ret;
3988
3989        flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3990
3991        /* Serialise direct access to this object with the barriers for
3992         * coherent writes from the GPU, by effectively invalidating the
3993         * GTT domain upon first access.
3994         */
3995        if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3996                mb();
3997
3998        /* It should now be out of any other write domains, and we can update
3999         * the domain values for our changes.
4000         */
4001        GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
4002        obj->read_domains |= I915_GEM_DOMAIN_GTT;
4003        if (write) {
4004                obj->read_domains = I915_GEM_DOMAIN_GTT;
4005                obj->write_domain = I915_GEM_DOMAIN_GTT;
4006                obj->mm.dirty = true;
4007        }
4008
4009        i915_gem_object_unpin_pages(obj);
4010        return 0;
4011}
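
/*
 * Usage sketch (illustrative): a caller wanting coherent access through a
 * GTT mapping moves the object into the GTT write domain under struct_mutex,
 * which flushes any conflicting write domain and waits for outstanding GPU
 * activity as above:
 *
 *	mutex_lock(&obj->base.dev->struct_mutex);
 *	err = i915_gem_object_set_to_gtt_domain(obj, true);
 *	mutex_unlock(&obj->base.dev->struct_mutex);
 */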
4012
4013/**
4014 * Changes the cache-level of an object across all VMA.
4015 * @obj: object to act on
4016 * @cache_level: new cache level to set for the object
4017 *
4018 * After this function returns, the object will be in the new cache-level
4019 * across all GTT and the contents of the backing storage will be coherent,
4020 * with respect to the new cache-level. In order to keep the backing storage
4021 * coherent for all users, we only allow a single cache level to be set
4022 * globally on the object and prevent it from being changed whilst the
4023 * hardware is reading from the object. That is, if the object is currently
4024 * on the scanout it will be set to uncached (or equivalent display
4025 * cache coherency) and all non-MOCS GPU access will also be uncached so
4026 * that all direct access to the scanout remains coherent.
4027 */
4028int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
4029                                    enum i915_cache_level cache_level)
4030{
4031        struct i915_vma *vma;
4032        int ret;
4033
4034        lockdep_assert_held(&obj->base.dev->struct_mutex);
4035
4036        if (obj->cache_level == cache_level)
4037                return 0;
4038
4039        /* Inspect the list of currently bound VMA and unbind any that would
4040         * be invalid given the new cache-level. This is principally to
4041         * catch the issue of the CS prefetch crossing page boundaries and
4042         * reading an invalid PTE on older architectures.
4043         */
4044restart:
4045        list_for_each_entry(vma, &obj->vma_list, obj_link) {
4046                if (!drm_mm_node_allocated(&vma->node))
4047                        continue;
4048
4049                if (i915_vma_is_pinned(vma)) {
4050                        DRM_DEBUG("can not change the cache level of pinned objects\n");
4051                        return -EBUSY;
4052                }
4053
4054                if (!i915_vma_is_closed(vma) &&
4055                    i915_gem_valid_gtt_space(vma, cache_level))
4056                        continue;
4057
4058                ret = i915_vma_unbind(vma);
4059                if (ret)
4060                        return ret;
4061
4062                /* As unbinding may affect other elements in the
4063                 * obj->vma_list (due to side-effects from retiring
4064                 * an active vma), play safe and restart the iterator.
4065                 */
4066                goto restart;
4067        }
4068
4069        /* We can reuse the existing drm_mm nodes but need to change the
4070         * cache-level on the PTE. We could simply unbind them all and
4071         * rebind with the correct cache-level on next use. However since
4072         * we already have a valid slot, dma mapping, pages etc, we may as well
4073         * rewrite the PTE in the belief that doing so tramples upon less
4074         * state and so involves less work.
4075         */
4076        if (obj->bind_count) {
4077                /* Before we change the PTE, the GPU must not be accessing it.
4078                 * If we wait upon the object, we know that all the bound
4079                 * VMA are no longer active.
4080                 */
4081                ret = i915_gem_object_wait(obj,
4082                                           I915_WAIT_INTERRUPTIBLE |
4083                                           I915_WAIT_LOCKED |
4084                                           I915_WAIT_ALL,
4085                                           MAX_SCHEDULE_TIMEOUT,
4086                                           NULL);
4087                if (ret)
4088                        return ret;
4089
4090                if (!HAS_LLC(to_i915(obj->base.dev)) &&
4091                    cache_level != I915_CACHE_NONE) {
4092                        /* Access to snoopable pages through the GTT is
4093                         * incoherent and on some machines causes a hard
4094                         * lockup. Relinquish the CPU mmapping to force
4095                         * userspace to refault in the pages and we can
4096                         * then double check if the GTT mapping is still
4097                         * valid for that pointer access.
4098                         */
4099                        i915_gem_release_mmap(obj);
4100
4101                        /* As we no longer need a fence for GTT access,
4102                         * we can relinquish it now (and so prevent having
4103                         * to steal a fence from someone else on the next
4104                         * fence request). Note GPU activity would have
4105                         * dropped the fence as all snoopable access is
4106                         * supposed to be linear.
4107                         */
4108                        for_each_ggtt_vma(vma, obj) {
4109                                ret = i915_vma_put_fence(vma);
4110                                if (ret)
4111                                        return ret;
4112                        }
4113                } else {
4114                        /* We either have incoherent backing store and
4115                         * so no GTT access or the architecture is fully
4116                         * coherent. In such cases, existing GTT mmaps
4117                         * ignore the cache bit in the PTE and we can
4118                         * rewrite it without confusing the GPU or having
4119                         * to force userspace to fault back in its mmaps.
4120                         */
4121                }
4122
4123                list_for_each_entry(vma, &obj->vma_list, obj_link) {
4124                        if (!drm_mm_node_allocated(&vma->node))
4125                                continue;
4126
4127                        ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
4128                        if (ret)
4129                                return ret;
4130                }
4131        }
4132
4133        list_for_each_entry(vma, &obj->vma_list, obj_link)
4134                vma->node.color = cache_level;
4135        i915_gem_object_set_cache_coherency(obj, cache_level);
4136        obj->cache_dirty = true; /* Always invalidate stale cachelines */
4137
4138        return 0;
4139}
4140
4141int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
4142                               struct drm_file *file)
4143{
4144        struct drm_i915_gem_caching *args = data;
4145        struct drm_i915_gem_object *obj;
4146        int err = 0;
4147
4148        rcu_read_lock();
4149        obj = i915_gem_object_lookup_rcu(file, args->handle);
4150        if (!obj) {
4151                err = -ENOENT;
4152                goto out;
4153        }
4154
4155        switch (obj->cache_level) {
4156        case I915_CACHE_LLC:
4157        case I915_CACHE_L3_LLC:
4158                args->caching = I915_CACHING_CACHED;
4159                break;
4160
4161        case I915_CACHE_WT:
4162                args->caching = I915_CACHING_DISPLAY;
4163                break;
4164
4165        default:
4166                args->caching = I915_CACHING_NONE;
4167                break;
4168        }
4169out:
4170        rcu_read_unlock();
4171        return err;
4172}
4173
4174int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
4175                               struct drm_file *file)
4176{
4177        struct drm_i915_private *i915 = to_i915(dev);
4178        struct drm_i915_gem_caching *args = data;
4179        struct drm_i915_gem_object *obj;
4180        enum i915_cache_level level;
4181        int ret = 0;
4182
4183        switch (args->caching) {
4184        case I915_CACHING_NONE:
4185                level = I915_CACHE_NONE;
4186                break;
4187        case I915_CACHING_CACHED:
4188                /*
4189                 * Due to a HW issue on BXT A stepping, GPU stores via a
4190                 * snooped mapping may leave stale data in a corresponding CPU
4191                 * cacheline, whereas normally such cachelines would get
4192                 * invalidated.
4193                 */
4194                if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
4195                        return -ENODEV;
4196
4197                level = I915_CACHE_LLC;
4198                break;
4199        case I915_CACHING_DISPLAY:
4200                level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
4201                break;
4202        default:
4203                return -EINVAL;
4204        }
4205
4206        obj = i915_gem_object_lookup(file, args->handle);
4207        if (!obj)
4208                return -ENOENT;
4209
4210        /*
4211         * The caching mode of a proxy object is handled by its generator, and
4212         * not allowed to be changed by userspace.
4213         */
4214        if (i915_gem_object_is_proxy(obj)) {
4215                ret = -ENXIO;
4216                goto out;
4217        }
4218
4219        if (obj->cache_level == level)
4220                goto out;
4221
4222        ret = i915_gem_object_wait(obj,
4223                                   I915_WAIT_INTERRUPTIBLE,
4224                                   MAX_SCHEDULE_TIMEOUT,
4225                                   to_rps_client(file));
4226        if (ret)
4227                goto out;
4228
4229        ret = i915_mutex_lock_interruptible(dev);
4230        if (ret)
4231                goto out;
4232
4233        ret = i915_gem_object_set_cache_level(obj, level);
4234        mutex_unlock(&dev->struct_mutex);
4235
4236out:
4237        i915_gem_object_put(obj);
4238        return ret;
4239}
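
/*
 * Userspace sketch (illustrative only, not part of the driver): requesting
 * uncached backing for a buffer through DRM_IOCTL_I915_GEM_SET_CACHING,
 * where fd and handle are assumed to be an open i915 fd and a valid GEM
 * handle:
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = handle,
 *		.caching = I915_CACHING_NONE,
 *	};
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == -1)
 *		;	// errors mirror the ioctl above, e.g. ENOENT
 */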
4240
4241/*
4242 * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
4243 * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
4244 * (for pageflips). We only flush the caches while preparing the buffer for
4245 * display, the callers are responsible for frontbuffer flush.
4246 */
4247struct i915_vma *
4248i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
4249                                     u32 alignment,
4250                                     const struct i915_ggtt_view *view,
4251                                     unsigned int flags)
4252{
4253        struct i915_vma *vma;
4254        int ret;
4255
4256        lockdep_assert_held(&obj->base.dev->struct_mutex);
4257
4258        /* Mark the global pin early so that we account for the
4259         * display coherency whilst setting up the cache domains.
4260         */
4261        obj->pin_global++;
4262
4263        /* The display engine is not coherent with the LLC cache on gen6.  As
4264         * a result, we make sure that the pinning that is about to occur is
4265         * done with uncached PTEs. This is lowest common denominator for all
4266         * chipsets.
4267         *
4268         * However for gen6+, we could do better by using the GFDT bit instead
4269         * of uncaching, which would allow us to flush all the LLC-cached data
4270         * with that bit in the PTE to main memory with just one PIPE_CONTROL.
4271         */
4272        ret = i915_gem_object_set_cache_level(obj,
4273                                              HAS_WT(to_i915(obj->base.dev)) ?
4274                                              I915_CACHE_WT : I915_CACHE_NONE);
4275        if (ret) {
4276                vma = ERR_PTR(ret);
4277                goto err_unpin_global;
4278        }
4279
4280        /* As the user may map the buffer once pinned in the display plane
4281         * (e.g. libkms for the bootup splash), we have to ensure that we
4282         * always use map_and_fenceable for all scanout buffers. However,
4283         * it may simply be too big to fit into mappable, in which case
4284         * put it anyway and hope that userspace can cope (but always first
4285         * try to preserve the existing ABI).
4286         */
4287        vma = ERR_PTR(-ENOSPC);
4288        if ((flags & PIN_MAPPABLE) == 0 &&
4289            (!view || view->type == I915_GGTT_VIEW_NORMAL))
4290                vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
4291                                               flags |
4292                                               PIN_MAPPABLE |
4293                                               PIN_NONBLOCK);
4294        if (IS_ERR(vma))
4295                vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
4296        if (IS_ERR(vma))
4297                goto err_unpin_global;
4298
4299        vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
4300
4301        __i915_gem_object_flush_for_display(obj);
4302
4303        /* It should now be out of any other write domains, and we can update
4304         * the domain values for our changes.
4305         */
4306        obj->read_domains |= I915_GEM_DOMAIN_GTT;
4307
4308        return vma;
4309
4310err_unpin_global:
4311        obj->pin_global--;
4312        return vma;
4313}
4314
4315void
4316i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
4317{
4318        lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
4319
4320        if (WARN_ON(vma->obj->pin_global == 0))
4321                return;
4322
4323        if (--vma->obj->pin_global == 0)
4324                vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
4325
4326        /* Bump the LRU to try and avoid premature eviction whilst flipping  */
4327        i915_gem_object_bump_inactive_ggtt(vma->obj);
4328
4329        i915_vma_unpin(vma);
4330}
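
/*
 * Pairing sketch (illustrative): display code pins a framebuffer for scanout
 * with the helper above and releases it once the flip completes; the caller
 * remains responsible for the frontbuffer flush:
 *
 *	vma = i915_gem_object_pin_to_display_plane(obj, alignment, view, flags);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 *	// ... scanout from vma ...
 *	i915_gem_object_unpin_from_display_plane(vma);
 */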
4331
4332/**
4333 * Moves a single object to the CPU read, and possibly write domain.
4334 * @obj: object to act on
4335 * @write: requesting write or read-only access
4336 *
4337 * This function returns when the move is complete, including waiting on
4338 * flushes to occur.
4339 */
4340int
4341i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
4342{
4343        int ret;
4344
4345        lockdep_assert_held(&obj->base.dev->struct_mutex);
4346
4347        ret = i915_gem_object_wait(obj,
4348                                   I915_WAIT_INTERRUPTIBLE |
4349                                   I915_WAIT_LOCKED |
4350                                   (write ? I915_WAIT_ALL : 0),
4351                                   MAX_SCHEDULE_TIMEOUT,
4352                                   NULL);
4353        if (ret)
4354                return ret;
4355
4356        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
4357
4358        /* Flush the CPU cache if it's still invalid. */
4359        if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
4360                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
4361                obj->read_domains |= I915_GEM_DOMAIN_CPU;
4362        }
4363
4364        /* It should now be out of any other write domains, and we can update
4365         * the domain values for our changes.
4366         */
4367        GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
4368
4369        /* If we're writing through the CPU, then the GPU read domains will
4370         * need to be invalidated at next use.
4371         */
4372        if (write)
4373                __start_cpu_write(obj);
4374
4375        return 0;
4376}
4377
4378/* Throttle our rendering by waiting until the ring has completed our requests
4379 * emitted over 20 msec ago.
4380 *
4381 * Note that if we were to use the current jiffies each time around the loop,
4382 * we wouldn't escape the function with any frames outstanding if the time to
4383 * render a frame was over 20ms.
4384 *
4385 * This should get us reasonable parallelism between CPU and GPU but also
4386 * relatively low latency when blocking on a particular request to finish.
4387 */
4388static int
4389i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
4390{
4391        struct drm_i915_private *dev_priv = to_i915(dev);
4392        struct drm_i915_file_private *file_priv = file->driver_priv;
4393        unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
4394        struct i915_request *request, *target = NULL;
4395        long ret;
4396
4397        /* ABI: return -EIO if already wedged */
4398        if (i915_terminally_wedged(&dev_priv->gpu_error))
4399                return -EIO;
4400
4401        spin_lock(&file_priv->mm.lock);
4402        list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
4403                if (time_after_eq(request->emitted_jiffies, recent_enough))
4404                        break;
4405
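                    /*
                     * Requests older than the one we will eventually wait
                     * upon are dropped from the client's list as we walk
                     * past them.
                     */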
4406                if (target) {
4407                        list_del(&target->client_link);
4408                        target->file_priv = NULL;
4409                }
4410
4411                target = request;
4412        }
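            /*
             * Keep a reference to the chosen request while we drop the
             * spinlock and wait on it, so that it cannot be retired and
             * freed beneath us.
             */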
4413        if (target)
4414                i915_request_get(target);
4415        spin_unlock(&file_priv->mm.lock);
4416
4417        if (target == NULL)
4418                return 0;
4419
4420        ret = i915_request_wait(target,
4421                                I915_WAIT_INTERRUPTIBLE,
4422                                MAX_SCHEDULE_TIMEOUT);
4423        i915_request_put(target);
4424
4425        return ret < 0 ? ret : 0;
4426}
4427
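    /**
     * i915_gem_object_ggtt_pin - pin an object into the global GTT
     * @obj: object to pin
     * @view: GGTT view to use, or NULL for the normal view
     * @size: requested size of the binding (0 to use the object's size)
     * @alignment: requested alignment of the binding
     * @flags: PIN_* placement constraints, e.g. PIN_MAPPABLE, PIN_NONBLOCK
     *
     * Looks up (or creates) the VMA for @obj in the global GTT, unbinds it
     * first if it is currently misplaced for the requested constraints, and
     * pins it with PIN_GLOBAL. Returns the pinned VMA or an ERR_PTR.
     */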
4428struct i915_vma *
4429i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
4430                         const struct i915_ggtt_view *view,
4431                         u64 size,
4432                         u64 alignment,
4433                         u64 flags)
4434{
4435        struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
4436        struct i915_address_space *vm = &dev_priv->ggtt.vm;
4437        struct i915_vma *vma;
4438        int ret;
4439
4440        lockdep_assert_held(&obj->base.dev->struct_mutex);
4441
4442        if (flags & PIN_MAPPABLE &&
4443            (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
4444                /* If the required space is larger than the available
4445                 * aperture, we will not be able to find a slot for the
4446                 * object and unbinding the object now will be in
4447                 * vain. Worse, doing so may cause us to ping-pong
4448                 * the object in and out of the Global GTT and
4449                 * waste a lot of cycles under the mutex.
4450                 */
4451                if (obj->base.size > dev_priv->ggtt.mappable_end)
4452                        return ERR_PTR(-E2BIG);
4453
4454                /* If NONBLOCK is set the caller is optimistically
4455                 * trying to cache the full object within the mappable
4456                 * aperture, and *must* have a fallback in place for
4457                 * situations where we cannot bind the object. We
4458                 * can be a little more lax here and use the fallback
4459                 * more often to avoid costly migrations of ourselves
4460                 * and other objects within the aperture.
4461                 *
4462                 * Half-the-aperture is used as a simple heuristic.
4463                 * More interesting would be to search for a free
4464                 * block prior to making the commitment to unbind.
4465                 * That caters for the self-harm case, and with a
4466                 * little more heuristics (e.g. NOFAULT, NOEVICT)
4467                 * we could try to minimise harm to others.
4468                 */
4469                if (flags & PIN_NONBLOCK &&
4470                    obj->base.size > dev_priv->ggtt.mappable_end / 2)
4471                        return ERR_PTR(-ENOSPC);
4472        }
4473
4474        vma = i915_vma_instance(obj, vm, view);
4475        if (unlikely(IS_ERR(vma)))
4476                return vma;
4477
4478        if (i915_vma_misplaced(vma, size, alignment, flags)) {
4479                if (flags & PIN_NONBLOCK) {
4480                        if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
4481                                return ERR_PTR(-ENOSPC);
4482
4483                        if (flags & PIN_MAPPABLE &&
4484                            vma->fence_size > dev_priv->ggtt.mappable_end / 2)
4485                                return ERR_PTR(-ENOSPC);
4486                }
4487
4488                WARN(i915_vma_is_pinned(vma),
4489                     "bo is already pinned in ggtt with incorrect alignment:"
4490                     " offset=%08x, req.alignment=%llx,"
4491                     " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
4492                     i915_ggtt_offset(vma), alignment,
4493                     !!(flags & PIN_MAPPABLE),
4494                     i915_vma_is_map_and_fenceable(vma));
4495                ret = i915_vma_unbind(vma);
4496                if (ret)
4497                        return ERR_PTR(ret);
4498        }
4499
4500        ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
4501        if (ret)
4502                return ERR_PTR(ret);
4503
4504        return vma;
4505}
4506
4507static __always_inline unsigned int __busy_read_flag(unsigned int id)
4508{
4509        /* Note that we could alias engines in the execbuf API, but
4510         * that would be very unwise as it prevents userspace from
4511         * having fine control over engine selection. Ahem.
4512         *
4513         * This should be something like EXEC_MAX_ENGINE instead of
4514         * I915_NUM_ENGINES.
4515         */
4516        BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4517        return 0x10000 << id;
4518}
4519
4520static __always_inline unsigned int __busy_write_id(unsigned int id)
4521{
4522        /* The uABI guarantees an active writer is also amongst the read
4523         * engines. This would be true if we accessed the activity tracking
4524         * under the lock, but as we perform the lookup of the object and
4525         * its activity locklessly we can not guarantee that the last_write
4526         * being active implies that we have set the same engine flag from
4527         * last_read - hence we always set both read and write busy for
4528         * last_write.
4529         */
4530        return id | __busy_read_flag(id);
4531}
4532
4533static __always_inline unsigned int
4534__busy_set_if_active(const struct dma_fence *fence,
4535                     unsigned int (*flag)(unsigned int id))
4536{
4537        struct i915_request *rq;
4538
4539        /* We have to check the current hw status of the fence as the uABI
4540         * guarantees forward progress. We could rely on the idle worker
4541         * to eventually flush us, but to minimise latency just ask the
4542         * hardware.
4543         *
4544         * Note we only report on the status of native fences.
4545         */
4546        if (!dma_fence_is_i915(fence))
4547                return 0;
4548
4549        /* opencode to_request() in order to avoid const warnings */
4550        rq = container_of(fence, struct i915_request, fence);
4551        if (i915_request_completed(rq))
4552                return 0;
4553
4554        return flag(rq->engine->uabi_id);
4555}
4556
4557static __always_inline unsigned int
4558busy_check_reader(const struct dma_fence *fence)
4559{
4560        return __busy_set_if_active(fence, __busy_read_flag);
4561}
4562
4563static __always_inline unsigned int
4564busy_check_writer(const struct dma_fence *fence)
4565{
4566        if (!fence)
4567                return 0;
4568
4569        return __busy_set_if_active(fence, __busy_write_id);
4570}
4571
4572int
4573i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4574                    struct drm_file *file)
4575{
4576        struct drm_i915_gem_busy *args = data;
4577        struct drm_i915_gem_object *obj;
4578        struct reservation_object_list *list;
4579        unsigned int seq;
4580        int err;
4581
4582        err = -ENOENT;
4583        rcu_read_lock();
4584        obj = i915_gem_object_lookup_rcu(file, args->handle);
4585        if (!obj)
4586                goto out;
4587
4588        /* A discrepancy here is that we do not report the status of
4589         * non-i915 fences, i.e. even though we may report the object as idle,
4590         * a call to set-domain may still stall waiting for foreign rendering.
4591         * This also means that wait-ioctl may report an object as busy,
4592         * where busy-ioctl considers it idle.
4593         *
4594         * We trade the ability to warn of foreign fences to report on which
4595         * i915 engines are active for the object.
4596         *
4597         * Alternatively, we can trade that extra information on read/write
4598         * activity with
4599         *      args->busy =
4600         *              !reservation_object_test_signaled_rcu(obj->resv, true);
4601         * to report the overall busyness. This is what the wait-ioctl does.
4602         *
4603         */
4604retry:
4605        seq = raw_read_seqcount(&obj->resv->seq);
4606
4607        /* Translate the exclusive fence to the READ *and* WRITE engine */
4608        args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4609
4610        /* Translate shared fences to READ set of engines */
4611        list = rcu_dereference(obj->resv->fence);
4612        if (list) {
4613                unsigned int shared_count = list->shared_count, i;
4614
4615                for (i = 0; i < shared_count; ++i) {
4616                        struct dma_fence *fence =
4617                                rcu_dereference(list->shared[i]);
4618
4619                        args->busy |= busy_check_reader(fence);
4620                }
4621        }
4622
4623        if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4624                goto retry;
4625
4626        err = 0;
4627out:
4628        rcu_read_unlock();
4629        return err;
4630}
4631
4632int
4633i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4634                        struct drm_file *file_priv)
4635{
4636        return i915_gem_ring_throttle(dev, file_priv);
4637}
4638
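    /*
     * madvise lets userspace hint whether an object's backing storage is
     * still needed: DONTNEED marks the pages as reapable by the shrinker
     * (and the backing store is discarded at once if no pages are currently
     * attached), while WILLNEED restores the normal retention policy. The
     * pin taken for QUIRK_PIN_SWIZZLED_PAGES is rebalanced to match the new
     * hint, and objects that have already been purged stay purged.
     */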
4639int
4640i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4641                       struct drm_file *file_priv)
4642{
4643        struct drm_i915_private *dev_priv = to_i915(dev);
4644        struct drm_i915_gem_madvise *args = data;
4645        struct drm_i915_gem_object *obj;
4646        int err;
4647
4648        switch (args->madv) {
4649        case I915_MADV_DONTNEED:
4650        case I915_MADV_WILLNEED:
4651            break;
4652        default:
4653            return -EINVAL;
4654        }
4655
4656        obj = i915_gem_object_lookup(file_priv, args->handle);
4657        if (!obj)
4658                return -ENOENT;
4659
4660        err = mutex_lock_interruptible(&obj->mm.lock);
4661        if (err)
4662                goto out;
4663
4664        if (i915_gem_object_has_pages(obj) &&
4665            i915_gem_object_is_tiled(obj) &&
4666            dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4667                if (obj->mm.madv == I915_MADV_WILLNEED) {
4668                        GEM_BUG_ON(!obj->mm.quirked);
4669                        __i915_gem_object_unpin_pages(obj);
4670                        obj->mm.quirked = false;
4671                }
4672                if (args->madv == I915_MADV_WILLNEED) {
4673                        GEM_BUG_ON(obj->mm.quirked);
4674                        __i915_gem_object_pin_pages(obj);
4675                        obj->mm.quirked = true;
4676                }
4677        }
4678
4679        if (obj->mm.madv != __I915_MADV_PURGED)
4680                obj->mm.madv = args->madv;
4681
4682        /* if the object is no longer attached, discard its backing storage */
4683        if (obj->mm.madv == I915_MADV_DONTNEED &&
4684            !i915_gem_object_has_pages(obj))
4685                i915_gem_object_truncate(obj);
4686
4687        args->retained = obj->mm.madv != __I915_MADV_PURGED;
4688        mutex_unlock(&obj->mm.lock);
4689
4690out:
4691        i915_gem_object_put(obj);
4692        return err;
4693}
4694
4695static void
4696frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
4697{
4698        struct drm_i915_gem_object *obj =
4699                container_of(active, typeof(*obj), frontbuffer_write);
4700
4701        intel_fb_obj_flush(obj, ORIGIN_CS);
4702}
4703
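    /*
     * Common initialisation shared by all GEM object types: the per-object
     * locks and lists, the builtin reservation object, frontbuffer write
     * tracking and the radix tree used for page lookups, followed by the
     * accounting of the object against the device-wide totals.
     */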
4704void i915_gem_object_init(struct drm_i915_gem_object *obj,
4705                          const struct drm_i915_gem_object_ops *ops)
4706{
4707        mutex_init(&obj->mm.lock);
4708
4709        INIT_LIST_HEAD(&obj->vma_list);
4710        INIT_LIST_HEAD(&obj->lut_list);
4711        INIT_LIST_HEAD(&obj->batch_pool_link);
4712
4713        obj->ops = ops;
4714
4715        reservation_object_init(&obj->__builtin_resv);
4716        obj->resv = &obj->__builtin_resv;
4717
4718        obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4719        init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
4720
4721        obj->mm.madv = I915_MADV_WILLNEED;
4722        INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4723        mutex_init(&obj->mm.get_page.lock);
4724
4725        i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4726}
4727
4728static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4729        .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4730                 I915_GEM_OBJECT_IS_SHRINKABLE,
4731
4732        .get_pages = i915_gem_object_get_pages_gtt,
4733        .put_pages = i915_gem_object_put_pages_gtt,
4734
4735        .pwrite = i915_gem_object_pwrite_gtt,
4736};
4737
4738static int i915_gem_object_create_shmem(struct drm_device *dev,
4739                                        struct drm_gem_object *obj,
4740                                        size_t size)
4741{
4742        struct drm_i915_private *i915 = to_i915(dev);
4743        unsigned long flags = VM_NORESERVE;
4744        struct file *filp;
4745
4746        drm_gem_private_object_init(dev, obj, size);
4747
4748        if (i915->mm.gemfs)
4749                filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4750                                                 flags);
4751        else
4752                filp = shmem_file_setup("i915", size, flags);
4753
4754        if (IS_ERR(filp))
4755                return PTR_ERR(filp);
4756
4757        obj->filp = filp;
4758
4759        return 0;
4760}
4761
4762struct drm_i915_gem_object *
4763i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4764{
4765        struct drm_i915_gem_object *obj;
4766        struct address_space *mapping;
4767        unsigned int cache_level;
4768        gfp_t mask;
4769        int ret;
4770
4771        /* There is a prevalence of the assumption that we fit the object's
4772         * page count inside a 32bit _signed_ variable. Let's document this and
4773         * catch if we ever need to fix it. In the meantime, if you do spot
4774         * such a local variable, please consider fixing!
4775         */
4776        if (size >> PAGE_SHIFT > INT_MAX)
4777                return ERR_PTR(-E2BIG);
4778
4779        if (overflows_type(size, obj->base.size))
4780                return ERR_PTR(-E2BIG);
4781
4782        obj = i915_gem_object_alloc(dev_priv);
4783        if (obj == NULL)
4784                return ERR_PTR(-ENOMEM);
4785
4786        ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4787        if (ret)
4788                goto fail;
4789
4790        mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4791        if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4792                /* 965gm cannot relocate objects above 4GiB. */
4793                mask &= ~__GFP_HIGHMEM;
4794                mask |= __GFP_DMA32;
4795        }
4796
4797        mapping = obj->base.filp->f_mapping;
4798        mapping_set_gfp_mask(mapping, mask);
4799        GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4800
4801        i915_gem_object_init(obj, &i915_gem_object_ops);
4802
4803        obj->write_domain = I915_GEM_DOMAIN_CPU;
4804        obj->read_domains = I915_GEM_DOMAIN_CPU;
4805
4806        if (HAS_LLC(dev_priv))
4807                /* On some devices, we can have the GPU use the LLC (the CPU
4808                 * cache) for about a 10% performance improvement
4809                 * compared to uncached.  Graphics requests other than
4810                 * display scanout are coherent with the CPU in
4811                 * accessing this cache.  This means in this mode we
4812                 * don't need to clflush on the CPU side, and on the
4813                 * GPU side we only need to flush internal caches to
4814                 * get data visible to the CPU.
4815                 *
4816                 * However, we maintain the display planes as UC, and so
4817                 * need to rebind when first used as such.
4818                 */
4819                cache_level = I915_CACHE_LLC;
4820        else
4821                cache_level = I915_CACHE_NONE;
4822
4823        i915_gem_object_set_cache_coherency(obj, cache_level);
4824
4825        trace_i915_gem_object_create(obj);
4826
4827        return obj;
4828
4829fail:
4830        i915_gem_object_free(obj);
4831        return ERR_PTR(ret);
4832}
4833
4834static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4835{
4836        /* If we are the last user of the backing storage (be it shmemfs
4837         * pages or stolen etc), we know that the pages are going to be
4838         * immediately released. In this case, we can then skip copying
4839         * back the contents from the GPU.
4840         */
4841
4842        if (obj->mm.madv != I915_MADV_WILLNEED)
4843                return false;
4844
4845        if (obj->base.filp == NULL)
4846                return true;
4847
4848        /* At first glance, this looks racy, but then again so would be
4849         * userspace racing mmap against close. However, the first external
4850         * reference to the filp can only be obtained through the
4851         * i915_gem_mmap_ioctl() which safeguards us against the user
4852         * acquiring such a reference whilst we are in the middle of
4853         * freeing the object.
4854         */
4855        return atomic_long_read(&obj->base.filp->f_count) == 1;
4856}
4857
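    /*
     * Final teardown of a batch of freed objects: destroy any remaining
     * VMA, drop the backing pages, release dma-buf attachments and return
     * the memory to the slab cache. Called from the dedicated free worker
     * and opportunistically from i915_gem_flush_free_objects().
     */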
4858static void __i915_gem_free_objects(struct drm_i915_private *i915,
4859                                    struct llist_node *freed)
4860{
4861        struct drm_i915_gem_object *obj, *on;
4862
4863        intel_runtime_pm_get(i915);
4864        llist_for_each_entry_safe(obj, on, freed, freed) {
4865                struct i915_vma *vma, *vn;
4866
4867                trace_i915_gem_object_destroy(obj);
4868
4869                mutex_lock(&i915->drm.struct_mutex);
4870
4871                GEM_BUG_ON(i915_gem_object_is_active(obj));
4872                list_for_each_entry_safe(vma, vn,
4873                                         &obj->vma_list, obj_link) {
4874                        GEM_BUG_ON(i915_vma_is_active(vma));
4875                        vma->flags &= ~I915_VMA_PIN_MASK;
4876                        i915_vma_destroy(vma);
4877                }
4878                GEM_BUG_ON(!list_empty(&obj->vma_list));
4879                GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
4880
4881                /* This serializes freeing with the shrinker. Since the free
4882                 * is delayed, first by RCU then by the workqueue, we want the
4883                 * shrinker to be able to free pages of unreferenced objects,
4884                 * or else we may oom whilst there are plenty of deferred
4885                 * freed objects.
4886                 */
4887                if (i915_gem_object_has_pages(obj)) {
4888                        spin_lock(&i915->mm.obj_lock);
4889                        list_del_init(&obj->mm.link);
4890                        spin_unlock(&i915->mm.obj_lock);
4891                }
4892
4893                mutex_unlock(&i915->drm.struct_mutex);
4894
4895                GEM_BUG_ON(obj->bind_count);
4896                GEM_BUG_ON(obj->userfault_count);
4897                GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4898                GEM_BUG_ON(!list_empty(&obj->lut_list));
4899
4900                if (obj->ops->release)
4901                        obj->ops->release(obj);
4902
4903                if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4904                        atomic_set(&obj->mm.pages_pin_count, 0);
4905                __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4906                GEM_BUG_ON(i915_gem_object_has_pages(obj));
4907
4908                if (obj->base.import_attach)
4909                        drm_prime_gem_destroy(&obj->base, NULL);
4910
4911                reservation_object_fini(&obj->__builtin_resv);
4912                drm_gem_object_release(&obj->base);
4913                i915_gem_info_remove_obj(i915, obj->base.size);
4914
4915                kfree(obj->bit_17);
4916                i915_gem_object_free(obj);
4917
4918                GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4919                atomic_dec(&i915->mm.free_count);
4920
4921                if (on)
4922                        cond_resched();
4923        }
4924        intel_runtime_pm_put(i915);
4925}
4926
4927static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4928{
4929        struct llist_node *freed;
4930
4931        /* Free the oldest, most stale object to keep the free_list short */
4932        freed = NULL;
4933        if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4934                /* Only one consumer of llist_del_first() allowed */
4935                spin_lock(&i915->mm.free_lock);
4936                freed = llist_del_first(&i915->mm.free_list);
4937                spin_unlock(&i915->mm.free_lock);
4938        }
4939        if (unlikely(freed)) {
4940                freed->next = NULL;
4941                __i915_gem_free_objects(i915, freed);
4942        }
4943}
4944
4945static void __i915_gem_free_work(struct work_struct *work)
4946{
4947        struct drm_i915_private *i915 =
4948                container_of(work, struct drm_i915_private, mm.free_work);
4949        struct llist_node *freed;
4950
4951        /*
4952         * All file-owned VMA should have been released by this point through
4953         * i915_gem_close_object(), or earlier by i915_gem_context_close().
4954         * However, the object may also be bound into the global GTT (e.g.
4955         * older GPUs without per-process support, or for direct access through
4956         * the GTT either for the user or for scanout). Those VMA still need to
4957         * be unbound now.
4958         */
4959
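            /*
             * Drain the free_list in batches, dropping the lock while each
             * batch is torn down and bailing out early if the scheduler
             * needs the CPU back.
             */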
4960        spin_lock(&i915->mm.free_lock);
4961        while ((freed = llist_del_all(&i915->mm.free_list))) {
4962                spin_unlock(&i915->mm.free_lock);
4963
4964                __i915_gem_free_objects(i915, freed);
4965                if (need_resched())
4966                        return;
4967
4968                spin_lock(&i915->mm.free_lock);
4969        }
4970        spin_unlock(&i915->mm.free_lock);
4971}
4972
4973static void __i915_gem_free_object_rcu(struct rcu_head *head)
4974{
4975        struct drm_i915_gem_object *obj =
4976                container_of(head, typeof(*obj), rcu);
4977        struct drm_i915_private *i915 = to_i915(obj->base.dev);
4978
4979        /*
4980         * Since we require blocking on struct_mutex to unbind the freed
4981         * object from the GPU before releasing resources back to the
4982         * system, we cannot do that directly from the RCU callback (which may
4983         * be a softirq context), but must instead defer that work onto a
4984         * kthread. We use the RCU callback rather than move the freed object
4985         * directly onto the work queue so that we can mix between using the
4986         * worker and performing frees directly from subsequent allocations for
4987         * crude but effective memory throttling.
4988         */
4989        if (llist_add(&obj->freed, &i915->mm.free_list))
4990                queue_work(i915->wq, &i915->mm.free_work);
4991}
4992
4993void i915_gem_free_object(struct drm_gem_object *gem_obj)
4994{
4995        struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4996
4997        if (obj->mm.quirked)
4998                __i915_gem_object_unpin_pages(obj);
4999
5000        if (discard_backing_storage(obj))
5001                obj->mm.madv = I915_MADV_DONTNEED;
5002
5003        /*
5004         * Before we free the object, make sure any pure RCU-only
5005         * read-side critical sections are complete, e.g.
5006         * i915_gem_busy_ioctl(). For the corresponding synchronized
5007         * lookup see i915_gem_object_lookup_rcu().
5008         */
5009        atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
5010        call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
5011}
5012
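    /*
     * Drop the caller's reference to @obj; if the object is still active on
     * the GPU and does not yet carry an active reference, the reference is
     * converted into one that will be released when the object idles.
     */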
5013void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
5014{
5015        lockdep_assert_held(&obj->base.dev->struct_mutex);
5016
5017        if (!i915_gem_object_has_active_reference(obj) &&
5018            i915_gem_object_is_active(obj))
5019                i915_gem_object_set_active_reference(obj);
5020        else
5021                i915_gem_object_put(obj);
5022}
5023
5024void i915_gem_sanitize(struct drm_i915_private *i915)
5025{
5026        int err;
5027
5028        GEM_TRACE("\n");
5029
5030        mutex_lock(&i915->drm.struct_mutex);
5031
5032        intel_runtime_pm_get(i915);
5033        intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5034
5035        /*
5036         * As we have just resumed the machine and woken the device up from
5037         * deep PCI sleep (presumably D3_cold), assume the HW has been reset
5038         * back to defaults, recovering from whatever wedged state we left it
5039         * in and so worth trying to use the device once more.
5040         */
5041        if (i915_terminally_wedged(&i915->gpu_error))
5042                i915_gem_unset_wedged(i915);
5043
5044        /*
5045         * If we inherit context state from the BIOS or earlier occupants
5046         * of the GPU, the GPU may be in an inconsistent state when we
5047         * try to take over. The only way to remove the earlier state
5048         * is by resetting. However, resetting on earlier gen is tricky as
5049         * it may impact the display and we are uncertain about the stability
5050         * of the reset, so we only attempt it from gen5 onwards.
5051         */
5052        err = -ENODEV;
5053        if (INTEL_GEN(i915) >= 5 && intel_has_gpu_reset(i915))
5054                err = WARN_ON(intel_gpu_reset(i915, ALL_ENGINES));
5055        if (!err)
5056                intel_engines_sanitize(i915);
5057
5058        intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5059        intel_runtime_pm_put(i915);
5060
5061        i915_gem_contexts_lost(i915);
5062        mutex_unlock(&i915->drm.struct_mutex);
5063}
5064
5065int i915_gem_suspend(struct drm_i915_private *i915)
5066{
5067        int ret;
5068
5069        GEM_TRACE("\n");
5070
5071        intel_runtime_pm_get(i915);
5072        intel_suspend_gt_powersave(i915);
5073
5074        mutex_lock(&i915->drm.struct_mutex);
5075
5076        /*
5077         * We have to flush all the executing contexts to main memory so
5078         * that they can be saved in the hibernation image. To ensure the last
5079         * context image is coherent, we have to switch away from it. That
5080         * leaves the i915->kernel_context still active when
5081         * we actually suspend, and its image in memory may not match the GPU
5082         * state. Fortunately, the kernel_context is disposable and we do
5083         * not rely on its state.
5084         */
5085        if (!i915_terminally_wedged(&i915->gpu_error)) {
5086                ret = i915_gem_switch_to_kernel_context(i915);
5087                if (ret)
5088                        goto err_unlock;
5089
5090                ret = i915_gem_wait_for_idle(i915,
5091                                             I915_WAIT_INTERRUPTIBLE |
5092                                             I915_WAIT_LOCKED |
5093                                             I915_WAIT_FOR_IDLE_BOOST,
5094                                             MAX_SCHEDULE_TIMEOUT);
5095                if (ret && ret != -EIO)
5096                        goto err_unlock;
5097
5098                assert_kernel_context_is_current(i915);
5099        }
5100        i915_retire_requests(i915); /* ensure we flush after wedging */
5101
5102        mutex_unlock(&i915->drm.struct_mutex);
5103
5104        intel_uc_suspend(i915);
5105
5106        cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
5107        cancel_delayed_work_sync(&i915->gt.retire_work);
5108
5109        /*
5110         * As the idle_work will rearm itself if it detects a race, play safe and
5111         * repeat the flush until it is definitely idle.
5112         */
5113        drain_delayed_work(&i915->gt.idle_work);
5114
5115        /*
5116         * Assert that we successfully flushed all the work and
5117         * reset the GPU back to its idle, low power state.
5118         */
5119        WARN_ON(i915->gt.awake);
5120        if (WARN_ON(!intel_engines_are_idle(i915)))
5121                i915_gem_set_wedged(i915); /* no hope, discard everything */
5122
5123        intel_runtime_pm_put(i915);
5124        return 0;
5125
5126err_unlock:
5127        mutex_unlock(&i915->drm.struct_mutex);
5128        intel_runtime_pm_put(i915);
5129        return ret;
5130}
5131
5132void i915_gem_suspend_late(struct drm_i915_private *i915)
5133{
5134        struct drm_i915_gem_object *obj;
5135        struct list_head *phases[] = {
5136                &i915->mm.unbound_list,
5137                &i915->mm.bound_list,
5138                NULL
5139        }, **phase;
5140
5141        /*
5142         * Neither the BIOS, ourselves nor any other kernel
5143         * expects the system to be in execlists mode on startup,
5144         * so we need to reset the GPU back to legacy mode. And the only
5145         * known way to disable logical contexts is through a GPU reset.
5146         *
5147         * So in order to leave the system in a known default configuration,
5148         * always reset the GPU upon unload and suspend. Afterwards we then
5149         * clean up the GEM state tracking, flushing off the requests and
5150         * leaving the system in a known idle state.
5151         *
5152         * Note that it is of the utmost importance that the GPU is idle and
5153         * all stray writes are flushed *before* we dismantle the backing
5154         * storage for the pinned objects.
5155         *
5156         * However, since we are uncertain that resetting the GPU on older
5157         * machines is a good idea, we don't - just in case it leaves the
5158         * machine in an unusable condition.
5159         */
5160
5161        mutex_lock(&i915->drm.struct_mutex);
5162        for (phase = phases; *phase; phase++) {
5163                list_for_each_entry(obj, *phase, mm.link)
5164                        WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
5165        }
5166        mutex_unlock(&i915->drm.struct_mutex);
5167
5168        intel_uc_sanitize(i915);
5169        i915_gem_sanitize(i915);
5170}
5171
5172void i915_gem_resume(struct drm_i915_private *i915)
5173{
5174        GEM_TRACE("\n");
5175
5176        WARN_ON(i915->gt.awake);
5177
5178        mutex_lock(&i915->drm.struct_mutex);
5179        intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
5180
5181        i915_gem_restore_gtt_mappings(i915);
5182        i915_gem_restore_fences(i915);
5183
5184        /*
5185         * As we didn't flush the kernel context before suspend, we cannot
5186         * guarantee that the context image is complete. So let's just reset
5187         * it and start again.
5188         */
5189        i915->gt.resume(i915);
5190
5191        if (i915_gem_init_hw(i915))
5192                goto err_wedged;
5193
5194        intel_uc_resume(i915);
5195
5196        /* Always reload a context for powersaving. */
5197        if (i915_gem_switch_to_kernel_context(i915))
5198                goto err_wedged;
5199
5200out_unlock:
5201        intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
5202        mutex_unlock(&i915->drm.struct_mutex);
5203        return;
5204
5205err_wedged:
5206        if (!i915_terminally_wedged(&i915->gpu_error)) {
5207                DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
5208                i915_gem_set_wedged(i915);
5209        }
5210        goto out_unlock;
5211}
5212
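    /*
     * Enable bit-6 swizzling of tiled surfaces where the platform requires
     * it: program the display tile-surface swizzling on gen5+, and on
     * gen6-gen8 additionally enable the matching arbiter swizzle mode.
     */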
5213void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
5214{
5215        if (INTEL_GEN(dev_priv) < 5 ||
5216            dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
5217                return;
5218
5219        I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
5220                                 DISP_TILE_SURFACE_SWIZZLING);
5221
5222        if (IS_GEN5(dev_priv))
5223                return;
5224
5225        I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
5226        if (IS_GEN6(dev_priv))
5227                I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
5228        else if (IS_GEN7(dev_priv))
5229                I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
5230        else if (IS_GEN8(dev_priv))
5231                I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
5232        else
5233                BUG();
5234}
5235
5236static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
5237{
5238        I915_WRITE(RING_CTL(base), 0);
5239        I915_WRITE(RING_HEAD(base), 0);
5240        I915_WRITE(RING_TAIL(base), 0);
5241        I915_WRITE(RING_START(base), 0);
5242}
5243
5244static void init_unused_rings(struct drm_i915_private *dev_priv)
5245{
5246        if (IS_I830(dev_priv)) {
5247                init_unused_ring(dev_priv, PRB1_BASE);
5248                init_unused_ring(dev_priv, SRB0_BASE);
5249                init_unused_ring(dev_priv, SRB1_BASE);
5250                init_unused_ring(dev_priv, SRB2_BASE);
5251                init_unused_ring(dev_priv, SRB3_BASE);
5252        } else if (IS_GEN2(dev_priv)) {
5253                init_unused_ring(dev_priv, SRB0_BASE);
5254                init_unused_ring(dev_priv, SRB1_BASE);
5255        } else if (IS_GEN3(dev_priv)) {
5256                init_unused_ring(dev_priv, PRB1_BASE);
5257                init_unused_ring(dev_priv, PRB2_BASE);
5258        }
5259}
5260
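    /* Reinitialise the hardware state of every engine; abort on the first failure. */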
5261static int __i915_gem_restart_engines(void *data)
5262{
5263        struct drm_i915_private *i915 = data;
5264        struct intel_engine_cs *engine;
5265        enum intel_engine_id id;
5266        int err;
5267
5268        for_each_engine(engine, i915, id) {
5269                err = engine->init_hw(engine);
5270                if (err) {
5271                        DRM_ERROR("Failed to restart %s (%d)\n",
5272                                  engine->name, err);
5273                        return err;
5274                }
5275        }
5276
5277        return 0;
5278}
5279
5280int i915_gem_init_hw(struct drm_i915_private *dev_priv)
5281{
5282        int ret;
5283
5284        dev_priv->gt.last_init_time = ktime_get();
5285
5286        /* Double layer security blanket, see i915_gem_init() */
5287        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5288
5289        if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
5290                I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
5291
5292        if (IS_HASWELL(dev_priv))
5293                I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
5294                           LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
5295
5296        if (HAS_PCH_NOP(dev_priv)) {
5297                if (IS_IVYBRIDGE(dev_priv)) {
5298                        u32 temp = I915_READ(GEN7_MSG_CTL);
5299                        temp &= ~(WAIT_FOR_PCH_FLR_ACK | WAIT_FOR_PCH_RESET_ACK);
5300                        I915_WRITE(GEN7_MSG_CTL, temp);
5301                } else if (INTEL_GEN(dev_priv) >= 7) {
5302                        u32 temp = I915_READ(HSW_NDE_RSTWRN_OPT);
5303                        temp &= ~RESET_PCH_HANDSHAKE_ENABLE;
5304                        I915_WRITE(HSW_NDE_RSTWRN_OPT, temp);
5305                }
5306        }
5307
5308        intel_gt_apply_workarounds(dev_priv);
5309
5310        i915_gem_init_swizzling(dev_priv);
5311
5312        /*
5313         * At least 830 can leave some of the unused rings
5314         * "active" (i.e. head != tail) after resume, which
5315         * will prevent C3 entry. Make sure all unused rings
5316         * are totally idle.
5317         */
5318        init_unused_rings(dev_priv);
5319
5320        BUG_ON(!dev_priv->kernel_context);
5321        if (i915_terminally_wedged(&dev_priv->gpu_error)) {
5322                ret = -EIO;
5323                goto out;
5324        }
5325
5326        ret = i915_ppgtt_init_hw(dev_priv);
5327        if (ret) {
5328                DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
5329                goto out;
5330        }
5331
5332        ret = intel_wopcm_init_hw(&dev_priv->wopcm);
5333        if (ret) {
5334                DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
5335                goto out;
5336        }
5337
5338        /* We can't enable contexts until all firmware is loaded */
5339        ret = intel_uc_init_hw(dev_priv);
5340        if (ret) {
5341                DRM_ERROR("Enabling uc failed (%d)\n", ret);
5342                goto out;
5343        }
5344
5345        intel_mocs_init_l3cc_table(dev_priv);
5346
5347        /* Only when the HW is re-initialised can we replay the requests */
5348        ret = __i915_gem_restart_engines(dev_priv);
5349        if (ret)
5350                goto cleanup_uc;
5351
5352        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5353
5354        return 0;
5355
5356cleanup_uc:
5357        intel_uc_fini_hw(dev_priv);
5358out:
5359        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5360
5361        return ret;
5362}
5363
5364static int __intel_engines_record_defaults(struct drm_i915_private *i915)
5365{
5366        struct i915_gem_context *ctx;
5367        struct intel_engine_cs *engine;
5368        enum intel_engine_id id;
5369        int err;
5370
5371        /*
5372         * As we reset the GPU during very early sanitisation, the current
5373         * register state on the GPU should reflect its default values.
5374         * We load a context onto the hw (with restore-inhibit), then switch
5375         * over to a second context to save that default register state. We
5376         * can then prime every new context with that state so they all start
5377         * from the same default HW values.
5378         */
5379
5380        ctx = i915_gem_context_create_kernel(i915, 0);
5381        if (IS_ERR(ctx))
5382                return PTR_ERR(ctx);
5383
5384        for_each_engine(engine, i915, id) {
5385                struct i915_request *rq;
5386
5387                rq = i915_request_alloc(engine, ctx);
5388                if (IS_ERR(rq)) {
5389                        err = PTR_ERR(rq);
5390                        goto out_ctx;
5391                }
5392
5393                err = 0;
5394                if (engine->init_context)
5395                        err = engine->init_context(rq);
5396
5397                i915_request_add(rq);
5398                if (err)
5399                        goto err_active;
5400        }
5401
5402        err = i915_gem_switch_to_kernel_context(i915);
5403        if (err)
5404                goto err_active;
5405
5406        if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
5407                i915_gem_set_wedged(i915);
5408                err = -EIO; /* Caller will declare us wedged */
5409                goto err_active;
5410        }
5411
5412        assert_kernel_context_is_current(i915);
5413
5414        /*
5415         * Immediately park the GPU so that we enable powersaving and
5416         * treat it as idle. The next time we issue a request, we will
5417         * unpark and start using the engine->pinned_default_state, otherwise
5418         * it is in limbo and an early reset may fail.
5419         */
5420        __i915_gem_park(i915);
5421
5422        for_each_engine(engine, i915, id) {
5423                struct i915_vma *state;
5424                void *vaddr;
5425
5426                GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
5427
5428                state = to_intel_context(ctx, engine)->state;
5429                if (!state)
5430                        continue;
5431
5432                /*
5433                 * As we will hold a reference to the logical state, it will
5434                 * not be torn down with the context, and importantly the
5435                 * object will hold onto its vma (making it possible for a
5436                 * stray GTT write to corrupt our defaults). Unmap the vma
5437                 * from the GTT to prevent such accidents and reclaim the
5438                 * space.
5439                 */
5440                err = i915_vma_unbind(state);
5441                if (err)
5442                        goto err_active;
5443
5444                err = i915_gem_object_set_to_cpu_domain(state->obj, false);
5445                if (err)
5446                        goto err_active;
5447
5448                engine->default_state = i915_gem_object_get(state->obj);
5449
5450                /* Check we can acquire the image of the context state */
5451                vaddr = i915_gem_object_pin_map(engine->default_state,
5452                                                I915_MAP_FORCE_WB);
5453                if (IS_ERR(vaddr)) {
5454                        err = PTR_ERR(vaddr);
5455                        goto err_active;
5456                }
5457
5458                i915_gem_object_unpin_map(engine->default_state);
5459        }
5460
5461        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
5462                unsigned int found = intel_engines_has_context_isolation(i915);
5463
5464                /*
5465                 * Make sure that classes with multiple engine instances all
5466                 * share the same basic configuration.
5467                 */
5468                for_each_engine(engine, i915, id) {
5469                        unsigned int bit = BIT(engine->uabi_class);
5470                        unsigned int expected = engine->default_state ? bit : 0;
5471
5472                        if ((found & bit) != expected) {
5473                                DRM_ERROR("mismatching default context state for class %d on engine %s\n",
5474                                          engine->uabi_class, engine->name);
5475                        }
5476                }
5477        }
5478
5479out_ctx:
5480        i915_gem_context_set_closed(ctx);
5481        i915_gem_context_put(ctx);
5482        return err;
5483
5484err_active:
5485        /*
5486         * If we have to abandon now, we expect the engines to be idle
5487         * and ready to be torn down. First try to flush any remaining
5488         * request, ensure we are pointing at the kernel context and
5489         * then remove it.
5490         */
5491        if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
5492                goto out_ctx;
5493
5494        if (WARN_ON(i915_gem_wait_for_idle(i915,
5495                                           I915_WAIT_LOCKED,
5496                                           MAX_SCHEDULE_TIMEOUT)))
5497                goto out_ctx;
5498
5499        i915_gem_contexts_lost(i915);
5500        goto out_ctx;
5501}
5502
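    /*
     * Allocate the global scratch page, preferring stolen memory with a
     * fallback to an internal object, and pin it high in the GGTT until
     * i915_gem_fini_scratch().
     */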
5503static int
5504i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
5505{
5506        struct drm_i915_gem_object *obj;
5507        struct i915_vma *vma;
5508        int ret;
5509
5510        obj = i915_gem_object_create_stolen(i915, size);
5511        if (!obj)
5512                obj = i915_gem_object_create_internal(i915, size);
5513        if (IS_ERR(obj)) {
5514                DRM_ERROR("Failed to allocate scratch page\n");
5515                return PTR_ERR(obj);
5516        }
5517
5518        vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5519        if (IS_ERR(vma)) {
5520                ret = PTR_ERR(vma);
5521                goto err_unref;
5522        }
5523
5524        ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5525        if (ret)
5526                goto err_unref;
5527
5528        i915->gt.scratch = vma;
5529        return 0;
5530
5531err_unref:
5532        i915_gem_object_put(obj);
5533        return ret;
5534}
5535
5536static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5537{
5538        i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5539}
5540
5541int i915_gem_init(struct drm_i915_private *dev_priv)
5542{
5543        int ret;
5544
5545        /* We need to fall back to 4K pages if the host doesn't support huge GTT. */
5546        if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5547                mkwrite_device_info(dev_priv)->page_sizes =
5548                        I915_GTT_PAGE_SIZE_4K;
5549
5550        dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5551
5552        if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5553                dev_priv->gt.resume = intel_lr_context_resume;
5554                dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5555        } else {
5556                dev_priv->gt.resume = intel_legacy_submission_resume;
5557                dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5558        }
5559
5560        ret = i915_gem_init_userptr(dev_priv);
5561        if (ret)
5562                return ret;
5563
5564        ret = intel_uc_init_misc(dev_priv);
5565        if (ret)
5566                return ret;
5567
5568        ret = intel_wopcm_init(&dev_priv->wopcm);
5569        if (ret)
5570                goto err_uc_misc;
5571
5572        /* This is just a security blanket to placate dragons.
5573         * On some systems, we very sporadically observe that the first TLBs
5574         * used by the CS may be stale, despite us poking the TLB reset. If
5575         * we hold the forcewake during initialisation these problems
5576         * just magically go away.
5577         */
5578        mutex_lock(&dev_priv->drm.struct_mutex);
5579        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5580
5581        ret = i915_gem_init_ggtt(dev_priv);
5582        if (ret) {
5583                GEM_BUG_ON(ret == -EIO);
5584                goto err_unlock;
5585        }
5586
5587        ret = i915_gem_init_scratch(dev_priv,
5588                                    IS_GEN2(dev_priv) ? SZ_256K : PAGE_SIZE);
5589        if (ret) {
5590                GEM_BUG_ON(ret == -EIO);
5591                goto err_ggtt;
5592        }
5593
5594        ret = i915_gem_contexts_init(dev_priv);
5595        if (ret) {
5596                GEM_BUG_ON(ret == -EIO);
5597                goto err_scratch;
5598        }
5599
5600        ret = intel_engines_init(dev_priv);
5601        if (ret) {
5602                GEM_BUG_ON(ret == -EIO);
5603                goto err_context;
5604        }
5605
5606        intel_init_gt_powersave(dev_priv);
5607
5608        ret = intel_uc_init(dev_priv);
5609        if (ret)
5610                goto err_pm;
5611
5612        ret = i915_gem_init_hw(dev_priv);
5613        if (ret)
5614                goto err_uc_init;
5615
5616        /*
5617         * Despite its name, intel_init_clock_gating applies display
5618         * clock gating workarounds, GT mmio workarounds and the occasional
5619         * GT power context workaround. Worse, sometimes it includes a context
5620         * register workaround which we need to apply before we record the
5621         * default HW state for all contexts.
5622         *
5623         * FIXME: break up the workarounds and apply them at the right time!
5624         */
5625        intel_init_clock_gating(dev_priv);
5626
5627        ret = __intel_engines_record_defaults(dev_priv);
5628        if (ret)
5629                goto err_init_hw;
5630
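            /*
             * Two separate injection points so that load-failure testing can
             * exercise both the -ENODEV and the -EIO unwind paths below.
             */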
5631        if (i915_inject_load_failure()) {
5632                ret = -ENODEV;
5633                goto err_init_hw;
5634        }
5635
5636        if (i915_inject_load_failure()) {
5637                ret = -EIO;
5638                goto err_init_hw;
5639        }
5640
5641        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5642        mutex_unlock(&dev_priv->drm.struct_mutex);
5643
5644        return 0;
5645
5646        /*
5647         * Unwinding is complicated by the fact that we want to handle -EIO to mean
5648         * disable GPU submission but keep KMS alive. We want to mark the
5649         * HW as irrevocably wedged, but keep enough state around that the
5650         * driver doesn't explode during runtime.
5651         */
5652err_init_hw:
5653        mutex_unlock(&dev_priv->drm.struct_mutex);
5654
5655        WARN_ON(i915_gem_suspend(dev_priv));
5656        i915_gem_suspend_late(dev_priv);
5657
5658        i915_gem_drain_workqueue(dev_priv);
5659
5660        mutex_lock(&dev_priv->drm.struct_mutex);
5661        intel_uc_fini_hw(dev_priv);
5662err_uc_init:
5663        intel_uc_fini(dev_priv);
5664err_pm:
5665        if (ret != -EIO) {
5666                intel_cleanup_gt_powersave(dev_priv);
5667                i915_gem_cleanup_engines(dev_priv);
5668        }
5669err_context:
5670        if (ret != -EIO)
5671                i915_gem_contexts_fini(dev_priv);
5672err_scratch:
5673        i915_gem_fini_scratch(dev_priv);
5674err_ggtt:
5675err_unlock:
5676        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5677        mutex_unlock(&dev_priv->drm.struct_mutex);
5678
5679err_uc_misc:
5680        intel_uc_fini_misc(dev_priv);
5681
5682        if (ret != -EIO)
5683                i915_gem_cleanup_userptr(dev_priv);
5684
5685        if (ret == -EIO) {
5686                mutex_lock(&dev_priv->drm.struct_mutex);
5687
5688                /*
5689                 * Allow engine initialisation to fail by marking the GPU as
5690         * wedged. But we only want to do this when the GPU is angry;
5691         * for all other failures, such as an allocation failure, bail.
5692                 */
5693                if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5694                        i915_load_error(dev_priv,
5695                                        "Failed to initialize GPU, declaring it wedged!\n");
5696                        i915_gem_set_wedged(dev_priv);
5697                }
5698
5699                /* Minimal basic recovery for KMS */
5700                ret = i915_ggtt_enable_hw(dev_priv);
5701                i915_gem_restore_gtt_mappings(dev_priv);
5702                i915_gem_restore_fences(dev_priv);
5703                intel_init_clock_gating(dev_priv);
5704
5705                mutex_unlock(&dev_priv->drm.struct_mutex);
5706        }
5707
5708        i915_gem_drain_freed_objects(dev_priv);
5709        return ret;
5710}
5711
5712void i915_gem_fini(struct drm_i915_private *dev_priv)
5713{
5714        i915_gem_suspend_late(dev_priv);
5715        intel_disable_gt_powersave(dev_priv);
5716
5717        /* Flush any outstanding unpin_work. */
5718        i915_gem_drain_workqueue(dev_priv);
5719
5720        mutex_lock(&dev_priv->drm.struct_mutex);
5721        intel_uc_fini_hw(dev_priv);
5722        intel_uc_fini(dev_priv);
5723        i915_gem_cleanup_engines(dev_priv);
5724        i915_gem_contexts_fini(dev_priv);
5725        i915_gem_fini_scratch(dev_priv);
5726        mutex_unlock(&dev_priv->drm.struct_mutex);
5727
5728        intel_wa_list_free(&dev_priv->gt_wa_list);
5729
5730        intel_cleanup_gt_powersave(dev_priv);
5731
5732        intel_uc_fini_misc(dev_priv);
5733        i915_gem_cleanup_userptr(dev_priv);
5734
5735        i915_gem_drain_freed_objects(dev_priv);
5736
5737        WARN_ON(!list_empty(&dev_priv->contexts.list));
5738}
5739
5740void i915_gem_init_mmio(struct drm_i915_private *i915)
5741{
5742        i915_gem_sanitize(i915);
5743}
5744
5745void
5746i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5747{
5748        struct intel_engine_cs *engine;
5749        enum intel_engine_id id;
5750
5751        for_each_engine(engine, dev_priv, id)
5752                dev_priv->gt.cleanup_engine(engine);
5753}
5754
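    /*
     * Work out how many fence registers this platform provides (or how many
     * the host exposes to a vGPU), initialise and restore them, and detect
     * the bit-6 swizzle pattern used by the memory controller.
     */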
5755void
5756i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5757{
5758        int i;
5759
5760        if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5761            !IS_CHERRYVIEW(dev_priv))
5762                dev_priv->num_fence_regs = 32;
5763        else if (INTEL_GEN(dev_priv) >= 4 ||
5764                 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5765                 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5766                dev_priv->num_fence_regs = 16;
5767        else
5768                dev_priv->num_fence_regs = 8;
5769
5770        if (intel_vgpu_active(dev_priv))
5771                dev_priv->num_fence_regs =
5772                                I915_READ(vgtif_reg(avail_rs.fence_num));
5773
5774        /* Initialize fence registers to zero */
5775        for (i = 0; i < dev_priv->num_fence_regs; i++) {
5776                struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5777
5778                fence->i915 = dev_priv;
5779                fence->id = i;
5780                list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5781        }
5782        i915_gem_restore_fences(dev_priv);
5783
5784        i915_gem_detect_bit_6_swizzle(dev_priv);
5785}
5786
5787static void i915_gem_init__mm(struct drm_i915_private *i915)
5788{
5789        spin_lock_init(&i915->mm.object_stat_lock);
5790        spin_lock_init(&i915->mm.obj_lock);
5791        spin_lock_init(&i915->mm.free_lock);
5792
5793        init_llist_head(&i915->mm.free_list);
5794
5795        INIT_LIST_HEAD(&i915->mm.unbound_list);
5796        INIT_LIST_HEAD(&i915->mm.bound_list);
5797        INIT_LIST_HEAD(&i915->mm.fence_list);
5798        INIT_LIST_HEAD(&i915->mm.userfault_list);
5799
5800        INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5801}
5802
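    /*
     * One-time allocation of the GEM slab caches (objects, vmas, luts,
     * requests, dependencies, priorities) and initialisation of the global
     * lists, workers and wait queues used by the rest of the driver.
     */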
5803int i915_gem_init_early(struct drm_i915_private *dev_priv)
5804{
5805        int err = -ENOMEM;
5806
5807        dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5808        if (!dev_priv->objects)
5809                goto err_out;
5810
5811        dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5812        if (!dev_priv->vmas)
5813                goto err_objects;
5814
5815        dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5816        if (!dev_priv->luts)
5817                goto err_vmas;
5818
5819        dev_priv->requests = KMEM_CACHE(i915_request,
5820                                        SLAB_HWCACHE_ALIGN |
5821                                        SLAB_RECLAIM_ACCOUNT |
5822                                        SLAB_TYPESAFE_BY_RCU);
5823        if (!dev_priv->requests)
5824                goto err_luts;
5825
5826        dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5827                                            SLAB_HWCACHE_ALIGN |
5828                                            SLAB_RECLAIM_ACCOUNT);
5829        if (!dev_priv->dependencies)
5830                goto err_requests;
5831
5832        dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5833        if (!dev_priv->priorities)
5834                goto err_dependencies;
5835
5836        INIT_LIST_HEAD(&dev_priv->gt.timelines);
5837        INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5838        INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5839
5840        i915_gem_init__mm(dev_priv);
5841
5842        INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5843                          i915_gem_retire_work_handler);
5844        INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5845                          i915_gem_idle_work_handler);
5846        init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5847        init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5848
5849        atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5850
5851        spin_lock_init(&dev_priv->fb_tracking.lock);
5852
5853        err = i915_gemfs_init(dev_priv);
5854        if (err)
5855                DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled (%d).\n", err);
5856
5857        return 0;
5858
5859err_dependencies:
5860        kmem_cache_destroy(dev_priv->dependencies);
5861err_requests:
5862        kmem_cache_destroy(dev_priv->requests);
5863err_luts:
5864        kmem_cache_destroy(dev_priv->luts);
5865err_vmas:
5866        kmem_cache_destroy(dev_priv->vmas);
5867err_objects:
5868        kmem_cache_destroy(dev_priv->objects);
5869err_out:
5870        return err;
5871}
5872
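/*
 * Undo i915_gem_init_early(): drain any objects still queued for deferred
 * freeing, check that the bookkeeping is empty, destroy the slab caches
 * (waiting out RCU grace periods for the SLAB_TYPESAFE_BY_RCU request cache)
 * and unmount gemfs.
 */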
5873void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5874{
5875        i915_gem_drain_freed_objects(dev_priv);
5876        GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5877        GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5878        WARN_ON(dev_priv->mm.object_count);
5879        WARN_ON(!list_empty(&dev_priv->gt.timelines));
5880
5881        kmem_cache_destroy(dev_priv->priorities);
5882        kmem_cache_destroy(dev_priv->dependencies);
5883        kmem_cache_destroy(dev_priv->requests);
5884        kmem_cache_destroy(dev_priv->luts);
5885        kmem_cache_destroy(dev_priv->vmas);
5886        kmem_cache_destroy(dev_priv->objects);
5887
5888        /* And ensure that our SLAB_TYPESAFE_BY_RCU slabs are truly destroyed */
5889        rcu_barrier();
5890
5891        i915_gemfs_fini(dev_priv);
5892}
5893
5894int i915_gem_freeze(struct drm_i915_private *dev_priv)
5895{
5896        /* Discard all purgeable objects and let userspace recover those as
5897         * required after resuming.
5898         */
5899        i915_gem_shrink_all(dev_priv);
5900
5901        return 0;
5902}
5903
5904int i915_gem_freeze_late(struct drm_i915_private *i915)
5905{
5906        struct drm_i915_gem_object *obj;
5907        struct list_head *phases[] = {
5908                &i915->mm.unbound_list,
5909                &i915->mm.bound_list,
5910                NULL
5911        }, **phase;
5912
5913        /*
5914         * Called just before we write the hibernation image.
5915         *
5916         * We need to update the domain tracking to reflect that the CPU
5917         * will be accessing all the pages to create and restore from the
5918         * hibernation, and so upon restoration those pages will be in the
5919         * CPU domain.
5920         *
5921         * To make sure the hibernation image contains the latest state,
5922         * we update that state just before writing out the image.
5923         *
5924         * To try to reduce the size of the hibernation image, we manually
5925         * shrink the objects as well; see i915_gem_freeze().
5926         */
5927
5928        i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5929        i915_gem_drain_freed_objects(i915);
5930
5931        mutex_lock(&i915->drm.struct_mutex);
5932        for (phase = phases; *phase; phase++) {
5933                list_for_each_entry(obj, *phase, mm.link)
5934                        WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5935        }
5936        mutex_unlock(&i915->drm.struct_mutex);
5937
5938        return 0;
5939}
5940
5941void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5942{
5943        struct drm_i915_file_private *file_priv = file->driver_priv;
5944        struct i915_request *request;
5945
5946        /* Clean up our request list when the client is going away, so that
5947         * later retire_requests won't dereference our soon-to-be-gone
5948         * file_priv.
5949         */
5950        spin_lock(&file_priv->mm.lock);
5951        list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5952                request->file_priv = NULL;
5953        spin_unlock(&file_priv->mm.lock);
5954}
5955
5956int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5957{
5958        struct drm_i915_file_private *file_priv;
5959        int ret;
5960
5961        DRM_DEBUG("\n");
5962
5963        file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5964        if (!file_priv)
5965                return -ENOMEM;
5966
5967        file->driver_priv = file_priv;
5968        file_priv->dev_priv = i915;
5969        file_priv->file = file;
5970
5971        spin_lock_init(&file_priv->mm.lock);
5972        INIT_LIST_HEAD(&file_priv->mm.request_list);
5973
5974        file_priv->bsd_engine = -1;
5975        file_priv->hang_timestamp = jiffies;
5976
5977        ret = i915_gem_context_open(i915, file);
5978        if (ret)
5979                kfree(file_priv);
5980
5981        return ret;
5982}
5983
5984/**
5985 * i915_gem_track_fb - update frontbuffer tracking
5986 * @old: current GEM buffer for the frontbuffer slots
5987 * @new: new GEM buffer for the frontbuffer slots
5988 * @frontbuffer_bits: bitmask of frontbuffer slots
5989 *
5990 * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5991 * from @old and setting them in @new. Both @old and @new can be NULL.
5992 */
5993void i915_gem_track_fb(struct drm_i915_gem_object *old,
5994                       struct drm_i915_gem_object *new,
5995                       unsigned frontbuffer_bits)
5996{
5997        /* Control of individual bits within the mask is guarded by
5998         * the owning plane->mutex, i.e. we can never see concurrent
5999         * manipulation of individual bits. But since the bitfield as a whole
6000         * is updated using RMW, we need to use atomics in order to update
6001         * the bits.
6002         */
6003        BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
6004                     sizeof(atomic_t) * BITS_PER_BYTE);
6005
6006        if (old) {
6007                WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
6008                atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
6009        }
6010
6011        if (new) {
6012                WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
6013                atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
6014        }
6015}
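/*
 * Illustrative usage only, a sketch of how a plane update is expected to
 * call this (intel_fb_obj() and intel_plane->frontbuffer_bit are assumed
 * from intel_drv.h, not defined here):
 *
 *	i915_gem_track_fb(intel_fb_obj(old_plane_state->fb),
 *			  intel_fb_obj(new_plane_state->fb),
 *			  to_intel_plane(plane)->frontbuffer_bit);
 */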
6016
6017/* Allocate a new GEM object and fill it with the supplied data */
6018struct drm_i915_gem_object *
6019i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
6020                                 const void *data, size_t size)
6021{
6022        struct drm_i915_gem_object *obj;
6023        struct file *file;
6024        size_t offset;
6025        int err;
6026
6027        obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
6028        if (IS_ERR(obj))
6029                return obj;
6030
6031        GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
6032
6033        file = obj->base.filp;
6034        offset = 0;
6035        do {
6036                unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
6037                struct page *page;
6038                void *pgdata, *vaddr;
6039
6040                err = pagecache_write_begin(file, file->f_mapping,
6041                                            offset, len, 0,
6042                                            &page, &pgdata);
6043                if (err < 0)
6044                        goto fail;
6045
6046                vaddr = kmap(page);
6047                memcpy(vaddr, data, len);
6048                kunmap(page);
6049
6050                err = pagecache_write_end(file, file->f_mapping,
6051                                          offset, len, len,
6052                                          page, pgdata);
6053                if (err < 0)
6054                        goto fail;
6055
6056                size -= len;
6057                data += len;
6058                offset += len;
6059        } while (size);
6060
6061        return obj;
6062
6063fail:
6064        i915_gem_object_put(obj);
6065        return ERR_PTR(err);
6066}
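/*
 * Illustrative usage only (a sketch, assuming a struct firmware obtained
 * from request_firmware() rather than any particular caller in this file):
 *
 *	obj = i915_gem_object_create_from_data(dev_priv, fw->data, fw->size);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 */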
6067
6068struct scatterlist *
6069i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
6070                       unsigned int n,
6071                       unsigned int *offset)
6072{
6073        struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
6074        struct scatterlist *sg;
6075        unsigned int idx, count;
6076
6077        might_sleep();
6078        GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
6079        GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
6080
6081        /* As we iterate forward through the sg, we record each entry in a
6082         * radixtree for quick repeated (backwards) lookups. If we have seen
6083         * this index previously, we will have an entry for it.
6084         *
6085         * Initial lookup is O(N), but this is amortized to O(1) for
6086         * sequential page access (where each new request is consecutive
6087         * to the previous one). Repeated lookups are O(lg(obj->base.size)),
6088         * i.e. O(1) with a large constant!
6089         */
6090        if (n < READ_ONCE(iter->sg_idx))
6091                goto lookup;
6092
6093        mutex_lock(&iter->lock);
6094
6095        /* We prefer to reuse the last sg so that repeated lookups of this
6096         * (or the subsequent) sg are fast - comparing against the last
6097         * sg is faster than going through the radixtree.
6098         */
6099
6100        sg = iter->sg_pos;
6101        idx = iter->sg_idx;
6102        count = __sg_page_count(sg);
6103
6104        while (idx + count <= n) {
6105                void *entry;
6106                unsigned long i;
6107                int ret;
6108
6109                /* If we cannot allocate and insert this entry, or the
6110                 * individual pages from this range, cancel updating the
6111                 * sg_idx so that on this lookup we are forced to linearly
6112                 * scan onwards, but on future lookups we will try the
6113                 * insertion again (in which case we need to be careful of
6114                 * the error return reporting that we have already inserted
6115                 * this index).
6116                 */
6117                ret = radix_tree_insert(&iter->radix, idx, sg);
6118                if (ret && ret != -EEXIST)
6119                        goto scan;
6120
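                /* The remaining pages covered by this sg entry are stored
                 * as value entries encoding the index of its first page,
                 * so a lookup landing mid-range can hop back to the head
                 * entry (see the lookup path below).
                 */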
6121                entry = xa_mk_value(idx);
6122                for (i = 1; i < count; i++) {
6123                        ret = radix_tree_insert(&iter->radix, idx + i, entry);
6124                        if (ret && ret != -EEXIST)
6125                                goto scan;
6126                }
6127
6128                idx += count;
6129                sg = ____sg_next(sg);
6130                count = __sg_page_count(sg);
6131        }
6132
6133scan:
6134        iter->sg_pos = sg;
6135        iter->sg_idx = idx;
6136
6137        mutex_unlock(&iter->lock);
6138
6139        if (unlikely(n < idx)) /* insertion completed by another thread */
6140                goto lookup;
6141
6142        /* In case we failed to insert the entry into the radixtree, we need
6143         * to look beyond the current sg.
6144         */
6145        while (idx + count <= n) {
6146                idx += count;
6147                sg = ____sg_next(sg);
6148                count = __sg_page_count(sg);
6149        }
6150
6151        *offset = n - idx;
6152        return sg;
6153
6154lookup:
6155        rcu_read_lock();
6156
6157        sg = radix_tree_lookup(&iter->radix, n);
6158        GEM_BUG_ON(!sg);
6159
6160        /* If this index is in the middle of a multi-page sg entry,
6161         * the radix tree will contain a value entry that points
6162         * to the start of that range. We will return the pointer to
6163         * the base page and the offset of this page within the
6164         * sg entry's range.
6165         */
6166        *offset = 0;
6167        if (unlikely(xa_is_value(sg))) {
6168                unsigned long base = xa_to_value(sg);
6169
6170                sg = radix_tree_lookup(&iter->radix, base);
6171                GEM_BUG_ON(!sg);
6172
6173                *offset = n - base;
6174        }
6175
6176        rcu_read_unlock();
6177
6178        return sg;
6179}
6180
6181struct page *
6182i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
6183{
6184        struct scatterlist *sg;
6185        unsigned int offset;
6186
6187        GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
6188
6189        sg = i915_gem_object_get_sg(obj, n, &offset);
6190        return nth_page(sg_page(sg), offset);
6191}
6192
6193/* Like i915_gem_object_get_page(), but mark the returned page dirty */
6194struct page *
6195i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
6196                               unsigned int n)
6197{
6198        struct page *page;
6199
6200        page = i915_gem_object_get_page(obj, n);
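        /* If the object is already flagged dirty, every backing page is
         * marked dirty when the pages are released, so we only need to
         * dirty this page individually when that is not the case.
         */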
6201        if (!obj->mm.dirty)
6202                set_page_dirty(page);
6203
6204        return page;
6205}
6206
6207dma_addr_t
6208i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
6209                                unsigned long n)
6210{
6211        struct scatterlist *sg;
6212        unsigned int offset;
6213
6214        sg = i915_gem_object_get_sg(obj, n, &offset);
6215        return sg_dma_address(sg) + (offset << PAGE_SHIFT);
6216}
6217
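/*
 * Switch @obj from its default shmem backing store to the contiguous
 * physical allocation provided by i915_gem_phys_ops (used, for example, for
 * cursors on old hardware that requires physically contiguous buffers).
 * Existing GTT bindings are dropped first, and the shmem pages are released
 * once the replacement backing store has been populated and perma-pinned.
 */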
6218int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
6219{
6220        struct sg_table *pages;
6221        int err;
6222
6223        if (align > obj->base.size)
6224                return -EINVAL;
6225
6226        if (obj->ops == &i915_gem_phys_ops)
6227                return 0;
6228
6229        if (obj->ops != &i915_gem_object_ops)
6230                return -EINVAL;
6231
6232        err = i915_gem_object_unbind(obj);
6233        if (err)
6234                return err;
6235
6236        mutex_lock(&obj->mm.lock);
6237
6238        if (obj->mm.madv != I915_MADV_WILLNEED) {
6239                err = -EFAULT;
6240                goto err_unlock;
6241        }
6242
6243        if (obj->mm.quirked) {
6244                err = -EFAULT;
6245                goto err_unlock;
6246        }
6247
6248        if (obj->mm.mapping) {
6249                err = -EBUSY;
6250                goto err_unlock;
6251        }
6252
6253        pages = __i915_gem_object_unset_pages(obj);
6254
6255        obj->ops = &i915_gem_phys_ops;
6256
6257        err = ____i915_gem_object_get_pages(obj);
6258        if (err)
6259                goto err_xfer;
6260
6261        /* Perma-pin (until release) the physical set of pages */
6262        __i915_gem_object_pin_pages(obj);
6263
6264        if (!IS_ERR_OR_NULL(pages))
6265                i915_gem_object_ops.put_pages(obj, pages);
6266        mutex_unlock(&obj->mm.lock);
6267        return 0;
6268
6269err_xfer:
6270        obj->ops = &i915_gem_object_ops;
6271        if (!IS_ERR_OR_NULL(pages)) {
6272                unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
6273
6274                __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
6275        }
6276err_unlock:
6277        mutex_unlock(&obj->mm.lock);
6278        return err;
6279}
6280
6281#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6282#include "selftests/scatterlist.c"
6283#include "selftests/mock_gem_device.c"
6284#include "selftests/huge_gem_object.c"
6285#include "selftests/huge_pages.c"
6286#include "selftests/i915_gem_object.c"
6287#include "selftests/i915_gem_coherency.c"
6288#include "selftests/i915_gem.c"
6289#endif
6290