linux/drivers/gpu/drm/i915/gt/intel_gtt.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include <linux/slab.h> /* fault-inject.h is not standalone! */

#include <linux/fault-inject.h>

#include "gem/i915_gem_lmem.h"
#include "i915_trace.h"
#include "intel_gt.h"
#include "intel_gtt.h"

struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz)
{
        struct drm_i915_gem_object *obj;

        /*
         * To avoid severe over-allocation when dealing with min_page_size
         * restrictions, we override that behaviour here by allowing an object
         * size and page layout which can be smaller. In practice this should be
         * totally fine, since GTT paging structures are not typically inserted
         * into the GTT.
         *
         * Note that we also hit this path for the scratch page, and for this
         * case it might need to be 64K, but that should work fine here since we
         * used the passed in size for the page size, which should ensure it
         * also has the same alignment.
         */
        obj = __i915_gem_object_create_lmem_with_ps(vm->i915, sz, sz, 0);
        /*
         * Ensure all paging structures for this vm share the same dma-resv
         * object underneath, with the idea that one object_lock() will lock
         * them all at once.
         */
        if (!IS_ERR(obj)) {
                obj->base.resv = i915_vm_resv_get(vm);
                obj->shares_resv_from = vm;
        }

        return obj;
}

struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz)
{
        struct drm_i915_gem_object *obj;

        if (I915_SELFTEST_ONLY(should_fail(&vm->fault_attr, 1)))
                i915_gem_shrink_all(vm->i915);

        obj = i915_gem_object_create_internal(vm->i915, sz);
        /*
         * Ensure all paging structures for this vm share the same dma-resv
         * object underneath, with the idea that one object_lock() will lock
         * them all at once.
         */
        if (!IS_ERR(obj)) {
                obj->base.resv = i915_vm_resv_get(vm);
                obj->shares_resv_from = vm;
        }

        return obj;
}

int map_pt_dma(struct i915_address_space *vm, struct drm_i915_gem_object *obj)
{
        enum i915_map_type type;
        void *vaddr;

        type = i915_coherent_map_type(vm->i915, obj, true);
        vaddr = i915_gem_object_pin_map_unlocked(obj, type);
        if (IS_ERR(vaddr))
                return PTR_ERR(vaddr);

        i915_gem_object_make_unshrinkable(obj);
        return 0;
}

int map_pt_dma_locked(struct i915_address_space *vm, struct drm_i915_gem_object *obj)
{
        enum i915_map_type type;
        void *vaddr;

        type = i915_coherent_map_type(vm->i915, obj, true);
        vaddr = i915_gem_object_pin_map(obj, type);
        if (IS_ERR(vaddr))
                return PTR_ERR(vaddr);

        i915_gem_object_make_unshrinkable(obj);
        return 0;
}
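
/*
 * Illustrative sketch, not part of the driver: the allocate-then-map
 * sequence a backend is expected to follow for a single paging-structure
 * page, mirroring what setup_scratch_page() does below.  The function
 * name and the fixed 4K size are assumptions made for this example only.
 */
static int __maybe_unused example_alloc_and_map_pt(struct i915_address_space *vm,
                                                   struct drm_i915_gem_object **out)
{
        struct drm_i915_gem_object *obj;
        int err;

        /* Ask the vm's backend for backing store of the requested size. */
        obj = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K);
        if (IS_ERR(obj))
                return PTR_ERR(obj);

        /* Pin a CPU mapping; this also marks the object unshrinkable. */
        err = map_pt_dma(vm, obj);
        if (err) {
                i915_gem_object_put(obj);
                return err;
        }

        *out = obj;
        return 0;
}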

void __i915_vm_close(struct i915_address_space *vm)
{
        struct i915_vma *vma, *vn;

        if (!atomic_dec_and_mutex_lock(&vm->open, &vm->mutex))
                return;

        list_for_each_entry_safe(vma, vn, &vm->bound_list, vm_link) {
                struct drm_i915_gem_object *obj = vma->obj;

                /* Keep the obj (and hence the vma) alive as _we_ destroy it */
                if (!kref_get_unless_zero(&obj->base.refcount))
                        continue;

                atomic_and(~I915_VMA_PIN_MASK, &vma->flags);
                WARN_ON(__i915_vma_unbind(vma));
                __i915_vma_put(vma);

                i915_gem_object_put(obj);
        }
        GEM_BUG_ON(!list_empty(&vm->bound_list));

        mutex_unlock(&vm->mutex);
}

/* lock the vm into the current ww; if we lock one, we lock all */
int i915_vm_lock_objects(struct i915_address_space *vm,
                         struct i915_gem_ww_ctx *ww)
{
        if (vm->scratch[0]->base.resv == &vm->_resv) {
                return i915_gem_object_lock(vm->scratch[0], ww);
        } else {
                struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);

                /* We borrowed the scratch page from ggtt, take the top level object */
                return i915_gem_object_lock(ppgtt->pd->pt.base, ww);
        }
}
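
/*
 * Illustrative sketch, not part of the driver: taking the shared vm lock
 * under a ww transaction before touching the paging structures.  This
 * assumes the i915_gem_ww_ctx_init()/_backoff()/_fini() helpers from
 * i915_gem_ww.h; the function name is hypothetical.
 */
static int __maybe_unused example_with_vm_locked(struct i915_address_space *vm)
{
        struct i915_gem_ww_ctx ww;
        int err;

        i915_gem_ww_ctx_init(&ww, true);
retry:
        err = i915_vm_lock_objects(vm, &ww);
        if (err == -EDEADLK) {
                err = i915_gem_ww_ctx_backoff(&ww);
                if (!err)
                        goto retry;
        }

        /* ... with the lock held, operate on the vm's paging structures ... */

        i915_gem_ww_ctx_fini(&ww);
        return err;
}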

void i915_address_space_fini(struct i915_address_space *vm)
{
        drm_mm_takedown(&vm->mm);
        mutex_destroy(&vm->mutex);
}

/**
 * i915_vm_resv_release - Final struct i915_address_space destructor
 * @kref: Pointer to the &i915_address_space.resv_ref member.
 *
 * This function is called when the last lock sharer no longer shares the
 * &i915_address_space._resv lock.
 */
void i915_vm_resv_release(struct kref *kref)
{
        struct i915_address_space *vm =
                container_of(kref, typeof(*vm), resv_ref);

        dma_resv_fini(&vm->_resv);
        kfree(vm);
}

static void __i915_vm_release(struct work_struct *work)
{
        struct i915_address_space *vm =
                container_of(work, struct i915_address_space, rcu.work);

        vm->cleanup(vm);
        i915_address_space_fini(vm);

        i915_vm_resv_put(vm);
}

void i915_vm_release(struct kref *kref)
{
        struct i915_address_space *vm =
                container_of(kref, struct i915_address_space, ref);

        GEM_BUG_ON(i915_is_ggtt(vm));
        trace_i915_ppgtt_release(vm);

        queue_rcu_work(vm->i915->wq, &vm->rcu);
}

void i915_address_space_init(struct i915_address_space *vm, int subclass)
{
        kref_init(&vm->ref);

        /*
         * Special case for GGTT that has already done an early
         * kref_init here.
         */
        if (!kref_read(&vm->resv_ref))
                kref_init(&vm->resv_ref);

        INIT_RCU_WORK(&vm->rcu, __i915_vm_release);
        atomic_set(&vm->open, 1);

        /*
         * The vm->mutex must be reclaim safe (for use in the shrinker).
         * Do a dummy acquire now under fs_reclaim so that any allocation
         * attempt holding the lock is immediately reported by lockdep.
         */
        mutex_init(&vm->mutex);
        lockdep_set_subclass(&vm->mutex, subclass);

        if (!intel_vm_no_concurrent_access_wa(vm->i915)) {
                i915_gem_shrinker_taints_mutex(vm->i915, &vm->mutex);
        } else {
                /*
                 * The CHV + BXT VTD workaround uses stop_machine(),
                 * which is allowed to allocate memory. This means &vm->mutex
                 * is the outer lock, and in theory we can allocate memory
                 * inside it through stop_machine().
                 *
                 * Add the lockdep annotation for this; the shrinker only uses
                 * trylock on this mutex.
                 */
                mutex_acquire(&vm->mutex.dep_map, 0, 0, _THIS_IP_);
                might_alloc(GFP_KERNEL);
                mutex_release(&vm->mutex.dep_map, _THIS_IP_);
        }
        dma_resv_init(&vm->_resv);

        GEM_BUG_ON(!vm->total);
        drm_mm_init(&vm->mm, 0, vm->total);
        vm->mm.head_node.color = I915_COLOR_UNEVICTABLE;

        INIT_LIST_HEAD(&vm->bound_list);
}

void clear_pages(struct i915_vma *vma)
{
        GEM_BUG_ON(!vma->pages);

        if (vma->pages != vma->obj->mm.pages) {
                sg_free_table(vma->pages);
                kfree(vma->pages);
        }
        vma->pages = NULL;

        memset(&vma->page_sizes, 0, sizeof(vma->page_sizes));
}

void *__px_vaddr(struct drm_i915_gem_object *p)
{
        enum i915_map_type type;

        GEM_BUG_ON(!i915_gem_object_has_pages(p));
        return page_unpack_bits(p->mm.mapping, &type);
}

dma_addr_t __px_dma(struct drm_i915_gem_object *p)
{
        GEM_BUG_ON(!i915_gem_object_has_pages(p));
        return sg_dma_address(p->mm.pages->sgl);
}

struct page *__px_page(struct drm_i915_gem_object *p)
{
        GEM_BUG_ON(!i915_gem_object_has_pages(p));
        return sg_page(p->mm.pages->sgl);
}
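
/*
 * Illustrative sketch, not part of the driver: how the accessors above are
 * typically used once a paging-structure page has been mapped with
 * map_pt_dma().  The function name is hypothetical.
 */
static void __maybe_unused example_px_accessors(struct i915_address_space *vm)
{
        struct drm_i915_gem_object *scratch = vm->scratch[0];
        /* Bus address to encode into the parent table's entries. */
        dma_addr_t daddr = __px_dma(scratch);
        /* CPU pointer used when filling in entries directly. */
        u64 *entries = __px_vaddr(scratch);

        (void)daddr;
        (void)entries;
}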

void
fill_page_dma(struct drm_i915_gem_object *p, const u64 val, unsigned int count)
{
        void *vaddr = __px_vaddr(p);

        memset64(vaddr, val, count);
        clflush_cache_range(vaddr, PAGE_SIZE);
}
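
/*
 * Illustrative sketch, not part of the driver: initialising a mapped
 * paging-structure page so that every 64-bit entry points at scratch.
 * The function name and the encoding argument are hypothetical; real
 * backends derive the encoding from the scratch page's dma address plus
 * PTE/PDE flags.
 */
static void __maybe_unused example_fill_with_scratch(struct drm_i915_gem_object *pt,
                                                     u64 scratch_encode)
{
        /* One 64-bit entry per 8 bytes of the backing store. */
        fill_page_dma(pt, scratch_encode, pt->base.size / sizeof(u64));
}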

static void poison_scratch_page(struct drm_i915_gem_object *scratch)
{
        void *vaddr = __px_vaddr(scratch);
        u8 val;

        val = 0;
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                val = POISON_FREE;

        memset(vaddr, val, scratch->base.size);
}

int setup_scratch_page(struct i915_address_space *vm)
{
        unsigned long size;

        /*
         * In order to utilize 64K pages for an object with a size < 2M, we will
         * need to support a 64K scratch page, given that every 16th entry for a
         * page-table operating in 64K mode must point to a properly aligned 64K
         * region, including any PTEs which happen to point to scratch.
         *
         * This is only relevant for the 48b PPGTT where we support
         * huge-gtt-pages, see also i915_vma_insert(). However, as we share the
         * scratch (read-only) between all vm, we create one 64k scratch page
         * for all.
         */
        size = I915_GTT_PAGE_SIZE_4K;
        if (i915_vm_is_4lvl(vm) &&
            HAS_PAGE_SIZES(vm->i915, I915_GTT_PAGE_SIZE_64K))
                size = I915_GTT_PAGE_SIZE_64K;

        do {
                struct drm_i915_gem_object *obj;

                obj = vm->alloc_pt_dma(vm, size);
                if (IS_ERR(obj))
                        goto skip;

                if (map_pt_dma(vm, obj))
                        goto skip_obj;

                /* We need a single contiguous page for our scratch */
                if (obj->mm.page_sizes.sg < size)
                        goto skip_obj;

                /* And it needs to be correspondingly aligned */
                if (__px_dma(obj) & (size - 1))
                        goto skip_obj;

                /*
                 * Use a non-zero scratch page for debugging.
                 *
                 * We want a value that should be reasonably obvious
                 * to spot in the error state, while also causing a GPU hang
                 * if executed. We prefer using a clear page in production, so
                 * should it ever be accidentally used, the effect should be
                 * fairly benign.
                 */
                poison_scratch_page(obj);

                vm->scratch[0] = obj;
                vm->scratch_order = get_order(size);
                return 0;

skip_obj:
                i915_gem_object_put(obj);
skip:
                if (size == I915_GTT_PAGE_SIZE_4K)
                        return -ENOMEM;

                size = I915_GTT_PAGE_SIZE_4K;
        } while (1);
}

void free_scratch(struct i915_address_space *vm)
{
        int i;

        for (i = 0; i <= vm->top; i++)
                i915_gem_object_put(vm->scratch[i]);
}

void gtt_write_workarounds(struct intel_gt *gt)
{
        struct drm_i915_private *i915 = gt->i915;
        struct intel_uncore *uncore = gt->uncore;

        /*
         * This function is for GTT-related workarounds. It is called on driver
         * load and after a GPU reset, so you can place workarounds here even
         * if they get overwritten by a GPU reset.
         */
        /* WaIncreaseDefaultTLBEntries:chv,bdw,skl,bxt,kbl,glk,cfl,cnl,icl */
        if (IS_BROADWELL(i915))
                intel_uncore_write(uncore,
                                   GEN8_L3_LRA_1_GPGPU,
                                   GEN8_L3_LRA_1_GPGPU_DEFAULT_VALUE_BDW);
        else if (IS_CHERRYVIEW(i915))
                intel_uncore_write(uncore,
                                   GEN8_L3_LRA_1_GPGPU,
                                   GEN8_L3_LRA_1_GPGPU_DEFAULT_VALUE_CHV);
        else if (IS_GEN9_LP(i915))
                intel_uncore_write(uncore,
                                   GEN8_L3_LRA_1_GPGPU,
                                   GEN9_L3_LRA_1_GPGPU_DEFAULT_VALUE_BXT);
        else if (GRAPHICS_VER(i915) >= 9 && GRAPHICS_VER(i915) <= 11)
                intel_uncore_write(uncore,
                                   GEN8_L3_LRA_1_GPGPU,
                                   GEN9_L3_LRA_1_GPGPU_DEFAULT_VALUE_SKL);

        /*
         * To support 64K PTEs we need to first enable the use of the
         * Intermediate-Page-Size (IPS) bit of the PDE field via some magical
         * mmio, otherwise the page-walker will simply ignore the IPS bit. This
         * shouldn't be needed after GEN10.
         *
         * 64K pages were first introduced with BDW, although technically they
         * only *work* from gen9+. For pre-BDW we instead have the option of
         * 32K pages, but we don't currently have any support for them in our
         * driver.
         */
        if (HAS_PAGE_SIZES(i915, I915_GTT_PAGE_SIZE_64K) &&
            GRAPHICS_VER(i915) <= 10)
                intel_uncore_rmw(uncore,
                                 GEN8_GAMW_ECO_DEV_RW_IA,
                                 0,
                                 GAMW_ECO_ENABLE_64K_IPS_FIELD);

        if (IS_GRAPHICS_VER(i915, 8, 11)) {
                bool can_use_gtt_cache = true;

                /*
                 * According to the BSpec, if we use 2M/1G pages then we also
                 * need to disable the GTT cache. At least on BDW we can see
                 * visual corruption when using 2M pages without disabling the
                 * GTT cache.
                 */
                if (HAS_PAGE_SIZES(i915, I915_GTT_PAGE_SIZE_2M))
                        can_use_gtt_cache = false;

                /* WaGttCachingOffByDefault */
                intel_uncore_write(uncore,
                                   HSW_GTT_CACHE_EN,
                                   can_use_gtt_cache ? GTT_CACHE_EN_ALL : 0);
                drm_WARN_ON_ONCE(&i915->drm, can_use_gtt_cache &&
                                 intel_uncore_read(uncore,
                                                   HSW_GTT_CACHE_EN) == 0);
        }
}

static void tgl_setup_private_ppat(struct intel_uncore *uncore)
{
        /* TGL doesn't support LLC or AGE settings */
        intel_uncore_write(uncore, GEN12_PAT_INDEX(0), GEN8_PPAT_WB);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(1), GEN8_PPAT_WC);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(2), GEN8_PPAT_WT);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(3), GEN8_PPAT_UC);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(4), GEN8_PPAT_WB);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(5), GEN8_PPAT_WB);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(6), GEN8_PPAT_WB);
        intel_uncore_write(uncore, GEN12_PAT_INDEX(7), GEN8_PPAT_WB);
}

static void icl_setup_private_ppat(struct intel_uncore *uncore)
{
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(0),
                           GEN8_PPAT_WB | GEN8_PPAT_LLC);
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(1),
                           GEN8_PPAT_WC | GEN8_PPAT_LLCELLC);
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(2),
                           GEN8_PPAT_WB | GEN8_PPAT_ELLC_OVERRIDE);
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(3),
                           GEN8_PPAT_UC);
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(4),
                           GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0));
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(5),
                           GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1));
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(6),
                           GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2));
        intel_uncore_write(uncore,
                           GEN10_PAT_INDEX(7),
                           GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));
}

/*
 * The GGTT and PPGTT need a private PPAT setup in order to handle cacheability
 * bits. When using advanced contexts each context stores its own PAT, but
 * writing this data shouldn't be harmful even in those cases.
 */
static void bdw_setup_private_ppat(struct intel_uncore *uncore)
{
        struct drm_i915_private *i915 = uncore->i915;
        u64 pat;

        pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC) |      /* for normal objects, no eLLC */
              GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) |  /* for something pointing to ptes? */
              GEN8_PPAT(3, GEN8_PPAT_UC) |                      /* Uncached objects, mostly for scanout */
              GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) |
              GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) |
              GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) |
              GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3));

        /* for scanout with eLLC */
        if (GRAPHICS_VER(i915) >= 9)
                pat |= GEN8_PPAT(2, GEN8_PPAT_WB | GEN8_PPAT_ELLC_OVERRIDE);
        else
                pat |= GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC);

        intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat));
        intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat));
}

static void chv_setup_private_ppat(struct intel_uncore *uncore)
{
        u64 pat;

        /*
         * Map WB on BDW to snooped on CHV.
         *
         * Only the snoop bit has meaning for CHV, the rest is
         * ignored.
         *
         * The hardware will never snoop for certain types of accesses:
         * - CPU GTT (GMADR->GGTT->no snoop->memory)
         * - PPGTT page tables
         * - some other special cycles
         *
         * As with BDW, we also need to consider the following for GT accesses:
         * "For GGTT, there is NO pat_sel[2:0] from the entry,
         * so RTL will always use the value corresponding to
         * pat_sel = 000".
         * Which means we must set the snoop bit in PAT entry 0
         * in order to keep the global status page working.
         */

        pat = GEN8_PPAT(0, CHV_PPAT_SNOOP) |
              GEN8_PPAT(1, 0) |
              GEN8_PPAT(2, 0) |
              GEN8_PPAT(3, 0) |
              GEN8_PPAT(4, CHV_PPAT_SNOOP) |
              GEN8_PPAT(5, CHV_PPAT_SNOOP) |
              GEN8_PPAT(6, CHV_PPAT_SNOOP) |
              GEN8_PPAT(7, CHV_PPAT_SNOOP);

        intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat));
        intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat));
}

void setup_private_pat(struct intel_uncore *uncore)
{
        struct drm_i915_private *i915 = uncore->i915;

        GEM_BUG_ON(GRAPHICS_VER(i915) < 8);

        if (GRAPHICS_VER(i915) >= 12)
                tgl_setup_private_ppat(uncore);
        else if (GRAPHICS_VER(i915) >= 11)
                icl_setup_private_ppat(uncore);
        else if (IS_CHERRYVIEW(i915) || IS_GEN9_LP(i915))
                chv_setup_private_ppat(uncore);
        else
                bdw_setup_private_ppat(uncore);
}
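
/*
 * Illustrative sketch, not part of the driver: on BDW-style hardware the
 * eight PPAT entries live in one 64-bit register pair, 8 bits per index,
 * which is what the GEN8_PPAT(index, value) packing used above relies on.
 * The helper below is hypothetical and assumes that layout.
 */
static u64 __maybe_unused example_pack_ppat_entry(unsigned int index, u64 value)
{
        /* Entry 'index' occupies bits [index * 8 + 7 : index * 8]. */
        return value << (index * 8);
}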

struct i915_vma *
__vm_create_scratch_for_read(struct i915_address_space *vm, unsigned long size)
{
        struct drm_i915_gem_object *obj;
        struct i915_vma *vma;

        obj = i915_gem_object_create_internal(vm->i915, PAGE_ALIGN(size));
        if (IS_ERR(obj))
                return ERR_CAST(obj);

        i915_gem_object_set_cache_coherency(obj, I915_CACHING_CACHED);

        vma = i915_vma_instance(obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_gem_object_put(obj);
                return vma;
        }

        return vma;
}

struct i915_vma *
__vm_create_scratch_for_read_pinned(struct i915_address_space *vm, unsigned long size)
{
        struct i915_vma *vma;
        int err;

        vma = __vm_create_scratch_for_read(vm, size);
        if (IS_ERR(vma))
                return vma;

        err = i915_vma_pin(vma, 0, 0,
                           i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
        if (err) {
                i915_vma_put(vma);
                return ERR_PTR(err);
        }

        return vma;
}
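
/*
 * Illustrative sketch, not part of the driver: consuming the pinned scratch
 * vma returned above and releasing it again.  Assumes i915_vma_unpin() as
 * the counterpart to the pin taken in __vm_create_scratch_for_read_pinned();
 * the function name and SZ_4K size are assumptions for this example.
 */
static int __maybe_unused example_use_scratch_vma(struct i915_address_space *vm)
{
        struct i915_vma *vma;

        vma = __vm_create_scratch_for_read_pinned(vm, SZ_4K);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        /* ... read results back through the vma's mapping here ... */

        i915_vma_unpin(vma);
        i915_vma_put(vma);
        return 0;
}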

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_gtt.c"
#endif