linux/drivers/gpu/drm/i915/i915_gem_fence_reg.c
   1/*
   2 * Copyright © 2008-2015 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 */
  23
  24#include <drm/drmP.h>
  25#include <drm/i915_drm.h>
  26#include "i915_drv.h"
  27
  28/**
  29 * DOC: fence register handling
  30 *
   31 * Important to avoid confusion: "fences" in the i915 driver are not execution
  32 * fences used to track command completion but hardware detiler objects which
  33 * wrap a given range of the global GTT. Each platform has only a fairly limited
  34 * set of these objects.
  35 *
  36 * Fences are used to detile GTT memory mappings. They're also connected to the
  37 * hardware frontbuffer render tracking and hence interact with frontbuffer
  38 * compression. Furthermore on older platforms fences are required for tiled
  39 * objects used by the display engine. They can also be used by the render
  40 * engine - they're required for blitter commands and are optional for render
  41 * commands. But on gen4+ both display (with the exception of fbc) and rendering
  42 * have their own tiling state bits and don't need fences.
  43 *
  44 * Also note that fences only support X and Y tiling and hence can't be used for
  45 * the fancier new tiling formats like W, Ys and Yf.
  46 *
  47 * Finally note that because fences are such a restricted resource they're
  48 * dynamically associated with objects. Furthermore fence state is committed to
  49 * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
   50 * explicitly call i915_vma_get_fence() to synchronize fencing status
   51 * for CPU access. Also note that some code wants an unfenced view; for those
   52 * cases the fence can be removed forcefully with i915_vma_put_fence().
  53 *
  54 * Internally these functions will synchronize with userspace access by removing
  55 * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
  56 */
  57
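/*
 * For example (illustrative only, nothing in the driver calls it this way):
 * a hypothetical caller that needs a detiled view for CPU access through a
 * GTT mmap takes a fence first, whereas a caller that wants the raw, linear
 * view force-removes it:
 *
 *      err = i915_vma_get_fence(vma);          detiled access
 *      err = i915_vma_put_fence(vma);          untiled, linear access
 *
 * Error handling and locking (struct_mutex, runtime-pm wakeref) are omitted
 * here for brevity.
 */
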
  58#define pipelined 0
  59
  60static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
  61                                 struct i915_vma *vma)
  62{
  63        i915_reg_t fence_reg_lo, fence_reg_hi;
  64        int fence_pitch_shift;
  65        u64 val;
  66
  67        if (INTEL_INFO(fence->i915)->gen >= 6) {
  68                fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
  69                fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
  70                fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;
  71
  72        } else {
  73                fence_reg_lo = FENCE_REG_965_LO(fence->id);
  74                fence_reg_hi = FENCE_REG_965_HI(fence->id);
  75                fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
  76        }
  77
  78        val = 0;
  79        if (vma) {
  80                unsigned int stride = i915_gem_object_get_stride(vma->obj);
  81
  82                GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
  83                GEM_BUG_ON(!IS_ALIGNED(vma->node.start, I965_FENCE_PAGE));
  84                GEM_BUG_ON(!IS_ALIGNED(vma->fence_size, I965_FENCE_PAGE));
  85                GEM_BUG_ON(!IS_ALIGNED(stride, 128));
  86
  87                val = (vma->node.start + vma->fence_size - I965_FENCE_PAGE) << 32;
  88                val |= vma->node.start;
  89                val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
  90                if (i915_gem_object_get_tiling(vma->obj) == I915_TILING_Y)
  91                        val |= BIT(I965_FENCE_TILING_Y_SHIFT);
  92                val |= I965_FENCE_REG_VALID;
  93        }
  94
  95        if (!pipelined) {
  96                struct drm_i915_private *dev_priv = fence->i915;
  97
  98                /* To w/a incoherency with non-atomic 64-bit register updates,
  99                 * we split the 64-bit update into two 32-bit writes. In order
 100                 * for a partial fence not to be evaluated between writes, we
  101                 * precede the update with a write to turn off the fence register,
 102                 * and only enable the fence as the last step.
 103                 *
 104                 * For extra levels of paranoia, we make sure each step lands
 105                 * before applying the next step.
 106                 */
 107                I915_WRITE(fence_reg_lo, 0);
 108                POSTING_READ(fence_reg_lo);
 109
 110                I915_WRITE(fence_reg_hi, upper_32_bits(val));
 111                I915_WRITE(fence_reg_lo, lower_32_bits(val));
 112                POSTING_READ(fence_reg_lo);
 113        }
 114}
 115
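/*
 * Illustrative sketch, not part of the driver: the 64-bit value programmed
 * above is a pure function of the vma placement, restated here as a
 * standalone helper. The helper name is made up; the macros are the ones
 * already used by i965_write_fence_reg(). As a concrete gen4 example
 * (assuming the pitch field starts at bit 2, I965_FENCE_PITCH_SHIFT), an
 * X-tiled vma at GTT offset 0x00100000 with fence size 0x00200000 and a
 * 4096-byte stride packs to hi = 0x002ff000, lo = 0x0010007d.
 */
static u64 __maybe_unused
i965_fence_value_sketch(u64 start, u64 size, unsigned int stride,
                        bool is_y_tiled, int pitch_shift)
{
        u64 val;

        /* Upper dword: last fenced page; lower dword: first fenced page. */
        val = (start + size - I965_FENCE_PAGE) << 32;
        val |= start;
        /* The pitch is encoded in units of 128 bytes, minus one. */
        val |= (u64)((stride / 128) - 1) << pitch_shift;
        if (is_y_tiled)
                val |= BIT(I965_FENCE_TILING_Y_SHIFT);
        return val | I965_FENCE_REG_VALID;
}
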
 116static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
 117                                 struct i915_vma *vma)
 118{
 119        u32 val;
 120
 121        val = 0;
 122        if (vma) {
 123                unsigned int tiling = i915_gem_object_get_tiling(vma->obj);
 124                bool is_y_tiled = tiling == I915_TILING_Y;
 125                unsigned int stride = i915_gem_object_get_stride(vma->obj);
 126
 127                GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
 128                GEM_BUG_ON(vma->node.start & ~I915_FENCE_START_MASK);
 129                GEM_BUG_ON(!is_power_of_2(vma->fence_size));
 130                GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_size));
 131
 132                if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence->i915))
 133                        stride /= 128;
 134                else
 135                        stride /= 512;
 136                GEM_BUG_ON(!is_power_of_2(stride));
 137
 138                val = vma->node.start;
 139                if (is_y_tiled)
 140                        val |= BIT(I830_FENCE_TILING_Y_SHIFT);
 141                val |= I915_FENCE_SIZE_BITS(vma->fence_size);
 142                val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;
 143
 144                val |= I830_FENCE_REG_VALID;
 145        }
 146
 147        if (!pipelined) {
 148                struct drm_i915_private *dev_priv = fence->i915;
 149                i915_reg_t reg = FENCE_REG(fence->id);
 150
 151                I915_WRITE(reg, val);
 152                POSTING_READ(reg);
 153        }
 154}
 155
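/*
 * Illustrative sketch, not part of the driver: the pitch field programmed by
 * i915_write_fence_reg() above is just the log2 of the stride expressed in
 * tile-width units. For example, an X-tiled surface with a 2048-byte stride
 * gives 2048 / 512 = 4 and hence a pitch field of ilog2(4) = 2. The helper
 * name is made up.
 */
static u32 __maybe_unused
i915_fence_pitch_sketch(unsigned int stride, bool is_y_tiled, bool has_128b_y)
{
        /* Mirror the divisor selection above: 128-byte units for Y tiling
         * on chipsets with 128-byte-wide Y tiles, 512-byte units otherwise.
         */
        unsigned int tile_width = (is_y_tiled && has_128b_y) ? 128 : 512;

        return ilog2(stride / tile_width) << I830_FENCE_PITCH_SHIFT;
}
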
 156static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
 157                                 struct i915_vma *vma)
 158{
 159        u32 val;
 160
 161        val = 0;
 162        if (vma) {
 163                unsigned int stride = i915_gem_object_get_stride(vma->obj);
 164
 165                GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
 166                GEM_BUG_ON(vma->node.start & ~I830_FENCE_START_MASK);
 167                GEM_BUG_ON(!is_power_of_2(vma->fence_size));
 168                GEM_BUG_ON(!is_power_of_2(stride / 128));
 169                GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_size));
 170
 171                val = vma->node.start;
 172                if (i915_gem_object_get_tiling(vma->obj) == I915_TILING_Y)
 173                        val |= BIT(I830_FENCE_TILING_Y_SHIFT);
 174                val |= I830_FENCE_SIZE_BITS(vma->fence_size);
 175                val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
 176                val |= I830_FENCE_REG_VALID;
 177        }
 178
 179        if (!pipelined) {
 180                struct drm_i915_private *dev_priv = fence->i915;
 181                i915_reg_t reg = FENCE_REG(fence->id);
 182
 183                I915_WRITE(reg, val);
 184                POSTING_READ(reg);
 185        }
 186}
 187
 188static void fence_write(struct drm_i915_fence_reg *fence,
 189                        struct i915_vma *vma)
 190{
 191        /* Previous access through the fence register is marshalled by
  192         * the mb() inside the fault handlers (i915_gem_release_mmap())
 193         * and explicitly managed for internal users.
 194         */
 195
 196        if (IS_GEN2(fence->i915))
 197                i830_write_fence_reg(fence, vma);
 198        else if (IS_GEN3(fence->i915))
 199                i915_write_fence_reg(fence, vma);
 200        else
 201                i965_write_fence_reg(fence, vma);
 202
 203        /* Access through the fenced region afterwards is
 204         * ordered by the posting reads whilst writing the registers.
 205         */
 206
 207        fence->dirty = false;
 208}
 209
 210static int fence_update(struct drm_i915_fence_reg *fence,
 211                        struct i915_vma *vma)
 212{
 213        int ret;
 214
 215        if (vma) {
 216                if (!i915_vma_is_map_and_fenceable(vma))
 217                        return -EINVAL;
 218
 219                if (WARN(!i915_gem_object_get_stride(vma->obj) ||
 220                         !i915_gem_object_get_tiling(vma->obj),
 221                         "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
 222                         i915_gem_object_get_stride(vma->obj),
 223                         i915_gem_object_get_tiling(vma->obj)))
 224                        return -EINVAL;
 225
 226                ret = i915_gem_active_retire(&vma->last_fence,
 227                                             &vma->obj->base.dev->struct_mutex);
 228                if (ret)
 229                        return ret;
 230        }
 231
 232        if (fence->vma) {
 233                ret = i915_gem_active_retire(&fence->vma->last_fence,
 234                                      &fence->vma->obj->base.dev->struct_mutex);
 235                if (ret)
 236                        return ret;
 237        }
 238
 239        if (fence->vma && fence->vma != vma) {
 240                /* Ensure that all userspace CPU access is completed before
 241                 * stealing the fence.
 242                 */
 243                i915_gem_release_mmap(fence->vma->obj);
 244
 245                fence->vma->fence = NULL;
 246                fence->vma = NULL;
 247
 248                list_move(&fence->link, &fence->i915->mm.fence_list);
 249        }
 250
 251        /* We only need to update the register itself if the device is awake.
 252         * If the device is currently powered down, we will defer the write
 253         * to the runtime resume, see i915_gem_restore_fences().
 254         */
 255        if (intel_runtime_pm_get_if_in_use(fence->i915)) {
 256                fence_write(fence, vma);
 257                intel_runtime_pm_put(fence->i915);
 258        }
 259
 260        if (vma) {
 261                if (fence->vma != vma) {
 262                        vma->fence = fence;
 263                        fence->vma = vma;
 264                }
 265
 266                list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 267        }
 268
 269        return 0;
 270}
 271
 272/**
 273 * i915_vma_put_fence - force-remove fence for a VMA
 274 * @vma: vma to map linearly (not through a fence reg)
 275 *
  276 * This function force-removes any fence from the given vma, which is useful
 277 * if the kernel wants to do untiled GTT access.
 278 *
 279 * Returns:
 280 *
 281 * 0 on success, negative error code on failure.
 282 */
 283int
 284i915_vma_put_fence(struct i915_vma *vma)
 285{
 286        struct drm_i915_fence_reg *fence = vma->fence;
 287
 288        if (!fence)
 289                return 0;
 290
 291        if (fence->pin_count)
 292                return -EBUSY;
 293
 294        return fence_update(fence, NULL);
 295}
 296
 297static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
 298{
 299        struct drm_i915_fence_reg *fence;
 300
 301        list_for_each_entry(fence, &dev_priv->mm.fence_list, link) {
 302                if (fence->pin_count)
 303                        continue;
 304
 305                return fence;
 306        }
 307
 308        /* Wait for completion of pending flips which consume fences */
 309        if (intel_has_pending_fb_unpin(dev_priv))
 310                return ERR_PTR(-EAGAIN);
 311
 312        return ERR_PTR(-EDEADLK);
 313}
 314
 315/**
 316 * i915_vma_get_fence - set up fencing for a vma
 317 * @vma: vma to map through a fence reg
 318 *
 319 * When mapping objects through the GTT, userspace wants to be able to write
 320 * to them without having to worry about swizzling if the object is tiled.
  321 * This function walks the fence regs looking for a free one for @vma,
 322 * stealing one if it can't find any.
 323 *
 324 * It then sets up the reg based on the object's properties: address, pitch
 325 * and tiling format.
 326 *
 327 * For an untiled surface, this removes any existing fence.
 328 *
 329 * Returns:
 330 *
 331 * 0 on success, negative error code on failure.
 332 */
 333int
 334i915_vma_get_fence(struct i915_vma *vma)
 335{
 336        struct drm_i915_fence_reg *fence;
 337        struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
 338
 339        /* Note that we revoke fences on runtime suspend. Therefore the user
 340         * must keep the device awake whilst using the fence.
 341         */
 342        assert_rpm_wakelock_held(vma->vm->i915);
 343
 344        /* Just update our place in the LRU if our fence is getting reused. */
 345        if (vma->fence) {
 346                fence = vma->fence;
 347                if (!fence->dirty) {
 348                        list_move_tail(&fence->link,
 349                                       &fence->i915->mm.fence_list);
 350                        return 0;
 351                }
 352        } else if (set) {
 353                fence = fence_find(vma->vm->i915);
 354                if (IS_ERR(fence))
 355                        return PTR_ERR(fence);
 356        } else
 357                return 0;
 358
 359        return fence_update(fence, set);
 360}
 361
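/*
 * Illustrative sketch, not part of the driver and never called: fences are
 * revoked on runtime suspend, so a hypothetical user has to hold both
 * struct_mutex and a runtime-pm wakeref for as long as it relies on the
 * detiled view. The wrapper name is made up; the calls inside are the
 * driver's real interfaces.
 */
static int __maybe_unused
example_fenced_access(struct i915_vma *vma)
{
        struct drm_i915_private *i915 = vma->vm->i915;
        int err;

        lockdep_assert_held(&i915->drm.struct_mutex);

        /* Keep the device awake for as long as the fence is in use. */
        intel_runtime_pm_get(i915);

        err = i915_vma_get_fence(vma);
        if (err == 0) {
                /* ... access the GTT mmap, detiled by the fence ... */
        }

        intel_runtime_pm_put(i915);
        return err;
}
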
 362/**
 363 * i915_gem_revoke_fences - revoke fence state
 364 * @dev_priv: i915 device private
 365 *
 366 * Removes all GTT mmappings via the fence registers. This forces any user
 367 * of the fence to reacquire that fence before continuing with their access.
  368 * One use is during GPU reset where the fence register state is lost and we need to
 369 * revoke concurrent userspace access via GTT mmaps until the hardware has been
 370 * reset and the fence registers have been restored.
 371 */
 372void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
 373{
 374        int i;
 375
 376        lockdep_assert_held(&dev_priv->drm.struct_mutex);
 377
 378        for (i = 0; i < dev_priv->num_fence_regs; i++) {
 379                struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
 380
 381                if (fence->vma)
 382                        i915_gem_release_mmap(fence->vma->obj);
 383        }
 384}
 385
 386/**
 387 * i915_gem_restore_fences - restore fence state
 388 * @dev_priv: i915 device private
 389 *
 390 * Restore the hw fence state to match the software tracking again, to be called
 391 * after a gpu reset and on resume. Note that on runtime suspend we only cancel
 392 * the fences, to be reacquired by the user later.
 393 */
 394void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 395{
 396        int i;
 397
 398        for (i = 0; i < dev_priv->num_fence_regs; i++) {
 399                struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
 400                struct i915_vma *vma = reg->vma;
 401
 402                /*
 403                 * Commit delayed tiling changes if we have an object still
 404                 * attached to the fence, otherwise just clear the fence.
 405                 */
 406                if (vma && !i915_gem_object_is_tiled(vma->obj)) {
 407                        GEM_BUG_ON(!reg->dirty);
 408                        GEM_BUG_ON(!list_empty(&vma->obj->userfault_link));
 409
 410                        list_move(&reg->link, &dev_priv->mm.fence_list);
 411                        vma->fence = NULL;
 412                        vma = NULL;
 413                }
 414
 415                fence_write(reg, vma);
 416                reg->vma = vma;
 417        }
 418}
 419
 420/**
 421 * DOC: tiling swizzling details
 422 *
 423 * The idea behind tiling is to increase cache hit rates by rearranging
 424 * pixel data so that a group of pixel accesses are in the same cacheline.
  425 * Performance improvements from doing this on the back/depth buffer are on
 426 * the order of 30%.
 427 *
 428 * Intel architectures make this somewhat more complicated, though, by
 429 * adjustments made to addressing of data when the memory is in interleaved
  430 * mode (matched pairs of DIMMs) to improve memory bandwidth.
 431 * For interleaved memory, the CPU sends every sequential 64 bytes
 432 * to an alternate memory channel so it can get the bandwidth from both.
 433 *
 434 * The GPU also rearranges its accesses for increased bandwidth to interleaved
 435 * memory, and it matches what the CPU does for non-tiled.  However, when tiled
 436 * it does it a little differently, since one walks addresses not just in the
 437 * X direction but also Y.  So, along with alternating channels when bit
 438 * 6 of the address flips, it also alternates when other bits flip --  Bits 9
 439 * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
 440 * are common to both the 915 and 965-class hardware.
 441 *
 442 * The CPU also sometimes XORs in higher bits as well, to improve
 443 * bandwidth doing strided access like we do so frequently in graphics.  This
 444 * is called "Channel XOR Randomization" in the MCH documentation.  The result
 445 * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
 446 * decode.
 447 *
 448 * All of this bit 6 XORing has an effect on our memory management,
 449 * as we need to make sure that the 3d driver can correctly address object
 450 * contents.
 451 *
 452 * If we don't have interleaved memory, all tiling is safe and no swizzling is
 453 * required.
 454 *
 455 * When bit 17 is XORed in, we simply refuse to tile at all.  Bit
 456 * 17 is not just a page offset, so as we page an object out and back in,
 457 * individual pages in it will have different bit 17 addresses, resulting in
 458 * each 64 bytes being swapped with its neighbor!
 459 *
 460 * Otherwise, if interleaved, we have to tell the 3d driver what the address
 461 * swizzling it needs to do is, since it's writing with the CPU to the pages
 462 * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
 463 * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
 464 * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
 465 * to match what the GPU expects.
 466 */
 467
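/*
 * Illustrative sketch, not part of the driver: for the common
 * I915_BIT_6_SWIZZLE_9_10 mode described above, the correction the CPU has
 * to apply when accessing a tiled buffer through a linear mapping is to XOR
 * bits 9 and 10 of the address into bit 6. The helper name is made up.
 */
static u32 __maybe_unused swizzle_bit_6_9_10_sketch(u32 addr)
{
        /* Move bit 9 and bit 10 down to bit position 6 and XOR them in. */
        return addr ^ (((addr >> 3) ^ (addr >> 4)) & BIT(6));
}
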
 468/**
 469 * i915_gem_detect_bit_6_swizzle - detect bit 6 swizzling pattern
 470 * @dev_priv: i915 device private
 471 *
 472 * Detects bit 6 swizzling of address lookup between IGD access and CPU
 473 * access through main memory.
 474 */
 475void
 476i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv)
 477{
 478        uint32_t swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
 479        uint32_t swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
 480
 481        if (INTEL_GEN(dev_priv) >= 8 || IS_VALLEYVIEW(dev_priv)) {
 482                /*
 483                 * On BDW+, swizzling is not used. We leave the CPU memory
 484                 * controller in charge of optimizing memory accesses without
 485                 * the extra address manipulation GPU side.
 486                 *
 487                 * VLV and CHV don't have GPU swizzling.
 488                 */
 489                swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 490                swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 491        } else if (INTEL_GEN(dev_priv) >= 6) {
 492                if (dev_priv->preserve_bios_swizzle) {
 493                        if (I915_READ(DISP_ARB_CTL) &
 494                            DISP_TILE_SURFACE_SWIZZLING) {
 495                                swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 496                                swizzle_y = I915_BIT_6_SWIZZLE_9;
 497                        } else {
 498                                swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 499                                swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 500                        }
 501                } else {
 502                        uint32_t dimm_c0, dimm_c1;
 503                        dimm_c0 = I915_READ(MAD_DIMM_C0);
 504                        dimm_c1 = I915_READ(MAD_DIMM_C1);
 505                        dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
 506                        dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
 507                        /* Enable swizzling when the channels are populated
  508                         * with identically sized DIMMs. We don't need to check
  509                         * the 3rd channel because no CPU with a GPU attached
  510                         * ships in that configuration. Also, swizzling only
 511                         * makes sense for 2 channels anyway. */
 512                        if (dimm_c0 == dimm_c1) {
 513                                swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 514                                swizzle_y = I915_BIT_6_SWIZZLE_9;
 515                        } else {
 516                                swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 517                                swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 518                        }
 519                }
 520        } else if (IS_GEN5(dev_priv)) {
  521                /* On Ironlake, whatever the DRAM config, the GPU always
  522                 * does the same swizzling setup.
  523                 */
 524                swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 525                swizzle_y = I915_BIT_6_SWIZZLE_9;
 526        } else if (IS_GEN2(dev_priv)) {
 527                /* As far as we know, the 865 doesn't have these bit 6
 528                 * swizzling issues.
 529                 */
 530                swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 531                swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 532        } else if (IS_MOBILE(dev_priv) ||
 533                   IS_I915G(dev_priv) || IS_I945G(dev_priv)) {
 534                uint32_t dcc;
 535
 536                /* On 9xx chipsets, channel interleave by the CPU is
 537                 * determined by DCC.  For single-channel, neither the CPU
 538                 * nor the GPU do swizzling.  For dual channel interleaved,
  539                 * the GPU's interleave is bits 9 and 10 for X tiled, and bit
 540                 * 9 for Y tiled.  The CPU's interleave is independent, and
 541                 * can be based on either bit 11 (haven't seen this yet) or
 542                 * bit 17 (common).
 543                 */
 544                dcc = I915_READ(DCC);
 545                switch (dcc & DCC_ADDRESSING_MODE_MASK) {
 546                case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
 547                case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
 548                        swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 549                        swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 550                        break;
 551                case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
 552                        if (dcc & DCC_CHANNEL_XOR_DISABLE) {
 553                                /* This is the base swizzling by the GPU for
 554                                 * tiled buffers.
 555                                 */
 556                                swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 557                                swizzle_y = I915_BIT_6_SWIZZLE_9;
 558                        } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
 559                                /* Bit 11 swizzling by the CPU in addition. */
 560                                swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
 561                                swizzle_y = I915_BIT_6_SWIZZLE_9_11;
 562                        } else {
 563                                /* Bit 17 swizzling by the CPU in addition. */
 564                                swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
 565                                swizzle_y = I915_BIT_6_SWIZZLE_9_17;
 566                        }
 567                        break;
 568                }
 569
 570                /* check for L-shaped memory aka modified enhanced addressing */
 571                if (IS_GEN4(dev_priv) &&
 572                    !(I915_READ(DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
 573                        swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
 574                        swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
 575                }
 576
 577                if (dcc == 0xffffffff) {
 578                        DRM_ERROR("Couldn't read from MCHBAR.  "
 579                                  "Disabling tiling.\n");
 580                        swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
 581                        swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
 582                }
 583        } else {
  584                /* The 965, G33, and newer have a very flexible memory
 585                 * configuration.  It will enable dual-channel mode
 586                 * (interleaving) on as much memory as it can, and the GPU
 587                 * will additionally sometimes enable different bit 6
 588                 * swizzling for tiled objects from the CPU.
 589                 *
 590                 * Here's what I found on the G965:
 591                 *    slot fill         memory size  swizzling
 592                 * 0A   0B   1A   1B    1-ch   2-ch
 593                 * 512  0    0    0     512    0     O
 594                 * 512  0    512  0     16     1008  X
 595                 * 512  0    0    512   16     1008  X
 596                 * 0    512  0    512   16     1008  X
 597                 * 1024 1024 1024 0     2048   1024  O
 598                 *
 599                 * We could probably detect this based on either the DRB
 600                 * matching, which was the case for the swizzling required in
 601                 * the table above, or from the 1-ch value being less than
 602                 * the minimum size of a rank.
 603                 *
 604                 * Reports indicate that the swizzling actually
 605                 * varies depending upon page placement inside the
 606                 * channels, i.e. we see swizzled pages where the
 607                 * banks of memory are paired and unswizzled on the
 608                 * uneven portion, so leave that as unknown.
 609                 */
 610                if (I915_READ16(C0DRB3) == I915_READ16(C1DRB3)) {
 611                        swizzle_x = I915_BIT_6_SWIZZLE_9_10;
 612                        swizzle_y = I915_BIT_6_SWIZZLE_9;
 613                }
 614        }
 615
 616        if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
 617            swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
 618                /* Userspace likes to explode if it sees unknown swizzling,
  619                 * so lie. We finish the lie when answering the
  620                 * get-tiling ioctl, where the physical swizzle mode is
  621                 * reported as unknown instead.
 622                 *
 623                 * As we don't strictly know what the swizzling is, it may be
 624                 * bit17 dependent, and so we need to also prevent the pages
 625                 * from being moved.
 626                 */
 627                dev_priv->quirks |= QUIRK_PIN_SWIZZLED_PAGES;
 628                swizzle_x = I915_BIT_6_SWIZZLE_NONE;
 629                swizzle_y = I915_BIT_6_SWIZZLE_NONE;
 630        }
 631
 632        dev_priv->mm.bit_6_swizzle_x = swizzle_x;
 633        dev_priv->mm.bit_6_swizzle_y = swizzle_y;
 634}
 635
 636/*
  637 * Swap each pair of adjacent 64-byte blocks of this page, to account for it
  638 * having a new bit 17 in its physical address and therefore being interpreted
  639 * differently by the GPU.
 640 */
 641static void
 642i915_gem_swizzle_page(struct page *page)
 643{
 644        char temp[64];
 645        char *vaddr;
 646        int i;
 647
 648        vaddr = kmap(page);
 649
 650        for (i = 0; i < PAGE_SIZE; i += 128) {
 651                memcpy(temp, &vaddr[i], 64);
 652                memcpy(&vaddr[i], &vaddr[i + 64], 64);
 653                memcpy(&vaddr[i + 64], temp, 64);
 654        }
 655
 656        kunmap(page);
 657}
 658
 659/**
 660 * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
 661 * @obj: i915 GEM buffer object
 662 * @pages: the scattergather list of physical pages
 663 *
 664 * This function fixes up the swizzling in case any page frame number for this
 665 * object has changed in bit 17 since that state has been saved with
 666 * i915_gem_object_save_bit_17_swizzle().
 667 *
 668 * This is called when pinning backing storage again, since the kernel is free
 669 * to move unpinned backing storage around (either by directly moving pages or
 670 * by swapping them out and back in again).
 671 */
 672void
 673i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
 674                                  struct sg_table *pages)
 675{
 676        struct sgt_iter sgt_iter;
 677        struct page *page;
 678        int i;
 679
 680        if (obj->bit_17 == NULL)
 681                return;
 682
 683        i = 0;
 684        for_each_sgt_page(page, sgt_iter, pages) {
 685                char new_bit_17 = page_to_phys(page) >> 17;
 686                if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
 687                        i915_gem_swizzle_page(page);
 688                        set_page_dirty(page);
 689                }
 690                i++;
 691        }
 692}
 693
 694/**
 695 * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
 696 * @obj: i915 GEM buffer object
 697 * @pages: the scattergather list of physical pages
 698 *
 699 * This function saves the bit 17 of each page frame number so that swizzling
 700 * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
 701 * be called before the backing storage can be unpinned.
 702 */
 703void
 704i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
 705                                    struct sg_table *pages)
 706{
 707        const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
 708        struct sgt_iter sgt_iter;
 709        struct page *page;
 710        int i;
 711
 712        if (obj->bit_17 == NULL) {
 713                obj->bit_17 = kcalloc(BITS_TO_LONGS(page_count),
 714                                      sizeof(long), GFP_KERNEL);
 715                if (obj->bit_17 == NULL) {
 716                        DRM_ERROR("Failed to allocate memory for bit 17 "
 717                                  "record\n");
 718                        return;
 719                }
 720        }
 721
 722        i = 0;
 723
 724        for_each_sgt_page(page, sgt_iter, pages) {
 725                if (page_to_phys(page) & (1 << 17))
 726                        __set_bit(i, obj->bit_17);
 727                else
 728                        __clear_bit(i, obj->bit_17);
 729                i++;
 730        }
 731}
 732
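/*
 * Illustrative sketch, not part of the driver and never called: the two
 * helpers above are used as a pair around an unpin/re-pin cycle of the
 * backing storage. The wrapper name is made up; it only shows the intended
 * ordering.
 */
static void __maybe_unused
example_bit_17_round_trip(struct drm_i915_gem_object *obj,
                          struct sg_table *pages)
{
        /* Record bit 17 of every page before the pages may be moved. */
        i915_gem_object_save_bit_17_swizzle(obj, pages);

        /* ... the pages are unpinned here and may be swapped out and in ... */

        /* After re-pinning, swap 64-byte halves wherever bit 17 changed. */
        i915_gem_object_do_bit_17_swizzle(obj, pages);
}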