linux/drivers/gpu/drm/i915/i915_gpu_error.c
/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "display/intel_atomic.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_scatterlist.h"
#include "intel_csr.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
        if (id >= I915_NUM_ENGINES)
                return NULL;

        return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
        return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
        return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
        switch (tiling) {
        default:
        case I915_TILING_NONE: return "";
        case I915_TILING_X: return " X";
        case I915_TILING_Y: return " Y";
        }
}

static const char *dirty_flag(int dirty)
{
        return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
        return purgeable ? " purgeable" : "";
}

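/*
 * The formatted error state is stored in kmalloc'd chunks tracked by a
 * scatterlist. The pages are never DMA mapped, so sg->dma_address is
 * repurposed to hold the logical offset of each chunk within the whole
 * stream, letting readers seek without walking every preceding chunk.
 */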
static void __sg_set_buf(struct scatterlist *sg,
                         void *addr, unsigned int len, loff_t it)
{
        sg->page_link = (unsigned long)virt_to_page(addr);
        sg->offset = offset_in_page(addr);
        sg->length = len;
        sg->dma_address = it;
}

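/*
 * Ensure there is room for at least len+1 more bytes: seal the current
 * buffer into a scatterlist entry, chain on a fresh page of entries if the
 * table is full, then allocate a new buffer, trying a cheap opportunistic
 * kmalloc before falling back to a smaller GFP_KERNEL allocation.
 */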
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
        if (!len)
                return false;

        if (e->bytes + len + 1 <= e->size)
                return true;

        if (e->bytes) {
                __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
                e->iter += e->bytes;
                e->buf = NULL;
                e->bytes = 0;
        }

        if (e->cur == e->end) {
                struct scatterlist *sgl;

                sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
                if (!sgl) {
                        e->err = -ENOMEM;
                        return false;
                }

                if (e->cur) {
                        e->cur->offset = 0;
                        e->cur->length = 0;
                        e->cur->page_link =
                                (unsigned long)sgl | SG_CHAIN;
                } else {
                        e->sgl = sgl;
                }

                e->cur = sgl;
                e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
        }

        e->size = ALIGN(len + 1, SZ_64K);
        e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!e->buf) {
                e->size = PAGE_ALIGN(len + 1);
                e->buf = kmalloc(e->size, GFP_KERNEL);
        }
        if (!e->buf) {
                e->err = -ENOMEM;
                return false;
        }

        return true;
}

__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
                               const char *fmt, va_list args)
{
        va_list ap;
        int len;

        if (e->err)
                return;

        va_copy(ap, args);
        len = vsnprintf(NULL, 0, fmt, ap);
        va_end(ap);
        if (len <= 0) {
                e->err = len;
                return;
        }

        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes >= e->size);
        len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
        if (len < 0) {
                e->err = len;
                return;
        }
        e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
        unsigned len;

        if (e->err || !str)
                return;

        len = strlen(str);
        if (!__i915_error_grow(e, len))
                return;

        GEM_BUG_ON(e->bytes + len > e->size);
        memcpy(e->buf + e->bytes, str, len);
        e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
        i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
        struct drm_printer p = {
                .printfn = __i915_printfn_error,
                .arg = e,
        };
        return p;
}

#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

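/*
 * With CONFIG_DRM_I915_COMPRESS_ERROR, captured pages are deflated with
 * zlib before being stored. Everything is allocated GFP_ATOMIC since the
 * capture runs in atomic context; c->tmp, when available, is a staging page
 * so that write-combining memory can be pulled in with the fast
 * i915_memcpy_from_wc() before being fed to the compressor.
 */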
struct compress {
        struct z_stream_s zstream;
        void *tmp;
};

static bool compress_init(struct compress *c)
{
        struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

        zstream->workspace =
                kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                        GFP_ATOMIC | __GFP_NOWARN);
        if (!zstream->workspace)
                return false;

        if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
                kfree(zstream->workspace);
                return false;
        }

        c->tmp = NULL;
        if (i915_has_memcpy_from_wc())
                c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

        return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
        unsigned long page;

        if (dst->page_count >= dst->num_pages)
                return ERR_PTR(-ENOSPC);

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return ERR_PTR(-ENOMEM);

        return dst->pages[dst->page_count++] = (void *)page;
}

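/*
 * Deflate a single page into dst, allocating output pages on demand. The
 * NMI watchdog is touched on every iteration, as compressing a large object
 * in atomic context can take long enough to trip it.
 */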
static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zstream->next_in = src;
        if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
                zstream->next_in = c->tmp;
        zstream->avail_in = PAGE_SIZE;

        do {
                if (zstream->avail_out == 0) {
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                }

                if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
                        return -EIO;

                touch_nmi_watchdog();
        } while (zstream->avail_in);

        /* Fall back to uncompressed if we increase size? */
        if (0 && zstream->total_out > zstream->total_in)
                return -E2BIG;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        do {
                switch (zlib_deflate(zstream, Z_FINISH)) {
                case Z_OK: /* more space requested */
                        zstream->next_out = compress_next_page(dst);
                        if (IS_ERR(zstream->next_out))
                                return PTR_ERR(zstream->next_out);

                        zstream->avail_out = PAGE_SIZE;
                        break;

                case Z_STREAM_END:
                        goto end;

                default: /* any error */
                        return -EIO;
                }
        } while (1);

end:
        memset(zstream->next_out, 0, zstream->avail_out);
        dst->unused = zstream->avail_out;
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        struct z_stream_s *zstream = &c->zstream;

        zlib_deflateEnd(zstream);
        kfree(zstream->workspace);
        if (c->tmp)
                free_page((unsigned long)c->tmp);
}

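/*
 * A one-character marker prefixing the object dump tells the decoder how to
 * interpret the ascii85 stream that follows: ':' for zlib-deflated pages,
 * '~' for raw uncompressed pages.
 */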
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
        return true;
}

static int compress_page(struct compress *c,
                         void *src,
                         struct drm_i915_error_object *dst)
{
        unsigned long page;
        void *ptr;

        page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
        if (!page)
                return -ENOMEM;

        ptr = (void *)page;
        if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
                memcpy(ptr, src, PAGE_SIZE);
        dst->pages[dst->page_count++] = ptr;

        return 0;
}

static int compress_flush(struct compress *c,
                          struct drm_i915_error_object *dst)
{
        return 0;
}

static void compress_fini(struct compress *c,
                          struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
        err_puts(m, "~");
}

#endif

static void print_error_buffers(struct drm_i915_error_state_buf *m,
                                const char *name,
                                struct drm_i915_error_buffer *err,
                                int count)
{
        err_printf(m, "%s [%d]:\n", name, count);

        while (count--) {
                err_printf(m, "    %08x_%08x %8u %02x %02x",
                           upper_32_bits(err->gtt_offset),
                           lower_32_bits(err->gtt_offset),
                           err->size,
                           err->read_domains,
                           err->write_domain);
                err_puts(m, tiling_flag(err->tiling));
                err_puts(m, dirty_flag(err->dirty));
                err_puts(m, purgeable_flag(err->purgeable));
                err_puts(m, err->userptr ? " userptr" : "");
                err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

                if (err->name)
                        err_printf(m, " (name: %d)", err->name);
                if (err->fence_reg != I915_FENCE_REG_NONE)
                        err_printf(m, " (fence: %d)", err->fence_reg);

                err_puts(m, "\n");
                err++;
        }
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
                                 const struct drm_i915_error_engine *ee)
{
        int slice;
        int subslice;

        err_printf(m, "  INSTDONE: 0x%08x\n",
                   ee->instdone.instdone);

        if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
                return;

        err_printf(m, "  SC_INSTDONE: 0x%08x\n",
                   ee->instdone.slice_common);

        if (INTEL_GEN(m->i915) <= 6)
                return;

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.sampler[slice][subslice]);

        for_each_instdone_slice_subslice(m->i915, slice, subslice)
                err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
                           slice, subslice,
                           ee->instdone.row[slice][subslice]);
}

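/*
 * One line per request: the '!' suffix on the seqno marks a fence that had
 * already signaled, and '+' one with signaling enabled, mirroring the
 * DMA_FENCE_FLAG_* bits sampled at capture time.
 */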
static void error_print_request(struct drm_i915_error_state_buf *m,
                                const char *prefix,
                                const struct drm_i915_error_request *erq,
                                const unsigned long epoch)
{
        if (!erq->seqno)
                return;

        err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
                   prefix, erq->pid, erq->context, erq->seqno,
                   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
                            &erq->flags) ? "!" : "",
                   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
                            &erq->flags) ? "+" : "",
                   erq->sched_attr.priority,
                   jiffies_to_msecs(erq->jiffies - epoch),
                   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct drm_i915_error_context *ctx)
{
        err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
                   header, ctx->comm, ctx->pid, ctx->hw_id,
                   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
                               const struct drm_i915_error_engine *ee,
                               const unsigned long epoch)
{
        int n;

        err_printf(m, "%s command stream:\n",
                   engine_name(m->i915, ee->engine_id));
        err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
        err_printf(m, "  START: 0x%08x\n", ee->start);
        err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
        err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
                   ee->tail, ee->rq_post, ee->rq_tail);
        err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
        err_printf(m, "  MODE:  0x%08x\n", ee->mode);
        err_printf(m, "  HWS:   0x%08x\n", ee->hws);
        err_printf(m, "  ACTHD: 0x%08x %08x\n",
                   (u32)(ee->acthd>>32), (u32)ee->acthd);
        err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
        err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

        error_print_instdone(m, ee);

        if (ee->batchbuffer) {
                u64 start = ee->batchbuffer->gtt_offset;
                u64 end = start + ee->batchbuffer->gtt_size;

                err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
                           upper_32_bits(start), lower_32_bits(start),
                           upper_32_bits(end), lower_32_bits(end));
        }
        if (INTEL_GEN(m->i915) >= 4) {
                err_printf(m, "  BBADDR: 0x%08x_%08x\n",
                           (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
                err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
                err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
        }
        err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
        err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
                   lower_32_bits(ee->faddr));
        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
                err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
        }
        if (HAS_PPGTT(m->i915)) {
                err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

                if (INTEL_GEN(m->i915) >= 8) {
                        int i;
                        for (i = 0; i < 4; i++)
                                err_printf(m, "  PDP%d: 0x%016llx\n",
                                           i, ee->vm_info.pdp[i]);
                } else {
                        err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
                                   ee->vm_info.pp_dir_base);
                }
        }
        err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
        err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
        err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
                   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
                   ee->hangcheck_timestamp,
                   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
        err_printf(m, "  engine reset count: %u\n", ee->reset_count);

        for (n = 0; n < ee->num_ports; n++) {
                err_printf(m, "  ELSP[%d]:", n);
                error_print_request(m, " ", &ee->execlist[n], epoch);
        }

        error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
        va_list args;

        va_start(args, f);
        i915_error_vprintf(e, f, args);
        va_end(args);
}

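/*
 * Dump the captured pages of an object as a single ascii85 string, trimming
 * the tail of the final page that the compressor left unused.
 */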
static void print_error_obj(struct drm_i915_error_state_buf *m,
                            struct intel_engine_cs *engine,
                            const char *name,
                            struct drm_i915_error_object *obj)
{
        char out[ASCII85_BUFSZ];
        int page;

        if (!obj)
                return;

        if (name) {
                err_printf(m, "%s --- %s = 0x%08x %08x\n",
                           engine ? engine->name : "global", name,
                           upper_32_bits(obj->gtt_offset),
                           lower_32_bits(obj->gtt_offset));
        }

        err_compression_marker(m);
        for (page = 0; page < obj->page_count; page++) {
                int i, len;

                len = PAGE_SIZE;
                if (page == obj->page_count - 1)
                        len -= obj->unused;
                len = ascii85_encode_len(len);

                for (i = 0; i < len; i++)
                        err_puts(m, ascii85_encode(obj->pages[page][i], out));
        }
        err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
                                   const struct intel_device_info *info,
                                   const struct intel_runtime_info *runtime,
                                   const struct intel_driver_caps *caps)
{
        struct drm_printer p = i915_error_printer(m);

        intel_device_info_dump_flags(info, &p);
        intel_driver_caps_print(caps, &p);
        intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
                             const struct i915_params *params)
{
        struct drm_printer p = i915_error_printer(m);

        i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
                            struct drm_i915_private *i915)
{
        struct pci_dev *pdev = i915->drm.pdev;

        err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
        err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
        err_printf(m, "PCI Subsystem: %04x:%04x\n",
                   pdev->subsystem_vendor,
                   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
                         const struct i915_error_uc *error_uc)
{
        struct drm_printer p = i915_error_printer(m);
        const struct i915_gpu_state *error =
                container_of(error_uc, typeof(*error), uc);

        if (!error->device_info.has_guc)
                return;

        intel_uc_fw_dump(&error_uc->guc_fw, &p);
        intel_uc_fw_dump(&error_uc->huc_fw, &p);
        print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
        while (sgl) {
                struct scatterlist *sg;

                for (sg = sgl; !sg_is_chain(sg); sg++) {
                        kfree(sg_virt(sg));
                        if (sg_is_last(sg))
                                break;
                }

                sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
                free_page((unsigned long)sgl);
                sgl = sg;
        }
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                               struct i915_gpu_state *error)
{
        struct drm_i915_error_object *obj;
        struct timespec64 ts;
        int i, j;

        if (*error->error_msg)
                err_printf(m, "%s\n", error->error_msg);
        err_printf(m, "Kernel: %s %s\n",
                   init_utsname()->release,
                   init_utsname()->machine);
        ts = ktime_to_timespec64(error->time);
        err_printf(m, "Time: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->boottime);
        err_printf(m, "Boottime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        ts = ktime_to_timespec64(error->uptime);
        err_printf(m, "Uptime: %lld s %ld us\n",
                   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
        err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
        err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
                   error->capture,
                   jiffies_to_msecs(jiffies - error->capture),
                   jiffies_to_msecs(error->capture - error->epoch));

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (!error->engine[i].context.pid)
                        continue;

                err_printf(m, "Active process (on ring %s): %s [%d]\n",
                           engine_name(m->i915, i),
                           error->engine[i].context.comm,
                           error->engine[i].context.pid);
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
        err_printf(m, "Suspend count: %u\n", error->suspend_count);
        err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
        err_printf(m, "Subplatform: 0x%x\n",
                   intel_subplatform(&error->runtime_info,
                                     error->device_info.platform));
        err_print_pciid(m, m->i915);

        err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

        if (HAS_CSR(m->i915)) {
                struct intel_csr *csr = &m->i915->csr;

                err_printf(m, "DMC loaded: %s\n",
                           yesno(csr->dmc_payload != NULL));
                err_printf(m, "DMC fw version: %d.%d\n",
                           CSR_VERSION_MAJOR(csr->version),
                           CSR_VERSION_MINOR(csr->version));
        }

        err_printf(m, "GT awake: %s\n", yesno(error->awake));
        err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
        err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
        for (i = 0; i < error->ngtier; i++)
                err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
        err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);

        for (i = 0; i < error->nfence; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

        if (INTEL_GEN(m->i915) >= 6) {
                err_printf(m, "ERROR: 0x%08x\n", error->error);

                if (INTEL_GEN(m->i915) >= 8)
                        err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
                                   error->fault_data1, error->fault_data0);

                err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
        }

        if (IS_GEN(m->i915, 7))
                err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (error->engine[i].engine_id != -1)
                        error_print_engine(m, &error->engine[i], error->epoch);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
                char buf[128];
                int len, first = 1;

                if (!error->active_vm[i])
                        break;

                len = scnprintf(buf, sizeof(buf), "Active (");
                for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
                        if (error->engine[j].vm != error->active_vm[i])
                                continue;

                        len += scnprintf(buf + len, sizeof(buf) - len, "%s%s",
                                         first ? "" : ", ",
                                         m->i915->engine[j]->name);
                        first = 0;
                }
                scnprintf(buf + len, sizeof(buf) - len, ")");
                print_error_buffers(m, buf,
                                    error->active_bo[i],
                                    error->active_bo_count[i]);
        }

        print_error_buffers(m, "Pinned (global)",
                            error->pinned_bo,
                            error->pinned_bo_count);

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                const struct drm_i915_error_engine *ee = &error->engine[i];

                obj = ee->batchbuffer;
                if (obj) {
                        err_puts(m, m->i915->engine[i]->name);
                        if (ee->context.pid)
                                err_printf(m, " (submitted by %s [%d])",
                                           ee->context.comm,
                                           ee->context.pid);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
                        print_error_obj(m, m->i915->engine[i], NULL, obj);
                }

                for (j = 0; j < ee->user_bo_count; j++)
                        print_error_obj(m, m->i915->engine[i],
                                        "user", ee->user_bo[j]);

                if (ee->num_requests) {
                        err_printf(m, "%s --- %d requests\n",
                                   m->i915->engine[i]->name,
                                   ee->num_requests);
                        for (j = 0; j < ee->num_requests; j++)
                                error_print_request(m, " ",
                                                    &ee->requests[j],
                                                    error->epoch);
                }

                print_error_obj(m, m->i915->engine[i],
                                "ringbuffer", ee->ringbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "HW Status", ee->hws_page);

                print_error_obj(m, m->i915->engine[i],
                                "HW context", ee->ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA context", ee->wa_ctx);

                print_error_obj(m, m->i915->engine[i],
                                "WA batchbuffer", ee->wa_batchbuffer);

                print_error_obj(m, m->i915->engine[i],
                                "NULL context", ee->default_state);
        }

        if (error->overlay)
                intel_overlay_print_error_state(m, error->overlay);

        if (error->display)
                intel_display_print_error_state(m, error->display);

        err_print_capabilities(m, &error->device_info, &error->runtime_info,
                               &error->driver_caps);
        err_print_params(m, &error->params);
        err_print_uc(m, &error->uc);
}

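/*
 * Format the error state into a scatterlist exactly once; whoever loses the
 * cmpxchg race to publish error->sgl just frees their duplicate.
 */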
static int err_print_to_sgl(struct i915_gpu_state *error)
{
        struct drm_i915_error_state_buf m;

        if (IS_ERR(error))
                return PTR_ERR(error);

        if (READ_ONCE(error->sgl))
                return 0;

        memset(&m, 0, sizeof(m));
        m.i915 = error->i915;

        __err_print_to_sgl(&m, error);

        if (m.buf) {
                __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
                m.bytes = 0;
                m.buf = NULL;
        }
        if (m.cur) {
                GEM_BUG_ON(m.end < m.cur);
                sg_mark_end(m.cur - 1);
        }
        GEM_BUG_ON(m.sgl && !m.cur);

        if (m.err) {
                err_free_sgl(m.sgl);
                return m.err;
        }

        if (cmpxchg(&error->sgl, NULL, m.sgl))
                err_free_sgl(m.sgl);

        return 0;
}

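/*
 * Copy a range of the formatted error state into a caller-supplied buffer,
 * e.g. on behalf of the debugfs/sysfs error readers. The walk resumes from
 * the cached error->fit entry unless the requested offset precedes it,
 * seeking by the logical offsets stashed in sg->dma_address.
 */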
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
                                      char *buf, loff_t off, size_t rem)
{
        struct scatterlist *sg;
        size_t count;
        loff_t pos;
        int err;

        if (!error || !rem)
                return 0;

        err = err_print_to_sgl(error);
        if (err)
                return err;

        sg = READ_ONCE(error->fit);
        if (!sg || off < sg->dma_address)
                sg = error->sgl;
        if (!sg)
                return 0;

        pos = sg->dma_address;
        count = 0;
        do {
                size_t len, start;

                if (sg_is_chain(sg)) {
                        sg = sg_chain_ptr(sg);
                        GEM_BUG_ON(sg_is_chain(sg));
                }

                len = sg->length;
                if (pos + len <= off) {
                        pos += len;
                        continue;
                }

                start = sg->offset;
                if (pos < off) {
                        GEM_BUG_ON(off - pos > len);
                        len -= off - pos;
                        start += off - pos;
                        pos = off;
                }

                len = min(len, rem);
                GEM_BUG_ON(!len || len > sg->length);

                memcpy(buf, page_address(sg_page(sg)) + start, len);

                count += len;
                pos += len;

                buf += len;
                rem -= len;
                if (!rem) {
                        WRITE_ONCE(error->fit, sg);
                        break;
                }
        } while (!sg_is_last(sg++));

        return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
        int page;

        if (obj == NULL)
                return;

        for (page = 0; page < obj->page_count; page++)
                free_page((unsigned long)obj->pages[page]);

        kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
        i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
        struct i915_error_uc *error_uc = &error->uc;

        kfree(error_uc->guc_fw.path);
        kfree(error_uc->huc_fw.path);
        i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
        struct i915_gpu_state *error =
                container_of(error_ref, typeof(*error), ref);
        long i, j;

        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                struct drm_i915_error_engine *ee = &error->engine[i];

                for (j = 0; j < ee->user_bo_count; j++)
                        i915_error_object_free(ee->user_bo[j]);
                kfree(ee->user_bo);

                i915_error_object_free(ee->batchbuffer);
                i915_error_object_free(ee->wa_batchbuffer);
                i915_error_object_free(ee->ringbuffer);
                i915_error_object_free(ee->hws_page);
                i915_error_object_free(ee->ctx);
                i915_error_object_free(ee->wa_ctx);

                kfree(ee->requests);
        }

        for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
                kfree(error->active_bo[i]);
        kfree(error->pinned_bo);

        kfree(error->overlay);
        kfree(error->display);

        cleanup_params(error);
        cleanup_uc_state(error);

        err_free_sgl(error->sgl);
        kfree(error);
}

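/*
 * Snapshot the contents of a vma: each backing page is bound in turn into
 * the GGTT slot reserved at ggtt->error_capture, read through an atomic
 * write-combining mapping, and compressed into the destination object.
 */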
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
                         struct i915_vma *vma)
{
        struct i915_ggtt *ggtt = &i915->ggtt;
        const u64 slot = ggtt->error_capture.start;
        struct drm_i915_error_object *dst;
        struct compress compress;
        unsigned long num_pages;
        struct sgt_iter iter;
        dma_addr_t dma;
        int ret;

        if (!vma || !vma->pages)
                return NULL;

        num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
        num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worst-case zlib growth */
        dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
                      GFP_ATOMIC | __GFP_NOWARN);
        if (!dst)
                return NULL;

        dst->gtt_offset = vma->node.start;
        dst->gtt_size = vma->node.size;
        dst->num_pages = num_pages;
        dst->page_count = 0;
        dst->unused = 0;

        if (!compress_init(&compress)) {
                kfree(dst);
                return NULL;
        }

        ret = -EINVAL;
        for_each_sgt_dma(dma, iter, vma->pages) {
                void __iomem *s;

                ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

                s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
                ret = compress_page(&compress, (void __force *)s, dst);
                io_mapping_unmap_atomic(s);
                if (ret)
                        break;
        }

        if (ret || compress_flush(&compress, dst)) {
                while (dst->page_count--)
                        free_page((unsigned long)dst->pages[dst->page_count]);
                kfree(dst);
                dst = NULL;
        }

        compress_fini(&compress, dst);
        return dst;
}

static void capture_bo(struct drm_i915_error_buffer *err,
                       struct i915_vma *vma)
{
        struct drm_i915_gem_object *obj = vma->obj;

        err->size = obj->base.size;
        err->name = obj->base.name;

        err->gtt_offset = vma->node.start;
        err->read_domains = obj->read_domains;
        err->write_domain = obj->write_domain;
        err->fence_reg = vma->fence ? vma->fence->id : -1;
        err->tiling = i915_gem_object_get_tiling(obj);
        err->dirty = obj->mm.dirty;
        err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
        err->userptr = obj->userptr.mm != NULL;
        err->cache_level = obj->cache_level;
}

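/*
 * Record up to @count buffers from the vm's bound list into @err, filtered
 * by the ACTIVE_ONLY/PINNED_ONLY flags defined just below the declarator.
 */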
static u32 capture_error_bo(struct drm_i915_error_buffer *err,
                            int count, struct list_head *head,
                            unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
        struct i915_vma *vma;
        int i = 0;

        list_for_each_entry(vma, head, vm_link) {
                if (!vma->obj)
                        continue;

                if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
                        continue;

                if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
                        continue;

                capture_bo(err++, vma);
                if (++i == count)
                        break;
        }

        return i;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to help avoid falsely duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error,
                                    intel_engine_mask_t engine_mask)
{
        /*
         * IPEHR would be an ideal way to detect errors, as it's the gross
         * measure of "the command that hung." However, it contains some very
         * common synchronization commands which almost always appear in
         * cases that are strictly a client bug. Use instdone to
         * differentiate those somewhat.
         */
        if (engine_mask) {
                /* ffs() is 1-based, so step back to the 0-based engine index */
                struct drm_i915_error_engine *ee =
                        &error->engine[ffs(engine_mask) - 1];

                return ee->ipehr ^ ee->instdone.instdone;
        }

        return 0;
}

static void gem_record_fences(struct i915_gpu_state *error)
{
        struct drm_i915_private *dev_priv = error->i915;
        struct intel_uncore *uncore = &dev_priv->uncore;
        int i;

        if (INTEL_GEN(dev_priv) >= 6) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_GEN6_LO(i));
        } else if (INTEL_GEN(dev_priv) >= 4) {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read64(uncore,
                                                    FENCE_REG_965_LO(i));
        } else {
                for (i = 0; i < dev_priv->ggtt.num_fences; i++)
                        error->fence[i] =
                                intel_uncore_read(uncore, FENCE_REG(i));
        }
        error->nfence = i;
}

static void error_record_engine_registers(struct i915_gpu_state *error,
                                          struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        struct drm_i915_private *dev_priv = engine->i915;

        if (INTEL_GEN(dev_priv) >= 6) {
                ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
                if (INTEL_GEN(dev_priv) >= 8)
                        ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
                else
                        ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
        }

        if (INTEL_GEN(dev_priv) >= 4) {
                ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
                ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
                ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
                ee->instps = ENGINE_READ(engine, RING_INSTPS);
                ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
                if (INTEL_GEN(dev_priv) >= 8) {
                        ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
                        ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
                }
                ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
        } else {
                ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
                ee->ipeir = ENGINE_READ(engine, IPEIR);
                ee->ipehr = ENGINE_READ(engine, IPEHR);
        }

        intel_engine_get_instdone(engine, &ee->instdone);

        ee->instpm = ENGINE_READ(engine, RING_INSTPM);
        ee->acthd = intel_engine_get_active_head(engine);
        ee->start = ENGINE_READ(engine, RING_START);
        ee->head = ENGINE_READ(engine, RING_HEAD);
        ee->tail = ENGINE_READ(engine, RING_TAIL);
        ee->ctl = ENGINE_READ(engine, RING_CTL);
        if (INTEL_GEN(dev_priv) > 2)
                ee->mode = ENGINE_READ(engine, RING_MI_MODE);

        if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
                i915_reg_t mmio;

                if (IS_GEN(dev_priv, 7)) {
                        switch (engine->id) {
                        default:
                                MISSING_CASE(engine->id);
                                /* fall through */
                        case RCS0:
                                mmio = RENDER_HWS_PGA_GEN7;
                                break;
                        case BCS0:
                                mmio = BLT_HWS_PGA_GEN7;
                                break;
                        case VCS0:
                                mmio = BSD_HWS_PGA_GEN7;
                                break;
                        case VECS0:
                                mmio = VEBOX_HWS_PGA_GEN7;
                                break;
                        }
                } else if (IS_GEN(engine->i915, 6)) {
                        mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
                } else {
                        /* XXX: gen8 returns to sanity */
                        mmio = RING_HWS_PGA(engine->mmio_base);
                }

                ee->hws = I915_READ(mmio);
        }

        ee->idle = intel_engine_is_idle(engine);
        if (!ee->idle)
                ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
        ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
                                                  engine);

        if (HAS_PPGTT(dev_priv)) {
                int i;

                ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

                if (IS_GEN(dev_priv, 6)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
                } else if (IS_GEN(dev_priv, 7)) {
                        ee->vm_info.pp_dir_base =
                                ENGINE_READ(engine, RING_PP_DIR_BASE);
                } else if (INTEL_GEN(dev_priv) >= 8) {
                        u32 base = engine->mmio_base;

                        for (i = 0; i < 4; i++) {
                                ee->vm_info.pdp[i] =
                                        I915_READ(GEN8_RING_PDP_UDW(base, i));
                                ee->vm_info.pdp[i] <<= 32;
                                ee->vm_info.pdp[i] |=
                                        I915_READ(GEN8_RING_PDP_LDW(base, i));
                        }
                }
        }
}

static void record_request(struct i915_request *request,
                           struct drm_i915_error_request *erq)
{
        struct i915_gem_context *ctx = request->gem_context;

        erq->flags = request->fence.flags;
        erq->context = request->fence.context;
        erq->seqno = request->fence.seqno;
        erq->sched_attr = request->sched.attr;
        erq->jiffies = request->emitted_jiffies;
        erq->start = i915_ggtt_offset(request->ring->vma);
        erq->head = request->head;
        erq->tail = request->tail;

        rcu_read_lock();
        erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
        rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
                                   struct i915_request *first,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_request *request;
        int count;

        count = 0;
        request = first;
        list_for_each_entry_from(request, &engine->active.requests, sched.link)
                count++;
        if (!count)
                return;

        ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
        if (!ee->requests)
                return;

        ee->num_requests = count;

        count = 0;
        request = first;
        list_for_each_entry_from(request,
                                 &engine->active.requests, sched.link) {
                if (count >= ee->num_requests) {
                        /*
                         * If the ring request list was changed in
                         * between the point where the error request
                         * list was created and dimensioned and this
                         * point then just exit early to avoid crashes.
                         *
                         * We don't need to communicate that the
                         * request list changed state during error
                         * state capture and that the error state is
                         * slightly incorrect as a consequence since we
                         * are typically only interested in the request
                         * list state at the point of error state
                         * capture, not in any changes happening during
                         * the capture.
                         */
                        break;
                }

                record_request(request, &ee->requests[count++]);
        }
        ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
{
        const struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned int n;

        for (n = 0; n < execlists_num_ports(execlists); n++) {
                struct i915_request *rq = port_request(&execlists->port[n]);

                if (!rq)
                        break;

                record_request(rq, &ee->execlist[n]);
        }

        ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
                           struct i915_gem_context *ctx)
{
        if (ctx->pid) {
                struct task_struct *task;

                rcu_read_lock();
                task = pid_task(ctx->pid, PIDTYPE_PID);
                if (task) {
                        strcpy(e->comm, task->comm);
                        e->pid = task->pid;
                }
                rcu_read_unlock();
        }

        e->hw_id = ctx->hw_id;
        e->sched_attr = ctx->sched;
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
                                   struct drm_i915_error_engine *ee)
{
        struct i915_capture_list *c;
        struct drm_i915_error_object **bo;
        long count, max;

        max = 0;
        for (c = request->capture_list; c; c = c->next)
                max++;
        if (!max)
                return;

        bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        if (!bo) {
                /* If we can't capture everything, try to capture something. */
                max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
                bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
        }
        if (!bo)
                return;

        count = 0;
        for (c = request->capture_list; c; c = c->next) {
                bo[count] = i915_error_object_create(request->i915, c->vma);
                if (!bo[count])
                        break;
                if (++count == max)
                        break;
        }

        ee->user_bo = bo;
        ee->user_bo_count = count;
}

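/*
 * Objects such as the engine's default context image have no GGTT binding
 * of their own, so wrap them in a throwaway vma (node.start == U64_MAX
 * marks the absence of a GTT offset) and reuse the normal capture path.
 */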
1393static struct drm_i915_error_object *
1394capture_object(struct drm_i915_private *dev_priv,
1395               struct drm_i915_gem_object *obj)
1396{
1397        if (obj && i915_gem_object_has_pages(obj)) {
1398                struct i915_vma fake = {
1399                        .node = { .start = U64_MAX, .size = obj->base.size },
1400                        .size = obj->base.size,
1401                        .pages = obj->mm.pages,
1402                        .obj = obj,
1403                };
1404
1405                return i915_error_object_create(dev_priv, &fake);
1406        } else {
1407                return NULL;
1408        }
1409}
1410
1411static void gem_record_rings(struct i915_gpu_state *error)
1412{
1413        struct drm_i915_private *i915 = error->i915;
1414        struct i915_ggtt *ggtt = &i915->ggtt;
1415        int i;
1416
1417        for (i = 0; i < I915_NUM_ENGINES; i++) {
1418                struct intel_engine_cs *engine = i915->engine[i];
1419                struct drm_i915_error_engine *ee = &error->engine[i];
1420                struct i915_request *request;
1421                unsigned long flags;
1422
1423                ee->engine_id = -1;
1424
1425                if (!engine)
1426                        continue;
1427
1428                ee->engine_id = i;
1429
1430                error_record_engine_registers(error, engine, ee);
1431                error_record_engine_execlists(engine, ee);
1432
1433                spin_lock_irqsave(&engine->active.lock, flags);
1434                request = intel_engine_find_active_request(engine);
1435                if (request) {
1436                        struct i915_gem_context *ctx = request->gem_context;
1437                        struct intel_ring *ring = request->ring;
1438
1439                        ee->vm = ctx->vm ?: &ggtt->vm;
1440
1441                        record_context(&ee->context, ctx);
1442
1443                        /* We need to copy these to an anonymous buffer
1444                         * as the simplest method to avoid being overwritten
1445                         * by userspace.
1446                         */
1447                        ee->batchbuffer =
1448                                i915_error_object_create(i915, request->batch);
1449
1450                        if (HAS_BROKEN_CS_TLB(i915))
1451                                ee->wa_batchbuffer =
1452                                        i915_error_object_create(i915,
1453                                                                 i915->gt.scratch);
1454                        request_record_user_bo(request, ee);
1455
1456                        ee->ctx =
1457                                i915_error_object_create(i915,
1458                                                         request->hw_context->state);
1459
1460                        error->simulated |=
1461                                i915_gem_context_no_error_capture(ctx);
1462
1463                        ee->rq_head = request->head;
1464                        ee->rq_post = request->postfix;
1465                        ee->rq_tail = request->tail;
1466
1467                        ee->cpu_ring_head = ring->head;
1468                        ee->cpu_ring_tail = ring->tail;
1469                        ee->ringbuffer =
1470                                i915_error_object_create(i915, ring->vma);
1471
1472                        engine_record_requests(engine, request, ee);
1473                }
1474                spin_unlock_irqrestore(&engine->active.lock, flags);
1475
1476                ee->hws_page =
1477                        i915_error_object_create(i915,
1478                                                 engine->status_page.vma);
1479
1480                ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
1481
1482                ee->default_state = capture_object(i915, engine->default_state);
1483        }
1484}
1485
static void gem_capture_vm(struct i915_gpu_state *error,
			   struct i915_address_space *vm,
			   int idx)
{
	struct drm_i915_error_buffer *active_bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->bound_list, vm_link)
		if (i915_vma_is_active(vma))
			count++;

	active_bo = NULL;
	if (count)
		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
	if (active_bo)
		count = capture_error_bo(active_bo,
					 count, &vm->bound_list,
					 ACTIVE_ONLY);
	else
		count = 0;

	error->active_vm[idx] = vm;
	error->active_bo[idx] = active_bo;
	error->active_bo_count[idx] = count;
}

static void capture_active_buffers(struct i915_gpu_state *error)
{
	int cnt = 0, i, j;

	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

	/* Scan each engine looking for unique active contexts/vm */
	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];
		bool found;

		if (!ee->vm)
			continue;

		found = false;
		for (j = 0; j < i && !found; j++)
			found = error->engine[j].vm == ee->vm;
		if (!found)
			gem_capture_vm(error, ee->vm, cnt++);
	}
}

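/* Record the buffers pinned into the global GTT at the time of the error. */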
static void capture_pinned_buffers(struct i915_gpu_state *error)
{
	struct i915_address_space *vm = &error->i915->ggtt.vm;
	struct drm_i915_error_buffer *bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->bound_list, vm_link)
		count++;

	bo = NULL;
	if (count)
		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	error->pinned_bo_count =
		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
	error->pinned_bo = bo;
}

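/* Capture a snapshot of the GuC/HuC firmware state and the GuC log. */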
static void capture_uc_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;

	/* Capturing uC state won't be useful if there is no GuC */
	if (!error->device_info.has_guc)
		return;

	error_uc->guc_fw = i915->guc.fw;
	error_uc->huc_fw = i915->huc.fw;

	/* Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct intel_uncore *uncore = &i915->uncore;
	int i;

	/* General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(i915)) {
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ier = intel_uncore_read(uncore, VLV_IER);
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
	}

	if (IS_GEN(i915, 7))
		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

	if (INTEL_GEN(i915) >= 8) {
		error->fault_data0 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(i915, 6)) {
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(i915) >= 7)
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

	if (INTEL_GEN(i915) >= 6) {
		error->derrmr = intel_uncore_read(uncore, DERRMR);
		error->error = intel_uncore_read(uncore, ERROR_GEN6);
		error->done_reg = intel_uncore_read(uncore, DONE_REG);
	}

	if (INTEL_GEN(i915) >= 5)
		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));

	/* 3: Feature specific registers */
	if (IS_GEN_RANGE(i915, 6, 7)) {
		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(i915) >= 11) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		error->gtier[0] =
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] =
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] =
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] =
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] =
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] =
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(i915) >= 8) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = intel_uncore_read(uncore,
							    GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(i915)) {
		error->ier = intel_uncore_read(uncore, DEIER);
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(i915, 2)) {
		error->ier = intel_uncore_read16(uncore, GEN2_IER);
	} else if (!IS_VALLEYVIEW(i915)) {
		error->ier = intel_uncore_read(uncore, GEN2_IER);
	}
	error->eir = intel_uncore_read(uncore, EIR);
	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

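/*
 * Generate the one-line summary used for both dmesg and the error state
 * header: the hang "ecode", the first executing process (if known) and
 * any message supplied by the caller.
 */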
static const char *
error_msg(struct i915_gpu_state *error,
	  intel_engine_mask_t engines, const char *msg)
{
	int len;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
		if (!error->engine[i].context.pid)
			engines &= ~BIT(i);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%x:0x%08x",
			INTEL_GEN(error->i915), engines,
			i915_error_generate_code(error, engines));
	if (engines) {
		/* Just show the first executing process, more is confusing */
		i = __ffs(engines);
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[i].context.comm,
				 error->engine[i].context.pid);
	}
	if (msg)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", %s", msg);

	return error->error_msg;
}

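/*
 * Record device-wide software state: power management status, reset and
 * suspend counts, and copies of the device/runtime info and driver caps.
 */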
static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
	error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
	i915_params_copy(&error->params, &i915_modparams);
}

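/*
 * Use the earliest hangcheck timestamp of any engine, falling back to the
 * capture time itself, as the epoch against which the error state is dated.
 */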
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_timestamp &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

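/* Clear the GGTT PTEs of the slot reserved for reading back buffers. */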
static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

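/*
 * capture() runs as a stop_machine() callback so that the snapshot is
 * taken atomically with respect to the rest of the system.
 */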
static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

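/**
 * i915_capture_gpu_state - snapshot the current GPU state
 * @i915: i915 device
 *
 * Allocate an error state and fill it under stop_machine() so that the
 * snapshot is self-consistent. Returns an ERR_PTR if error capture has
 * been disabled or if the allocation fails.
 */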
struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error.  Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      const char *msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			 i915->drm.primary->index);
		warned = true;
	}
}

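/**
 * i915_first_error_state - acquire a reference to the first error
 * @i915: i915 device
 *
 * Returns a reference to the earliest error state still held by the
 * driver, NULL if none has been captured, or an ERR_PTR if capture has
 * been disabled.
 */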
struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

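/**
 * i915_reset_error_state - discard the currently held error state
 * @i915: i915 device
 *
 * Drop the driver's reference to the first error so that a subsequent
 * hang may be captured; a sticky -ENODEV (capture disabled) is preserved.
 */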
void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_put(error);
}

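/**
 * i915_disable_error_state - mark error capture as disabled
 * @i915: i915 device
 * @err: errno to report in place of an error state
 *
 * Replace an empty first_error pointer with an ERR_PTR so that future
 * captures are rejected and readers see @err instead.
 */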
void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}
