linux/drivers/gpu/drm/i915/i915_perf.c
/*
 * Copyright © 2015-2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *   Robert Bragg <robert@sixbynine.org>
 */


/**
 * DOC: i915 Perf Overview
 *
 * Gen graphics supports a large number of performance counters that can help
 * driver and application developers understand and optimize their use of the
 * GPU.
 *
 * This i915 perf interface enables userspace to configure and open a file
 * descriptor representing a stream of GPU metrics which can then be read() as
 * a stream of sample records.
 *
 * The interface is particularly suited to exposing buffered metrics that are
 * captured by DMA from the GPU, unsynchronized with and unrelated to the CPU.
 *
 * Streams representing a single context are accessible to applications with a
 * corresponding drm file descriptor, such that OpenGL can use the interface
 * without special privileges. Access to system-wide metrics requires root
 * privileges by default, unless changed via the dev.i915.perf_stream_paranoid
 * sysctl option.
 *
 */
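
/*
 * For illustration only (an editorial sketch, not part of the driver): a
 * minimal userspace sequence for opening and reading an OA stream with the
 * interface described above might look as follows. The metrics set ID is a
 * placeholder that would normally be discovered via sysfs.
 *
 *        uint64_t properties[] = {
 *                DRM_I915_PERF_PROP_SAMPLE_OA, 1,
 *                DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
 *                DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 *                DRM_I915_PERF_PROP_OA_EXPONENT, 16,
 *        };
 *        struct drm_i915_perf_open_param param = {
 *                .flags = I915_PERF_FLAG_FD_CLOEXEC,
 *                .num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
 *                .properties_ptr = (uintptr_t)properties,
 *        };
 *        int stream_fd = ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
 *
 * The returned fd can then simply be read() to receive a stream of records,
 * each prefixed by a struct drm_i915_perf_record_header.
 */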

/**
 * DOC: i915 Perf History and Comparison with Core Perf
 *
 * The interface was initially inspired by the core Perf infrastructure but
 * some notable differences are:
 *
 * i915 perf file descriptors represent a "stream" instead of an "event"; where
 * a perf event primarily corresponds to a single 64bit value, while a stream
 * might sample sets of tightly-coupled counters, depending on the
 * configuration.  For example the Gen OA unit isn't designed to support
 * orthogonal configurations of individual counters; it's configured for a set
 * of related counters. Samples for an i915 perf stream capturing OA metrics
 * will include a set of counter values packed in a compact HW specific format.
 * The OA unit supports a number of different packing formats which can be
 * selected by the user opening the stream. Perf has support for grouping
 * events, but each event in the group is configured, validated and
 * authenticated individually with separate system calls.
 *
 * i915 perf stream configurations are provided as an array of u64 (key,value)
 * pairs, instead of a fixed struct with multiple miscellaneous config members,
 * interleaved with event-type specific members.
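 *
 * For example (an illustrative sketch; metrics_set_id is a placeholder),
 * enabling OA report sampling for a particular metrics set reduces to a
 * flat array of pairs:
 *
 *        u64 properties[] = {
 *                DRM_I915_PERF_PROP_SAMPLE_OA, 1,
 *                DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
 *        };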
 *
 * i915 perf doesn't support exposing metrics via an mmap'd circular buffer.
 * The supported metrics are being written to memory by the GPU unsynchronized
 * with the CPU, using HW specific packing formats for counter sets. Sometimes
 * the constraints on HW configuration require reports to be filtered before it
 * would be acceptable to expose them to unprivileged applications - to hide
 * the metrics of other processes/contexts. For these use cases a read() based
 * interface is a good fit, and provides an opportunity to filter data as it
 * gets copied from the GPU mapped buffers to userspace buffers.
 *
 *
 * Issues hit with first prototype based on Core Perf
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The first prototype of this driver was based on the core perf
 * infrastructure, and while we did make that mostly work, with some changes to
 * perf, we found we were breaking or working around too many assumptions baked
 * into perf's current cpu centric design.
 *
 * In the end we didn't see a clear benefit to making perf's implementation and
 * interface more complex by changing design assumptions while we knew we still
 * wouldn't be able to use any existing perf based userspace tools.
 *
 * Also considering the Gen specific nature of the Observability hardware and
 * how userspace will sometimes need to combine i915 perf OA metrics with
 * side-band OA data captured via MI_REPORT_PERF_COUNT commands; we're
 * expecting the interface to be used by a platform specific userspace such as
 * OpenGL or tools. This is to say; we aren't inherently missing out on having
 * a standard vendor/architecture agnostic interface by not using perf.
 *
 *
 * For posterity, in case we might re-visit trying to adapt core perf to be
 * better suited to exposing i915 metrics these were the main pain points we
 * hit:
 *
 * - The perf based OA PMU driver broke some significant design assumptions:
 *
 *   Existing perf pmus are used for profiling work on a cpu and we were
 *   introducing the idea of _IS_DEVICE pmus with different security
 *   implications, the need to fake cpu-related data (such as user/kernel
 *   registers) to fit with perf's current design, and adding _DEVICE records
 *   as a way to forward device-specific status records.
 *
 *   The OA unit writes reports of counters into a circular buffer, without
 *   involvement from the CPU, making our PMU driver the first of a kind.
 *
 *   Given the way we were periodically forwarding data from the GPU-mapped OA
 *   buffer to perf's buffer, those bursts of sample writes looked to perf like
 *   we were sampling too fast and so we had to subvert its throttling checks.
 *
 *   Perf supports groups of counters and allows those to be read via
 *   transactions internally but transactions currently seem designed to be
 *   explicitly initiated from the cpu (say in response to a userspace read())
 *   and while we could pull a report out of the OA buffer we can't
 *   trigger a report from the cpu on demand.
 *
 *   Related to being report based; the OA counters are configured in HW as a
 *   set while perf generally expects counter configurations to be orthogonal.
 *   Although counters can be associated with a group leader as they are
 *   opened, there's no clear precedent for being able to provide group-wide
 *   configuration attributes (for example we want to let userspace choose the
 *   OA unit report format used to capture all counters in a set, or specify a
 *   GPU context to filter metrics on). We avoided using perf's grouping
 *   feature and forwarded OA reports to userspace via perf's 'raw' sample
 *   field. This suited our userspace well considering how coupled the counters
 *   are when dealing with normalizing. It would be inconvenient to split
 *   counters up into separate events, only to require userspace to recombine
 *   them. For Mesa it's also convenient to be forwarded raw, periodic reports
 *   for combining with the side-band raw reports it captures using
 *   MI_REPORT_PERF_COUNT commands.
 *
 *   - As a side note on perf's grouping feature; there was also some concern
 *     that using PERF_FORMAT_GROUP as a way to pack together counter values
 *     would quite drastically inflate our sample sizes, which would likely
 *     lower the effective sampling resolutions we could use when the available
 *     memory bandwidth is limited.
 *
 *     With the OA unit's report formats, counters are packed together as 32
 *     or 40bit values, with the largest report size being 256 bytes.
 *
 *     PERF_FORMAT_GROUP values are 64bit, but there doesn't appear to be a
 *     documented ordering to the values, implying PERF_FORMAT_ID must also be
 *     used to add a 64bit ID before each value; giving 16 bytes per counter.
 *
 *   Related to counter orthogonality; we can't time share the OA unit, while
 *   event scheduling is a central design idea within perf for allowing
 *   userspace to open + enable more events than can be configured in HW at any
 *   one time.  The OA unit is not designed to allow re-configuration while in
 *   use. We can't reconfigure the OA unit without losing internal OA unit
 *   state which we can't access explicitly to save and restore. Reconfiguring
 *   the OA unit is also relatively slow, involving ~100 register writes. From
 *   userspace Mesa also depends on a stable OA configuration when emitting
 *   MI_REPORT_PERF_COUNT commands and importantly the OA unit can't be
 *   disabled while there are outstanding MI_RPC commands lest we hang the
 *   command streamer.
 *
 *   The contents of sample records aren't extensible by device drivers (i.e.
 *   the sample_type bits). As an example; Sourab Gupta had been looking to
 *   attach GPU timestamps to our OA samples. We were shoehorning OA reports
 *   into sample records by using the 'raw' field, but it's tricky to pack more
 *   than one thing into this field because events/core.c currently only lets a
 *   pmu give a single raw data pointer plus len which will be copied into the
 *   ring buffer. To include more than the OA report we'd have to copy the
 *   report into an intermediate larger buffer. I'd been considering allowing a
 *   vector of data+len values to be specified for copying the raw data, but
 *   it felt like a kludge to be using the raw field for this purpose.
 *
 * - It felt like our perf based PMU was making some technical compromises
 *   just for the sake of using perf:
 *
 *   perf_event_open() requires events to either relate to a pid or a specific
 *   cpu core, while our device pmu related to neither.  Events opened with a
 *   pid will be automatically enabled/disabled according to the scheduling of
 *   that process - so not appropriate for us. When an event is related to a
 *   cpu id, perf ensures pmu methods will be invoked via an inter-processor
 *   interrupt on that core. To avoid invasive changes our userspace opened OA
 *   perf events for a specific cpu. This was workable but it meant the
 *   majority of the OA driver ran in atomic context, including all OA report
 *   forwarding, which wasn't really necessary in our case and seemed to make
 *   our locking requirements somewhat complex as we handled the interaction
 *   with the rest of the i915 driver.
 */

#include <linux/anon_inodes.h>
#include <linux/sizes.h>
#include <linux/uuid.h>

#include "i915_drv.h"
#include "i915_oa_hsw.h"
#include "i915_oa_bdw.h"
#include "i915_oa_chv.h"
#include "i915_oa_sklgt2.h"
#include "i915_oa_sklgt3.h"
#include "i915_oa_sklgt4.h"
#include "i915_oa_bxt.h"
#include "i915_oa_kblgt2.h"
#include "i915_oa_kblgt3.h"
#include "i915_oa_glk.h"
#include "i915_oa_cflgt2.h"
#include "i915_oa_cflgt3.h"
#include "i915_oa_cnl.h"
#include "i915_oa_icl.h"
#include "intel_lrc_reg.h"

/* HW requires this to be a power of two, between 128k and 16M, though driver
 * is currently generally designed assuming the largest 16M size is used such
 * that the overflow cases are unlikely in normal operation.
 */
#define OA_BUFFER_SIZE          SZ_16M

#define OA_TAKEN(tail, head)    ((tail - head) & (OA_BUFFER_SIZE - 1))
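
/*
 * Worked example (illustrative): with OA_BUFFER_SIZE = 16M, a head of
 * (OA_BUFFER_SIZE - 64) and a tail that has wrapped around to 128 gives
 * OA_TAKEN(128, OA_BUFFER_SIZE - 64) = (128 - (SZ_16M - 64)) & (SZ_16M - 1)
 * = 192 bytes available, i.e. masking with (OA_BUFFER_SIZE - 1) makes the
 * subtraction robust against wrap-around of the circular buffer offsets.
 */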

/**
 * DOC: OA Tail Pointer Race
 *
 * There's a HW race condition between OA unit tail pointer register updates and
 * writes to memory whereby the tail pointer can sometimes get ahead of what's
 * been written out to the OA buffer so far (in terms of what's visible to the
 * CPU).
 *
 * Although this can be observed explicitly while copying reports to userspace
 * by checking for a zeroed report-id field in tail reports, we want to account
 * for this earlier, as part of the oa_buffer_check to avoid lots of redundant
 * read() attempts.
 *
 * In effect we define a tail pointer for reading that lags the real tail
 * pointer by at least %OA_TAIL_MARGIN_NSEC nanoseconds, which gives enough
 * time for the corresponding reports to become visible to the CPU.
 *
 * To manage this we actually track two tail pointers:
 *  1) An 'aging' tail with an associated timestamp that is tracked until we
 *     can trust the corresponding data is visible to the CPU; at which point
 *     it is considered 'aged'.
 *  2) An 'aged' tail that can be used for read()ing.
 *
 * The two separate pointers let us decouple read()s from tail pointer aging.
 *
 * The tail pointers are checked and updated at a limited rate within a hrtimer
 * callback (the same callback that is used for delivering EPOLLIN events).
 *
 * Initially the tails are marked invalid with %INVALID_TAIL_PTR which
 * indicates that an updated tail pointer is needed.
 *
 * Most of the implementation details for this workaround are in
 * oa_buffer_check_unlocked() and _append_oa_reports()
 *
 * Note for posterity: previously the driver used to define an effective tail
 * pointer that lagged the real pointer by a 'tail margin' measured in bytes
 * derived from %OA_TAIL_MARGIN_NSEC and the configured sampling frequency.
 * This was flawed considering that the OA unit may also automatically generate
 * non-periodic reports (such as on context switch) or the OA unit may be
 * enabled without any periodic sampling.
 */
#define OA_TAIL_MARGIN_NSEC     100000ULL
#define INVALID_TAIL_PTR        0xffffffff
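
/*
 * An illustrative timeline of the two-tail scheme (assuming the 5ms hrtimer
 * period defined below): at t=0 a callback reads a new hw tail and starts
 * aging it; by the next callback at t=5ms, more than OA_TAIL_MARGIN_NSEC
 * (100us) has passed, so that tail is promoted to the 'aged' tail that
 * read()s may consume up to, while any newer hw tail starts aging in its
 * place.
 */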

/* frequency for checking whether the OA unit has written new reports to the
 * circular OA buffer...
 */
#define POLL_FREQUENCY 200
#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)

/* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */
static int zero;
static int one = 1;
static u32 i915_perf_stream_paranoid = true;

/* The maximum exponent the hardware accepts is 63 (essentially it selects one
 * of the 64bit timestamp bits to trigger reports from) but there's currently
 * no known use case for sampling as infrequently as once per 47 thousand years.
 *
 * Since the timestamps included in OA reports are only 32bits it seems
 * reasonable to limit the OA exponent where it's still possible to account for
 * overflow in OA report timestamps.
 */
#define OA_EXPONENT_MAX 31
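
/*
 * Worked example (illustrative): the sampling period for a given exponent
 * is (2^(exponent + 1)) / timestamp-frequency. Assuming Haswell's 12.5MHz
 * timestamp frequency, exponent 0 gives 2 / 12.5MHz = 160ns (the fastest
 * rate mentioned below) and OA_EXPONENT_MAX = 31 gives 2^32 ticks, roughly
 * 343 seconds - the same interval at which the 32bit report timestamps
 * wrap.
 */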

#define INVALID_CTX_ID 0xffffffff

/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
#define OAREPORT_REASON_MASK           0x3f
#define OAREPORT_REASON_SHIFT          19
#define OAREPORT_REASON_TIMER          (1<<0)
#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
#define OAREPORT_REASON_CLK_RATIO      (1<<5)


/* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
 *
 * The highest sampling frequency we can theoretically program the OA unit
 * with is always half the timestamp frequency: E.g. 6.25MHz for Haswell.
 *
 * Initialized just before we register the sysctl parameter.
 */
static int oa_sample_rate_hard_limit;

/* Theoretically we can program the OA unit to sample every 160ns but don't
 * allow that by default unless root...
 *
 * The default threshold of 100000Hz is based on perf's similar
 * kernel.perf_event_max_sample_rate sysctl parameter.
 */
static u32 i915_oa_max_sample_rate = 100000;

/* XXX: beware if future OA HW adds new report formats that the current
 * code assumes all reports have a power-of-two size and ~(size - 1) can
 * be used as a mask to align the OA tail pointer.
 */
static const struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
        [I915_OA_FORMAT_A13]        = { 0, 64 },
        [I915_OA_FORMAT_A29]        = { 1, 128 },
        [I915_OA_FORMAT_A13_B8_C8]  = { 2, 128 },
        /* A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size */
        [I915_OA_FORMAT_B4_C8]      = { 4, 64 },
        [I915_OA_FORMAT_A45_B8_C8]  = { 5, 256 },
        [I915_OA_FORMAT_B4_C8_A16]  = { 6, 128 },
        [I915_OA_FORMAT_C4_B8]      = { 7, 64 },
};

static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
        [I915_OA_FORMAT_A12]                = { 0, 64 },
        [I915_OA_FORMAT_A12_B8_C8]          = { 2, 128 },
        [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
        [I915_OA_FORMAT_C4_B8]              = { 7, 64 },
};
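
/*
 * For a sense of scale (illustrative arithmetic from the constants above):
 * with the largest 256 byte report format the 16M OA buffer holds
 * SZ_16M / 256 = 65536 reports, so at the fastest 160ns sampling period it
 * wraps after roughly 65536 * 160ns ~= 10.5ms unless drained by the
 * hrtimer/read() machinery.
 */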

#define SAMPLE_OA_REPORT      (1<<0)

/**
 * struct perf_open_properties - for validated properties given to open a stream
 * @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
 * @single_context: Whether a single or all gpu contexts should be monitored
 * @ctx_handle: A gem ctx handle for use with @single_context
 * @metrics_set: An ID for an OA unit metric set advertised via sysfs
 * @oa_format: An OA unit HW report format
 * @oa_periodic: Whether to enable periodic OA unit sampling
 * @oa_period_exponent: The OA unit sampling period is derived from this
 *
 * As read_properties_unlocked() enumerates and validates the properties given
 * to open a stream of metrics the configuration is built up in the structure
 * which starts out zero initialized.
 */
struct perf_open_properties {
        u32 sample_flags;

        u64 single_context:1;
        u64 ctx_handle;

        /* OA sampling state */
        int metrics_set;
        int oa_format;
        bool oa_periodic;
        int oa_period_exponent;
};

static void free_oa_config(struct drm_i915_private *dev_priv,
                           struct i915_oa_config *oa_config)
{
        if (!PTR_ERR(oa_config->flex_regs))
                kfree(oa_config->flex_regs);
        if (!PTR_ERR(oa_config->b_counter_regs))
                kfree(oa_config->b_counter_regs);
        if (!PTR_ERR(oa_config->mux_regs))
                kfree(oa_config->mux_regs);
        kfree(oa_config);
}

static void put_oa_config(struct drm_i915_private *dev_priv,
                          struct i915_oa_config *oa_config)
{
        if (!atomic_dec_and_test(&oa_config->ref_count))
                return;

        free_oa_config(dev_priv, oa_config);
}

static int get_oa_config(struct drm_i915_private *dev_priv,
                         int metrics_set,
                         struct i915_oa_config **out_config)
{
        int ret;

        if (metrics_set == 1) {
                *out_config = &dev_priv->perf.oa.test_config;
                atomic_inc(&dev_priv->perf.oa.test_config.ref_count);
                return 0;
        }

        ret = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
        if (ret)
                return ret;

        *out_config = idr_find(&dev_priv->perf.metrics_idr, metrics_set);
        if (!*out_config)
                ret = -EINVAL;
        else
                atomic_inc(&(*out_config)->ref_count);

        mutex_unlock(&dev_priv->perf.metrics_lock);

        return ret;
}
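
/*
 * Illustrative pairing of the helpers above (not an actual call site): a
 * caller takes a reference with get_oa_config() and must balance it with
 * put_oa_config() once the config is no longer needed:
 *
 *        struct i915_oa_config *oa_config;
 *        int err = get_oa_config(dev_priv, metrics_set, &oa_config);
 *
 *        if (err)
 *                return err;
 *        ...program the HW from oa_config...
 *        put_oa_config(dev_priv, oa_config);
 */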

static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
{
        return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
}

static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
{
        u32 oastatus1 = I915_READ(GEN7_OASTATUS1);

        return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
}

/**
 * oa_buffer_check_unlocked - check for data and update tail ptr state
 * @dev_priv: i915 device instance
 *
 * This is either called via fops (for blocking reads in user ctx) or the poll
 * check hrtimer (atomic ctx) to check the OA buffer tail pointer and check
 * if there is data available for userspace to read.
 *
 * This function is central to providing a workaround for the OA unit tail
 * pointer having a race with respect to what data is visible to the CPU.
 * It is responsible for reading tail pointers from the hardware and giving
 * the pointers time to 'age' before they are made available for reading.
 * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
 *
 * Besides returning true when there is data available to read() this function
 * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
 * and .aged_tail_idx state used for reading.
 *
 * Note: It's safe to read OA config state here unlocked, assuming that this is
 * only called while the stream is enabled, while the global OA configuration
 * can't be modified.
 *
 * Returns: %true if the OA buffer contains data, else %false
 */
static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
{
        int report_size = dev_priv->perf.oa.oa_buffer.format_size;
        unsigned long flags;
        unsigned int aged_idx;
        u32 head, hw_tail, aged_tail, aging_tail;
        u64 now;

        /* We have to consider the (unlikely) possibility that read() errors
         * could result in an OA buffer reset which might reset the head,
         * tails[] and aged_tail state.
         */
        spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

        /* NB: The head we observe here might effectively be a little out of
         * date (between head and tails[aged_idx].offset) if there is currently
         * a read() in progress.
         */
        head = dev_priv->perf.oa.oa_buffer.head;

        aged_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
        aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
        aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;

        hw_tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);

        /* The tail pointer increases in 64 byte increments,
         * not in report_size steps...
         */
        hw_tail &= ~(report_size - 1);

        now = ktime_get_mono_fast_ns();

        /* Update the aged tail
         *
         * Flip the tail pointer available for read()s once the aging tail is
         * old enough to trust that the corresponding data will be visible to
         * the CPU...
         *
         * Do this before updating the aging pointer in case we may be able to
         * immediately start aging a new pointer too (if new data has become
         * available) without needing to wait for a later hrtimer callback.
         */
        if (aging_tail != INVALID_TAIL_PTR &&
            ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
             OA_TAIL_MARGIN_NSEC)) {

                aged_idx ^= 1;
                dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;

                aged_tail = aging_tail;

                /* Mark that we need a new pointer to start aging... */
                dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
                aging_tail = INVALID_TAIL_PTR;
        }

        /* Update the aging tail
         *
         * We throttle aging tail updates until we have a new tail that
         * represents >= one report more data than is already available for
         * reading. This ensures there will be enough data for a successful
         * read once this new pointer has aged and ensures we will give the new
         * pointer time to age.
         */
        if (aging_tail == INVALID_TAIL_PTR &&
            (aged_tail == INVALID_TAIL_PTR ||
             OA_TAKEN(hw_tail, aged_tail) >= report_size)) {
                struct i915_vma *vma = dev_priv->perf.oa.oa_buffer.vma;
                u32 gtt_offset = i915_ggtt_offset(vma);

                /* Be paranoid and do a bounds check on the pointer read back
                 * from hardware, just in case some spurious hardware condition
                 * could put the tail out of bounds...
                 */
                if (hw_tail >= gtt_offset &&
                    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
                        dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset =
                                aging_tail = hw_tail;
                        dev_priv->perf.oa.oa_buffer.aging_timestamp = now;
                } else {
                        DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
                                  hw_tail);
                }
        }

        spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

        return aged_tail == INVALID_TAIL_PTR ?
                false : OA_TAKEN(aged_tail, head) >= report_size;
}

/**
 * append_oa_status - Appends a status record to a userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 * @type: The kind of status to report to userspace
 *
 * Writes a status record (such as `DRM_I915_PERF_RECORD_OA_REPORT_LOST`)
 * into the userspace read() buffer.
 *
 * The @buf @offset will only be updated on success.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int append_oa_status(struct i915_perf_stream *stream,
                            char __user *buf,
                            size_t count,
                            size_t *offset,
                            enum drm_i915_perf_record_type type)
{
        struct drm_i915_perf_record_header header = { type, 0, sizeof(header) };

        if ((count - *offset) < header.size)
                return -ENOSPC;

        if (copy_to_user(buf + *offset, &header, sizeof(header)))
                return -EFAULT;

        (*offset) += header.size;

        return 0;
}

/**
 * append_oa_sample - Copies single OA report into userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 * @report: A single OA report to (optionally) include as part of the sample
 *
 * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
 * properties when opening a stream, tracked as `stream->sample_flags`. This
 * function copies the requested components of a single sample to the given
 * read() @buf.
 *
 * The @buf @offset will only be updated on success.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int append_oa_sample(struct i915_perf_stream *stream,
                            char __user *buf,
                            size_t count,
                            size_t *offset,
                            const u8 *report)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;
        int report_size = dev_priv->perf.oa.oa_buffer.format_size;
        struct drm_i915_perf_record_header header;
        u32 sample_flags = stream->sample_flags;

        header.type = DRM_I915_PERF_RECORD_SAMPLE;
        header.pad = 0;
        header.size = stream->sample_size;

        if ((count - *offset) < header.size)
                return -ENOSPC;

        buf += *offset;
        if (copy_to_user(buf, &header, sizeof(header)))
                return -EFAULT;
        buf += sizeof(header);

        if (sample_flags & SAMPLE_OA_REPORT) {
                if (copy_to_user(buf, report, report_size))
                        return -EFAULT;
        }

        (*offset) += header.size;

        return 0;
}
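
/*
 * For illustration only: from userspace the records appended above are
 * parsed by walking drm_i915_perf_record_header entries in the bytes
 * returned by read() (handle_oa_report() being a hypothetical application
 * helper):
 *
 *        const uint8_t *p = buf;
 *
 *        while (p < buf + len) {
 *                const struct drm_i915_perf_record_header *h = (const void *)p;
 *
 *                if (h->type == DRM_I915_PERF_RECORD_SAMPLE)
 *                        handle_oa_report(h + 1, h->size - sizeof(*h));
 *                p += h->size;
 *        }
 */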

/**
 * gen8_append_oa_reports - Copies all buffered OA reports into
 *                          userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Notably any error condition resulting in a short read (-%ENOSPC or
 * -%EFAULT) will be returned even though one or more records may
 * have been successfully copied. In this case it's up to the caller
 * to decide if the error should be squashed before returning to
 * userspace.
 *
 * Note: reports are consumed from the head, and appended to the
 * tail, so the tail chases the head?... If you think that's mad
 * and back-to-front you're not alone, but this follows the
 * Gen PRM naming convention.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int gen8_append_oa_reports(struct i915_perf_stream *stream,
                                  char __user *buf,
                                  size_t count,
                                  size_t *offset)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;
        int report_size = dev_priv->perf.oa.oa_buffer.format_size;
        u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
        u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
        u32 mask = (OA_BUFFER_SIZE - 1);
        size_t start_offset = *offset;
        unsigned long flags;
        unsigned int aged_tail_idx;
        u32 head, tail;
        u32 taken;
        int ret = 0;

        if (WARN_ON(!stream->enabled))
                return -EIO;

        spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

        head = dev_priv->perf.oa.oa_buffer.head;
        aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
        tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;

        spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

        /*
         * An invalid tail pointer here means we're still waiting for the poll
         * hrtimer callback to give us a pointer
         */
        if (tail == INVALID_TAIL_PTR)
                return -EAGAIN;

        /*
         * NB: oa_buffer.head/tail include the gtt_offset which we don't want
         * while indexing relative to oa_buf_base.
         */
        head -= gtt_offset;
        tail -= gtt_offset;

        /*
         * An out of bounds or misaligned head or tail pointer implies a driver
         * bug since we validate + align the tail pointers we read from the
         * hardware and we are in full control of the head pointer which should
         * only be incremented by multiples of the report size (notably also
         * all a power of two).
         */
        if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
                      tail > OA_BUFFER_SIZE || tail % report_size,
                      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
                      head, tail))
                return -EIO;


        for (/* none */;
             (taken = OA_TAKEN(tail, head));
             head = (head + report_size) & mask) {
                u8 *report = oa_buf_base + head;
                u32 *report32 = (void *)report;
                u32 ctx_id;
                u32 reason;

                /*
                 * All the report sizes factor neatly into the buffer
                 * size so we never expect to see a report split
                 * between the beginning and end of the buffer.
                 *
                 * Given the initial alignment check a misalignment
                 * here would imply a driver bug that would result
                 * in an overrun.
                 */
                if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
                        DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
                        break;
                }

                /*
                 * The reason field includes flags identifying what
                 * triggered this specific report (mostly timer
                 * triggered or e.g. due to a context switch).
                 *
                 * This field is never expected to be zero so we can
                 * check that the report isn't invalid before copying
                 * it to userspace...
                 */
                reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
                          OAREPORT_REASON_MASK);
                if (reason == 0) {
                        if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
                                DRM_NOTE("Skipping spurious, invalid OA report\n");
                        continue;
                }

                ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;

                /*
                 * Squash whatever is in the CTX_ID field if it's marked as
                 * invalid to be sure we avoid false-positive, single-context
                 * filtering below...
                 *
                 * Note: that we don't clear the valid_ctx_bit so userspace can
                 * understand that the ID has been squashed by the kernel.
                 */
                if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
                        ctx_id = report32[2] = INVALID_CTX_ID;

                /*
                 * NB: For Gen 8 the OA unit no longer supports clock gating
                 * off for a specific context and the kernel can't securely
                 * stop the counters from updating as system-wide / global
                 * values.
                 *
                 * Automatic reports now include a context ID so reports can be
                 * filtered on the cpu but it's not worth trying to
                 * automatically subtract/hide counter progress for other
                 * contexts while filtering since we can't stop userspace
                 * issuing MI_REPORT_PERF_COUNT commands which would still
                 * provide a side-band view of the real values.
                 *
                 * To allow userspace (such as Mesa/GL_INTEL_performance_query)
                 * to normalize counters for a single filtered context then it
                 * needs to be forwarded bookend context-switch reports so that
                 * it can track switches in between MI_REPORT_PERF_COUNT
                 * commands and can itself subtract/ignore the progress of
                 * counters associated with other contexts. Note that the
                 * hardware automatically triggers reports when switching to a
                 * new context which are tagged with the ID of the newly active
                 * context. To avoid the complexity (and likely fragility) of
                 * reading ahead while parsing reports to try and minimize
                 * forwarding redundant context switch reports (i.e. between
                 * other, unrelated contexts) we simply elect to forward them
                 * all.
                 *
                 * We don't rely solely on the reason field to identify context
                 * switches since it's not-uncommon for periodic samples to
                 * identify a switch before any 'context switch' report.
                 */
                if (!dev_priv->perf.oa.exclusive_stream->ctx ||
                    dev_priv->perf.oa.specific_ctx_id == ctx_id ||
                    (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
                     dev_priv->perf.oa.specific_ctx_id) ||
                    reason & OAREPORT_REASON_CTX_SWITCH) {

                        /*
                         * While filtering for a single context we avoid
                         * leaking the IDs of other contexts.
                         */
                        if (dev_priv->perf.oa.exclusive_stream->ctx &&
                            dev_priv->perf.oa.specific_ctx_id != ctx_id) {
                                report32[2] = INVALID_CTX_ID;
                        }

                        ret = append_oa_sample(stream, buf, count, offset,
                                               report);
                        if (ret)
                                break;

                        dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
                }

                /*
                 * The above reason field sanity check is based on
                 * the assumption that the OA buffer is initially
                 * zeroed and we reset the field after copying so the
                 * check is still meaningful once old reports start
                 * being overwritten.
                 */
                report32[0] = 0;
        }

        if (start_offset != *offset) {
                spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

                /*
                 * We removed the gtt_offset for the copy loop above, indexing
                 * relative to oa_buf_base so put back here...
                 */
                head += gtt_offset;

                I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
                dev_priv->perf.oa.oa_buffer.head = head;

                spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
        }

        return ret;
}

/**
 * gen8_oa_read - copy status records then buffered OA reports
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Checks OA unit status registers and if necessary appends corresponding
 * status records for userspace (such as for a buffer full condition) and then
 * initiates appending any buffered OA reports.
 *
 * Updates @offset according to the number of bytes successfully copied into
 * the userspace buffer.
 *
 * NB: some data may be successfully copied to the userspace buffer
 * even if an error is returned, and this is reflected in the
 * updated @offset.
 *
 * Returns: zero on success or a negative error code
 */
static int gen8_oa_read(struct i915_perf_stream *stream,
                        char __user *buf,
                        size_t count,
                        size_t *offset)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;
        u32 oastatus;
        int ret;

        if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
                return -EIO;

        oastatus = I915_READ(GEN8_OASTATUS);

        /*
         * We treat OABUFFER_OVERFLOW as a significant error:
         *
         * Although theoretically we could handle this more gracefully
         * sometimes, some Gens don't correctly suppress certain
         * automatically triggered reports in this condition and so we
         * have to assume that old reports are now being trampled
         * over.
         *
         * Considering how we don't currently give userspace control
         * over the OA buffer size and always configure a large 16MB
         * buffer, then a buffer overflow does anyway likely indicate
         * that something has gone quite badly wrong.
         */
        if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
                ret = append_oa_status(stream, buf, count, offset,
                                       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
                if (ret)
                        return ret;

                DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
                          dev_priv->perf.oa.period_exponent);

                dev_priv->perf.oa.ops.oa_disable(stream);
                dev_priv->perf.oa.ops.oa_enable(stream);

                /*
                 * Note: .oa_enable() is expected to re-init the oabuffer and
                 * reset GEN8_OASTATUS for us
                 */
                oastatus = I915_READ(GEN8_OASTATUS);
        }

        if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
                ret = append_oa_status(stream, buf, count, offset,
                                       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
                if (ret)
                        return ret;
                I915_WRITE(GEN8_OASTATUS,
                           oastatus & ~GEN8_OASTATUS_REPORT_LOST);
        }

        return gen8_append_oa_reports(stream, buf, count, offset);
}

/**
 * gen7_append_oa_reports - Copies all buffered OA reports into
 *                          userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Notably any error condition resulting in a short read (-%ENOSPC or
 * -%EFAULT) will be returned even though one or more records may
 * have been successfully copied. In this case it's up to the caller
 * to decide if the error should be squashed before returning to
 * userspace.
 *
 * Note: reports are consumed from the head, and appended to the
 * tail, so the tail chases the head?... If you think that's mad
 * and back-to-front you're not alone, but this follows the
 * Gen PRM naming convention.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int gen7_append_oa_reports(struct i915_perf_stream *stream,
                                  char __user *buf,
                                  size_t count,
                                  size_t *offset)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;
        int report_size = dev_priv->perf.oa.oa_buffer.format_size;
        u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
        u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
        u32 mask = (OA_BUFFER_SIZE - 1);
        size_t start_offset = *offset;
        unsigned long flags;
        unsigned int aged_tail_idx;
        u32 head, tail;
        u32 taken;
        int ret = 0;

        if (WARN_ON(!stream->enabled))
                return -EIO;

        spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

        head = dev_priv->perf.oa.oa_buffer.head;
        aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
        tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;

        spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

        /* An invalid tail pointer here means we're still waiting for the poll
         * hrtimer callback to give us a pointer
         */
        if (tail == INVALID_TAIL_PTR)
                return -EAGAIN;

        /* NB: oa_buffer.head/tail include the gtt_offset which we don't want
         * while indexing relative to oa_buf_base.
         */
        head -= gtt_offset;
        tail -= gtt_offset;

        /* An out of bounds or misaligned head or tail pointer implies a driver
         * bug since we validate + align the tail pointers we read from the
         * hardware and we are in full control of the head pointer which should
         * only be incremented by multiples of the report size (notably also
         * all a power of two).
         */
        if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
                      tail > OA_BUFFER_SIZE || tail % report_size,
                      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
                      head, tail))
                return -EIO;


        for (/* none */;
             (taken = OA_TAKEN(tail, head));
             head = (head + report_size) & mask) {
                u8 *report = oa_buf_base + head;
                u32 *report32 = (void *)report;

                /* All the report sizes factor neatly into the buffer
                 * size so we never expect to see a report split
                 * between the beginning and end of the buffer.
                 *
                 * Given the initial alignment check a misalignment
                 * here would imply a driver bug that would result
                 * in an overrun.
                 */
                if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
                        DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
                        break;
                }

                /* The report-ID field for periodic samples includes
                 * some undocumented flags related to what triggered
                 * the report and is never expected to be zero so we
                 * can check that the report isn't invalid before
                 * copying it to userspace...
                 */
                if (report32[0] == 0) {
                        if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
                                DRM_NOTE("Skipping spurious, invalid OA report\n");
                        continue;
                }

                ret = append_oa_sample(stream, buf, count, offset, report);
                if (ret)
                        break;

                /* The above report-id field sanity check is based on
                 * the assumption that the OA buffer is initially
                 * zeroed and we reset the field after copying so the
                 * check is still meaningful once old reports start
                 * being overwritten.
                 */
                report32[0] = 0;
        }

        if (start_offset != *offset) {
                spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);

                /* We removed the gtt_offset for the copy loop above, indexing
                 * relative to oa_buf_base so put back here...
                 */
                head += gtt_offset;

                I915_WRITE(GEN7_OASTATUS2,
                           ((head & GEN7_OASTATUS2_HEAD_MASK) |
                            GEN7_OASTATUS2_MEM_SELECT_GGTT));
                dev_priv->perf.oa.oa_buffer.head = head;

                spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
        }

        return ret;
}

/**
 * gen7_oa_read - copy status records then buffered OA reports
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Checks Gen 7 specific OA unit status registers and if necessary appends
 * corresponding status records for userspace (such as for a buffer full
 * condition) and then initiates appending any buffered OA reports.
 *
 * Updates @offset according to the number of bytes successfully copied into
 * the userspace buffer.
 *
 * Returns: zero on success or a negative error code
 */
static int gen7_oa_read(struct i915_perf_stream *stream,
                        char __user *buf,
                        size_t count,
                        size_t *offset)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;
        u32 oastatus1;
        int ret;

        if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
                return -EIO;

        oastatus1 = I915_READ(GEN7_OASTATUS1);

        /* XXX: On Haswell we don't have a safe way to clear oastatus1
         * bits while the OA unit is enabled (while the tail pointer
         * may be updated asynchronously) so we ignore status bits
         * that have already been reported to userspace.
         */
        oastatus1 &= ~dev_priv->perf.oa.gen7_latched_oastatus1;

        /* We treat OABUFFER_OVERFLOW as a significant error:
         *
         * - The status can be interpreted to mean that the buffer is
         *   currently full (with a higher precedence than OA_TAKEN()
         *   which will start to report a near-empty buffer after an
         *   overflow) but it's awkward that we can't clear the status
         *   on Haswell, so without a reset we won't be able to catch
         *   the state again.
         *
         * - Since it also implies the HW has started overwriting old
         *   reports it may also affect our sanity checks for invalid
         *   reports when copying to userspace that assume new reports
         *   are being written to cleared memory.
         *
         * - In the future we may want to introduce a flight recorder
         *   mode where the driver will automatically maintain a safe
         *   guard band between head/tail, avoiding this overflow
         *   condition, but we avoid the added driver complexity for
         *   now.
         */
        if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
                ret = append_oa_status(stream, buf, count, offset,
                                       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
                if (ret)
                        return ret;

                DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
                          dev_priv->perf.oa.period_exponent);

                dev_priv->perf.oa.ops.oa_disable(stream);
                dev_priv->perf.oa.ops.oa_enable(stream);

                oastatus1 = I915_READ(GEN7_OASTATUS1);
        }

        if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
                ret = append_oa_status(stream, buf, count, offset,
                                       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
                if (ret)
                        return ret;
                dev_priv->perf.oa.gen7_latched_oastatus1 |=
                        GEN7_OASTATUS1_REPORT_LOST;
        }

        return gen7_append_oa_reports(stream, buf, count, offset);
}

/**
 * i915_oa_wait_unlocked - handles blocking IO until OA data available
 * @stream: An i915-perf stream opened for OA metrics
 *
 * Called when userspace tries to read() from a blocking stream FD opened
 * for OA metrics. It waits until the hrtimer callback finds a non-empty
 * OA buffer and wakes us.
 *
 * Note: it's acceptable to have this return with some false positives
 * since any subsequent read handling will return -EAGAIN if there isn't
 * really data ready for userspace yet.
 *
 * Returns: zero on success or a negative error code
 */
static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;

        /* We would wait indefinitely if periodic sampling is not enabled */
        if (!dev_priv->perf.oa.periodic)
                return -EIO;

        return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
                                        oa_buffer_check_unlocked(dev_priv));
}

/**
 * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
 * @stream: An i915-perf stream opened for OA metrics
 * @file: An i915 perf stream file
 * @wait: poll() state table
 *
 * For handling userspace polling on an i915 perf stream opened for OA metrics,
 * this starts a poll_wait with the wait queue that our hrtimer callback wakes
 * when it sees data ready to read in the circular OA buffer.
 */
static void i915_oa_poll_wait(struct i915_perf_stream *stream,
                              struct file *file,
                              poll_table *wait)
{
        struct drm_i915_private *dev_priv = stream->dev_priv;

        poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
}
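
/*
 * From userspace (illustrative sketch) the matching usage is a conventional
 * poll() loop, with POLLIN indicating OA data is ready on the stream fd:
 *
 *        struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };
 *
 *        if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *                n = read(stream_fd, buf, sizeof(buf));
 */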
1179
1180/**
1181 * i915_oa_read - just calls through to &i915_oa_ops->read
1182 * @stream: An i915-perf stream opened for OA metrics
1183 * @buf: destination buffer given by userspace
1184 * @count: the number of bytes userspace wants to read
1185 * @offset: (inout): the current position for writing into @buf
1186 *
1187 * Updates @offset according to the number of bytes successfully copied into
1188 * the userspace buffer.
1189 *
1190 * Returns: zero on success or a negative error code
1191 */
1192static int i915_oa_read(struct i915_perf_stream *stream,
1193                        char __user *buf,
1194                        size_t count,
1195                        size_t *offset)
1196{
1197        struct drm_i915_private *dev_priv = stream->dev_priv;
1198
1199        return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
1200}
1201
1202static struct intel_context *oa_pin_context(struct drm_i915_private *i915,
1203                                            struct i915_gem_context *ctx)
1204{
1205        struct intel_engine_cs *engine = i915->engine[RCS0];
1206        struct intel_context *ce;
1207        int ret;
1208
1209        ret = i915_mutex_lock_interruptible(&i915->drm);
1210        if (ret)
1211                return ERR_PTR(ret);
1212
1213        /*
1214         * As the ID is the gtt offset of the context's vma we
1215         * pin the vma to ensure the ID remains fixed.
1216         *
1217         * NB: implied RCS engine...
1218         */
1219        ce = intel_context_pin(ctx, engine);
1220        mutex_unlock(&i915->drm.struct_mutex);
1221        if (IS_ERR(ce))
1222                return ce;
1223
1224        i915->perf.oa.pinned_ctx = ce;
1225
1226        return ce;
1227}
1228
1229/**
1230 * oa_get_render_ctx_id - determine and hold ctx hw id
1231 * @stream: An i915-perf stream opened for OA metrics
1232 *
1233 * Determine the render context hw id, and ensure it remains fixed for the
1234 * lifetime of the stream. This ensures that we don't have to worry about
1235 * updating the context ID in OACONTROL on the fly.
1236 *
1237 * Returns: zero on success or a negative error code
1238 */
1239static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
1240{
1241        struct drm_i915_private *i915 = stream->dev_priv;
1242        struct intel_context *ce;
1243
1244        ce = oa_pin_context(i915, stream->ctx);
1245        if (IS_ERR(ce))
1246                return PTR_ERR(ce);
1247
1248        switch (INTEL_GEN(i915)) {
1249        case 7: {
1250                /*
1251                 * On Haswell we don't do any post processing of the reports
1252                 * and don't need to use the mask.
1253                 */
1254                i915->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
1255                i915->perf.oa.specific_ctx_id_mask = 0;
1256                break;
1257        }
1258
1259        case 8:
1260        case 9:
1261        case 10:
1262                if (USES_GUC_SUBMISSION(i915)) {
1263                        /*
1264                         * When using GuC, the context descriptor we write in
1265                         * i915 is read by GuC and rewritten before it's
1266                         * actually written into the hardware. The LRCA is
1267                         * what is put into the context id field of the
1268                         * context descriptor by GuC. Because it's aligned to
1269                         * a page, the lower 12 bits are always zero and are
1270                         * dropped by GuC. They won't be part of the context
1271                         * ID in the OA reports, so squash those lower bits.
1272                         */
1273                        i915->perf.oa.specific_ctx_id =
1274                                lower_32_bits(ce->lrc_desc) >> 12;
1275
1276                        /*
1277                         * GuC uses the top bit to signal proxy submission, so
1278                         * ignore that bit.
1279                         */
1280                        i915->perf.oa.specific_ctx_id_mask =
1281                                (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
1282                } else {
1283                        i915->perf.oa.specific_ctx_id_mask =
1284                                (1U << GEN8_CTX_ID_WIDTH) - 1;
1285                        i915->perf.oa.specific_ctx_id =
1286                                upper_32_bits(ce->lrc_desc);
1287                        i915->perf.oa.specific_ctx_id &=
1288                                i915->perf.oa.specific_ctx_id_mask;
1289                }
1290                break;
1291
1292        case 11: {
1293                i915->perf.oa.specific_ctx_id_mask =
1294                        ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32) |
1295                        ((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
1296                        ((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << (GEN11_ENGINE_CLASS_SHIFT - 32);
1297                i915->perf.oa.specific_ctx_id = upper_32_bits(ce->lrc_desc);
1298                i915->perf.oa.specific_ctx_id &=
1299                        i915->perf.oa.specific_ctx_id_mask;
1300                break;
1301        }
1302
1303        default:
1304                MISSING_CASE(INTEL_GEN(i915));
1305        }
1306
1307        DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
1308                         i915->perf.oa.specific_ctx_id,
1309                         i915->perf.oa.specific_ctx_id_mask);
1310
1311        return 0;
1312}
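
/*
 * For reference (a sketch, not driver code): with the id/mask pair
 * established above, deciding whether an OA report belongs to the
 * filtered context reduces to a masked compare:
 *
 *     bool ctx_match =
 *             (report_ctx_id & i915->perf.oa.specific_ctx_id_mask) ==
 *             i915->perf.oa.specific_ctx_id;
 */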
1313
1314/**
1315 * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id that releases the hold
1316 * @stream: An i915-perf stream opened for OA metrics
1317 *
1318 * If anything was done to ensure the context HW ID would remain valid
1319 * for the lifetime of the stream, that can be undone here.
1320 */
1321static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
1322{
1323        struct drm_i915_private *dev_priv = stream->dev_priv;
1324        struct intel_context *ce;
1325
1326        dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
1327        dev_priv->perf.oa.specific_ctx_id_mask = 0;
1328
1329        ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
1330        if (ce) {
1331                mutex_lock(&dev_priv->drm.struct_mutex);
1332                intel_context_unpin(ce);
1333                mutex_unlock(&dev_priv->drm.struct_mutex);
1334        }
1335}
1336
1337static void
1338free_oa_buffer(struct drm_i915_private *i915)
1339{
1340        mutex_lock(&i915->drm.struct_mutex);
1341
1342        i915_vma_unpin_and_release(&i915->perf.oa.oa_buffer.vma,
1343                                   I915_VMA_RELEASE_MAP);
1344
1345        mutex_unlock(&i915->drm.struct_mutex);
1346
1347        i915->perf.oa.oa_buffer.vaddr = NULL;
1348}
1349
1350static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
1351{
1352        struct drm_i915_private *dev_priv = stream->dev_priv;
1353
1354        BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
1355
1356        /*
1357         * Unset exclusive_stream first, it will be checked while disabling
1358         * the metric set on gen8+.
1359         */
1360        mutex_lock(&dev_priv->drm.struct_mutex);
1361        dev_priv->perf.oa.exclusive_stream = NULL;
1362        dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
1363        mutex_unlock(&dev_priv->drm.struct_mutex);
1364
1365        free_oa_buffer(dev_priv);
1366
1367        intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
1368        intel_runtime_pm_put(dev_priv, stream->wakeref);
1369
1370        if (stream->ctx)
1371                oa_put_render_ctx_id(stream);
1372
1373        put_oa_config(dev_priv, stream->oa_config);
1374
1375        if (dev_priv->perf.oa.spurious_report_rs.missed) {
1376                DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
1377                         dev_priv->perf.oa.spurious_report_rs.missed);
1378        }
1379}
1380
1381static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
1382{
1383        u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
1384        unsigned long flags;
1385
1386        spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1387
1388        /* Pre-DevBDW: OABUFFER must be set with counters off,
1389         * before OASTATUS1, but after OASTATUS2
1390         */
1391        I915_WRITE(GEN7_OASTATUS2,
1392                   gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT); /* head */
1393        dev_priv->perf.oa.oa_buffer.head = gtt_offset;
1394
1395        I915_WRITE(GEN7_OABUFFER, gtt_offset);
1396
1397        I915_WRITE(GEN7_OASTATUS1, gtt_offset | OABUFFER_SIZE_16M); /* tail */
1398
1399        /* Mark that we need updated tail pointers to read from... */
1400        dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1401        dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1402
1403        spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1404
1405        /* On Haswell we have to track which OASTATUS1 flags we've
1406         * already seen since they can't be cleared while periodic
1407         * sampling is enabled.
1408         */
1409        dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
1410
1411        /* NB: although the OA buffer will initially be allocated
1412         * zeroed via shmfs (and so this memset is redundant when
1413         * first allocating), we may re-init the OA buffer, either
1414         * when re-enabling a stream or in error/reset paths.
1415         *
1416         * The reason we clear the buffer for each re-init is for the
1417         * sanity check in gen7_append_oa_reports() that looks at the
1418         * report-id field to make sure it's non-zero, which relies on
1419         * the assumption that new reports are being written to zeroed
1420         * memory...
1421         */
1422        memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
1423
1424        /* Maybe make ->pollin per-stream state if we support multiple
1425         * concurrent streams in the future.
1426         */
1427        dev_priv->perf.oa.pollin = false;
1428}
1429
1430static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
1431{
1432        u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
1433        unsigned long flags;
1434
1435        spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1436
1437        I915_WRITE(GEN8_OASTATUS, 0);
1438        I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
1439        dev_priv->perf.oa.oa_buffer.head = gtt_offset;
1440
1441        I915_WRITE(GEN8_OABUFFER_UDW, 0);
1442
1443        /*
1444         * PRM says:
1445         *
1446         *  "This MMIO must be set before the OATAILPTR
1447         *  register and after the OAHEADPTR register. This is
1448         *  to enable proper functionality of the overflow
1449         *  bit."
1450         */
1451        I915_WRITE(GEN8_OABUFFER, gtt_offset |
1452                   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
1453        I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
1454
1455        /* Mark that we need updated tail pointers to read from... */
1456        dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1457        dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1458
1459        /*
1460         * Reset state used to recognise context switches, affecting which
1461         * reports we will forward to userspace while filtering for a single
1462         * context.
1463         */
1464        dev_priv->perf.oa.oa_buffer.last_ctx_id = INVALID_CTX_ID;
1465
1466        spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
1467
1468        /*
1469         * NB: although the OA buffer will initially be allocated
1470         * zeroed via shmfs (and so this memset is redundant when
1471         * first allocating), we may re-init the OA buffer, either
1472         * when re-enabling a stream or in error/reset paths.
1473         *
1474         * The reason we clear the buffer for each re-init is for the
1475         * sanity check in gen8_append_oa_reports() that looks at the
1476         * reason field to make sure it's non-zero, which relies on
1477         * the assumption that new reports are being written to zeroed
1478         * memory...
1479         */
1480        memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
1481
1482        /*
1483         * Maybe make ->pollin per-stream state if we support multiple
1484         * concurrent streams in the future.
1485         */
1486        dev_priv->perf.oa.pollin = false;
1487}
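
/*
 * A sketch of how the pointers initialized above are consumed by the
 * gen7/gen8_append_oa_reports() readers: head/tail hold GGTT addresses,
 * so CPU-side offsets into the mapping are derived by subtracting
 * gtt_offset and masking with the power-of-two buffer size:
 *
 *     u32 head_offset = (head - gtt_offset) & (OA_BUFFER_SIZE - 1);
 *     u8 *report = dev_priv->perf.oa.oa_buffer.vaddr + head_offset;
 */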
1488
1489static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
1490{
1491        struct drm_i915_gem_object *bo;
1492        struct i915_vma *vma;
1493        int ret;
1494
1495        if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
1496                return -ENODEV;
1497
1498        ret = i915_mutex_lock_interruptible(&dev_priv->drm);
1499        if (ret)
1500                return ret;
1501
1502        BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
1503        BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
1504
1505        bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
1506        if (IS_ERR(bo)) {
1507                DRM_ERROR("Failed to allocate OA buffer\n");
1508                ret = PTR_ERR(bo);
1509                goto unlock;
1510        }
1511
1512        i915_gem_object_set_cache_coherency(bo, I915_CACHE_LLC);
1513
1514        /* PreHSW required 512K alignment, HSW requires 16M */
1515        vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
1516        if (IS_ERR(vma)) {
1517                ret = PTR_ERR(vma);
1518                goto err_unref;
1519        }
1520        dev_priv->perf.oa.oa_buffer.vma = vma;
1521
1522        dev_priv->perf.oa.oa_buffer.vaddr =
1523                i915_gem_object_pin_map(bo, I915_MAP_WB);
1524        if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
1525                ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
1526                goto err_unpin;
1527        }
1528
1529        DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
1530                         i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
1531                         dev_priv->perf.oa.oa_buffer.vaddr);
1532
1533        goto unlock;
1534
1535err_unpin:
1536        __i915_vma_unpin(vma);
1537
1538err_unref:
1539        i915_gem_object_put(bo);
1540
1541        dev_priv->perf.oa.oa_buffer.vaddr = NULL;
1542        dev_priv->perf.oa.oa_buffer.vma = NULL;
1543
1544unlock:
1545        mutex_unlock(&dev_priv->drm.struct_mutex);
1546        return ret;
1547}
1548
1549static void config_oa_regs(struct drm_i915_private *dev_priv,
1550                           const struct i915_oa_reg *regs,
1551                           u32 n_regs)
1552{
1553        u32 i;
1554
1555        for (i = 0; i < n_regs; i++) {
1556                const struct i915_oa_reg *reg = regs + i;
1557
1558                I915_WRITE(reg->addr, reg->value);
1559        }
1560}
1561
1562static int hsw_enable_metric_set(struct i915_perf_stream *stream)
1563{
1564        struct drm_i915_private *dev_priv = stream->dev_priv;
1565        const struct i915_oa_config *oa_config = stream->oa_config;
1566
1567        /* PRM:
1568         *
1569         * OA unit is using “crclk” for its functionality. When trunk
1570         * level clock gating takes place, OA clock would be gated,
1571         * unable to count the events from non-render clock domain.
1572         * Render clock gating must be disabled when OA is enabled to
1573         * count the events from non-render domain. Unit level clock
1574         * gating for RCS should also be disabled.
1575         */
1576        I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
1577                                    ~GEN7_DOP_CLOCK_GATE_ENABLE));
1578        I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
1579                                  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
1580
1581        config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
1582
1583        /* It apparently takes a fairly long time for a new MUX
1584         * configuration to be applied after these register writes.
1585         * This delay duration was derived empirically based on the
1586         * render_basic config but hopefully it covers the maximum
1587         * configuration latency.
1588         *
1589         * As a fallback, the checks in _append_oa_reports() that skip
1590         * invalid OA reports also seem to discard reports generated
1591         * before this config has completed - albeit not silently.
1592         *
1593         * Unfortunately this is essentially a magic number, since we
1594         * don't currently know of a reliable mechanism for predicting
1595         * how long the MUX config will take to apply and, besides
1596         * seeing invalid reports, we don't know of a reliable way to
1597         * explicitly check that the MUX config has landed.
1598         *
1599         * It's even possible we've mischaracterized the underlying
1600         * problem - it just seems like the simplest explanation for
1601         * why a delay at this location would mitigate any invalid
1602         * reports.
1603         */
1604        usleep_range(15000, 20000);
1605
1606        config_oa_regs(dev_priv, oa_config->b_counter_regs,
1607                       oa_config->b_counter_regs_len);
1608
1609        return 0;
1610}
1611
1612static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
1613{
1614        I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
1615                                  ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
1616        I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
1617                                    GEN7_DOP_CLOCK_GATE_ENABLE));
1618
1619        I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
1620                                      ~GT_NOA_ENABLE));
1621}
1622
1623/*
1624 * NB: It must always remain pointer safe to run this even if the OA unit
1625 * has been disabled.
1626 *
1627 * It's fine to put out-of-date values into these per-context registers
1628 * in the case that the OA unit has been disabled.
1629 */
1630static void
1631gen8_update_reg_state_unlocked(struct intel_context *ce,
1632                               u32 *reg_state,
1633                               const struct i915_oa_config *oa_config)
1634{
1635        struct drm_i915_private *i915 = ce->gem_context->i915;
1636        u32 ctx_oactxctrl = i915->perf.oa.ctx_oactxctrl_offset;
1637        u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset;
1638        /* The MMIO offsets for Flex EU registers aren't contiguous */
1639        i915_reg_t flex_regs[] = {
1640                EU_PERF_CNTL0,
1641                EU_PERF_CNTL1,
1642                EU_PERF_CNTL2,
1643                EU_PERF_CNTL3,
1644                EU_PERF_CNTL4,
1645                EU_PERF_CNTL5,
1646                EU_PERF_CNTL6,
1647        };
1648        int i;
1649
1650        CTX_REG(reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
1651                (i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
1652                (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
1653                GEN8_OA_COUNTER_RESUME);
1654
1655        for (i = 0; i < ARRAY_SIZE(flex_regs); i++) {
1656                u32 state_offset = ctx_flexeu0 + i * 2;
1657                u32 mmio = i915_mmio_reg_offset(flex_regs[i]);
1658
1659                /*
1660                 * This arbitrary default will select the 'EU FPU0 Pipeline
1661                 * Active' event. In the future it's anticipated that there
1662                 * will be an explicit 'No Event' we can select, but not yet...
1663                 */
1664                u32 value = 0;
1665
1666                if (oa_config) {
1667                        u32 j;
1668
1669                        for (j = 0; j < oa_config->flex_regs_len; j++) {
1670                                if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) {
1671                                        value = oa_config->flex_regs[j].value;
1672                                        break;
1673                                }
1674                        }
1675                }
1676
1677                CTX_REG(reg_state, state_offset, flex_regs[i], value);
1678        }
1679
1680        CTX_REG(reg_state,
1681                CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
1682                gen8_make_rpcs(i915, &ce->sseu));
1683}
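
/*
 * NB: CTX_REG writes an (offset, value) pair into the logical ring
 * context image; roughly (a sketch of the existing helper):
 *
 *     reg_state[pos + 0] = i915_mmio_reg_offset(reg);
 *     reg_state[pos + 1] = value;
 *
 * which is why the flex EU state offsets above advance in steps of two.
 */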
1684
1685/*
1686 * Manages updating the per-context aspects of the OA stream
1687 * configuration across all contexts.
1688 *
1689 * The awkward consideration here is that OACTXCONTROL controls the
1690 * exponent for periodic sampling which is primarily used for system
1691 * wide profiling where we'd like a consistent sampling period even in
1692 * the face of context switches.
1693 *
1694 * Our approach of updating the register state context (as opposed to
1695 * say using a workaround batch buffer) ensures that the hardware
1696 * won't automatically reload an out-of-date timer exponent even
1697 * transiently before a WA BB could be parsed.
1698 *
1699 * This function needs to:
1700 * - Ensure the currently running context's per-context OA state is
1701 *   updated
1702 * - Ensure that all existing contexts will have the correct per-context
1703 *   OA state if they are scheduled for use.
1704 * - Ensure any new contexts will be initialized with the correct
1705 *   per-context OA state.
1706 *
1707 * Note: it's only the RCS/Render context that has any OA state.
1708 */
1709static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
1710                                       const struct i915_oa_config *oa_config)
1711{
1712        struct intel_engine_cs *engine = dev_priv->engine[RCS0];
1713        unsigned int map_type = i915_coherent_map_type(dev_priv);
1714        struct i915_gem_context *ctx;
1715        struct i915_request *rq;
1716        int ret;
1717
1718        lockdep_assert_held(&dev_priv->drm.struct_mutex);
1719
1720        /*
1721         * The OA register config is set up through the context image. This image
1722         * might be written to by the GPU on context switch (in particular on
1723         * lite-restore). This means we can't safely update a context's image,
1724         * if this context is scheduled/submitted to run on the GPU.
1725         *
1726         * We could emit the OA register config through the batch buffer but
1727         * this might leave a small interval of time where the OA unit is
1728         * configured at an invalid sampling period.
1729         *
1730         * So far the best way to work around this issue seems to be draining
1731         * the GPU from any submitted work.
1732         */
1733        ret = i915_gem_wait_for_idle(dev_priv,
1734                                     I915_WAIT_LOCKED,
1735                                     MAX_SCHEDULE_TIMEOUT);
1736        if (ret)
1737                return ret;
1738
1739        /* Update all contexts now that we've stalled the submission. */
1740        list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
1741                struct intel_context *ce = intel_context_lookup(ctx, engine);
1742                u32 *regs;
1743
1744                /* OA settings will be set upon first use */
1745                if (!ce || !ce->state)
1746                        continue;
1747
1748                regs = i915_gem_object_pin_map(ce->state->obj, map_type);
1749                if (IS_ERR(regs))
1750                        return PTR_ERR(regs);
1751
1752                ce->state->obj->mm.dirty = true;
1753                regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
1754
1755                gen8_update_reg_state_unlocked(ce, regs, oa_config);
1756
1757                i915_gem_object_unpin_map(ce->state->obj);
1758        }
1759
1760        /*
1761         * Apply the configuration by doing one context restore of the edited
1762         * context image.
1763         */
1764        rq = i915_request_alloc(engine, dev_priv->kernel_context);
1765        if (IS_ERR(rq))
1766                return PTR_ERR(rq);
1767
1768        i915_request_add(rq);
1769
1770        return 0;
1771}
1772
1773static int gen8_enable_metric_set(struct i915_perf_stream *stream)
1774{
1775        struct drm_i915_private *dev_priv = stream->dev_priv;
1776        const struct i915_oa_config *oa_config = stream->oa_config;
1777        int ret;
1778
1779        /*
1780         * We disable slice/unslice clock ratio change reports on SKL since
1781         * they are too noisy. The HW generates a lot of redundant reports
1782         * where the ratio hasn't really changed, causing a lot of redundant
1783         * work for userspace and increasing the chances we'll hit buffer
1784         * overruns.
1785         *
1786         * Although we don't currently use the 'disable overrun' OABUFFER
1787         * feature, it's worth noting that clock ratio reports have to be
1788         * disabled before using that feature since the HW doesn't
1789         * correctly block these reports.
1790         *
1791         * Currently none of the high-level metrics we have depend on knowing
1792         * this ratio to normalize.
1793         *
1794         * Note: This register is not power context saved and restored, but
1795         * that's OK considering that we disable RC6 while the OA unit is
1796         * enabled.
1797         *
1798         * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
1799         * be read back from automatically triggered reports, as part of the
1800         * RPT_ID field.
1801         */
1802        if (IS_GEN_RANGE(dev_priv, 9, 11)) {
1803                I915_WRITE(GEN8_OA_DEBUG,
1804                           _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
1805                                              GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
1806        }
1807
1808        /*
1809         * Update all contexts prior to writing the mux configurations as we need
1810         * to make sure all slices/subslices are ON before writing to NOA
1811         * registers.
1812         */
1813        ret = gen8_configure_all_contexts(dev_priv, oa_config);
1814        if (ret)
1815                return ret;
1816
1817        config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
1818
1819        config_oa_regs(dev_priv, oa_config->b_counter_regs,
1820                       oa_config->b_counter_regs_len);
1821
1822        return 0;
1823}
1824
1825static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
1826{
1827        /* Reset all contexts' slices/subslices configurations. */
1828        gen8_configure_all_contexts(dev_priv, NULL);
1829
1830        I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
1831                                      ~GT_NOA_ENABLE));
1832}
1833
1834static void gen10_disable_metric_set(struct drm_i915_private *dev_priv)
1835{
1836        /* Reset all contexts' slices/subslices configurations. */
1837        gen8_configure_all_contexts(dev_priv, NULL);
1838
1839        /* Make sure we disable noa to save power. */
1840        I915_WRITE(RPM_CONFIG1,
1841                   I915_READ(RPM_CONFIG1) & ~GEN10_GT_NOA_ENABLE);
1842}
1843
1844static void gen7_oa_enable(struct i915_perf_stream *stream)
1845{
1846        struct drm_i915_private *dev_priv = stream->dev_priv;
1847        struct i915_gem_context *ctx = stream->ctx;
1848        u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
1849        bool periodic = dev_priv->perf.oa.periodic;
1850        u32 period_exponent = dev_priv->perf.oa.period_exponent;
1851        u32 report_format = dev_priv->perf.oa.oa_buffer.format;
1852
1853        /*
1854         * Reset buf pointers so we don't forward reports from before now.
1855         *
1856         * Think carefully before trying to avoid this, since it
1857         * also ensures status flags and the buffer itself are cleared
1858         * in error paths, and we have checks for invalid reports based
1859         * on the assumption that certain fields are written to zeroed
1860         * memory, which this helps maintain.
1861         */
1862        gen7_init_oa_buffer(dev_priv);
1863
1864        I915_WRITE(GEN7_OACONTROL,
1865                   (ctx_id & GEN7_OACONTROL_CTX_MASK) |
1866                   (period_exponent <<
1867                    GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
1868                   (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
1869                   (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
1870                   (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
1871                   GEN7_OACONTROL_ENABLE);
1872}
1873
1874static void gen8_oa_enable(struct i915_perf_stream *stream)
1875{
1876        struct drm_i915_private *dev_priv = stream->dev_priv;
1877        u32 report_format = dev_priv->perf.oa.oa_buffer.format;
1878
1879        /*
1880         * Reset buf pointers so we don't forward reports from before now.
1881         *
1882         * Think carefully before trying to avoid this, since it
1883         * also ensures status flags and the buffer itself are cleared
1884         * in error paths, and we have checks for invalid reports based
1885         * on the assumption that certain fields are written to zeroed
1886         * memory, which this helps maintain.
1887         */
1888        gen8_init_oa_buffer(dev_priv);
1889
1890        /*
1891         * Note: we don't rely on the hardware to perform single context
1892         * filtering and instead filter on the cpu based on the context-id
1893         * field of reports
1894         */
1895        I915_WRITE(GEN8_OACONTROL, (report_format <<
1896                                    GEN8_OA_REPORT_FORMAT_SHIFT) |
1897                                   GEN8_OA_COUNTER_ENABLE);
1898}
1899
1900/**
1901 * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
1902 * @stream: An i915 perf stream opened for OA metrics
1903 *
1904 * [Re]enables hardware periodic sampling according to the period configured
1905 * when opening the stream. This also starts a hrtimer that will periodically
1906 * check for data in the circular OA buffer for notifying userspace (e.g.
1907 * during a read() or poll()).
1908 */
1909static void i915_oa_stream_enable(struct i915_perf_stream *stream)
1910{
1911        struct drm_i915_private *dev_priv = stream->dev_priv;
1912
1913        dev_priv->perf.oa.ops.oa_enable(stream);
1914
1915        if (dev_priv->perf.oa.periodic)
1916                hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
1917                              ns_to_ktime(POLL_PERIOD),
1918                              HRTIMER_MODE_REL_PINNED);
1919}
1920
1921static void gen7_oa_disable(struct i915_perf_stream *stream)
1922{
1923        struct intel_uncore *uncore = &stream->dev_priv->uncore;
1924
1925        intel_uncore_write(uncore, GEN7_OACONTROL, 0);
1926        if (intel_wait_for_register(uncore,
1927                                    GEN7_OACONTROL, GEN7_OACONTROL_ENABLE, 0,
1928                                    50))
1929                DRM_ERROR("wait for OA to be disabled timed out\n");
1930}
1931
1932static void gen8_oa_disable(struct i915_perf_stream *stream)
1933{
1934        struct intel_uncore *uncore = &stream->dev_priv->uncore;
1935
1936        intel_uncore_write(uncore, GEN8_OACONTROL, 0);
1937        if (intel_wait_for_register(uncore,
1938                                    GEN8_OACONTROL, GEN8_OA_COUNTER_ENABLE, 0,
1939                                    50))
1940                DRM_ERROR("wait for OA to be disabled timed out\n");
1941}
1942
1943/**
1944 * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
1945 * @stream: An i915 perf stream opened for OA metrics
1946 *
1947 * Stops the OA unit from periodically writing counter reports into the
1948 * circular OA buffer. This also stops the hrtimer that periodically checks for
1949 * data in the circular OA buffer, for notifying userspace.
1950 */
1951static void i915_oa_stream_disable(struct i915_perf_stream *stream)
1952{
1953        struct drm_i915_private *dev_priv = stream->dev_priv;
1954
1955        dev_priv->perf.oa.ops.oa_disable(stream);
1956
1957        if (dev_priv->perf.oa.periodic)
1958                hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
1959}
1960
1961static const struct i915_perf_stream_ops i915_oa_stream_ops = {
1962        .destroy = i915_oa_stream_destroy,
1963        .enable = i915_oa_stream_enable,
1964        .disable = i915_oa_stream_disable,
1965        .wait_unlocked = i915_oa_wait_unlocked,
1966        .poll_wait = i915_oa_poll_wait,
1967        .read = i915_oa_read,
1968};
1969
1970/**
1971 * i915_oa_stream_init - validate combined props for OA stream and init
1972 * @stream: An i915 perf stream
1973 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
1974 * @props: The property state that configures stream (individually validated)
1975 *
1976 * While read_properties_unlocked() validates properties in isolation it
1977 * doesn't ensure that the combination necessarily makes sense.
1978 *
1979 * At this point it has been determined that userspace wants a stream of
1980 * OA metrics, but still we need to further validate the combined
1981 * properties are OK.
1982 *
1983 * If the configuration makes sense then we can allocate memory for
1984 * a circular OA buffer and apply the requested metric set configuration.
1985 *
1986 * Returns: zero on success or a negative error code.
1987 */
1988static int i915_oa_stream_init(struct i915_perf_stream *stream,
1989                               struct drm_i915_perf_open_param *param,
1990                               struct perf_open_properties *props)
1991{
1992        struct drm_i915_private *dev_priv = stream->dev_priv;
1993        int format_size;
1994        int ret;
1995
1996        /* If the sysfs metrics/ directory wasn't registered for some
1997         * reason then don't let userspace try their luck with config
1998         * IDs
1999         */
2000        if (!dev_priv->perf.metrics_kobj) {
2001                DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
2002                return -EINVAL;
2003        }
2004
2005        if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
2006                DRM_DEBUG("Only OA report sampling supported\n");
2007                return -EINVAL;
2008        }
2009
2010        if (!dev_priv->perf.oa.ops.enable_metric_set) {
2011                DRM_DEBUG("OA unit not supported\n");
2012                return -ENODEV;
2013        }
2014
2015        /* To avoid the complexity of having to accurately filter
2016         * counter reports and marshal to the appropriate client
2017         * we currently only allow exclusive access
2018         */
2019        if (dev_priv->perf.oa.exclusive_stream) {
2020                DRM_DEBUG("OA unit already in use\n");
2021                return -EBUSY;
2022        }
2023
2024        if (!props->oa_format) {
2025                DRM_DEBUG("OA report format not specified\n");
2026                return -EINVAL;
2027        }
2028
2029        /* We set up some ratelimit state to potentially throttle any _NOTES
2030         * about spurious, invalid OA reports which we don't forward to
2031         * userspace.
2032         *
2033         * The initialization is associated with opening the stream (not driver
2034         * init) considering we print a _NOTE about any throttling when closing
2035         * the stream instead of waiting until driver _fini which no one would
2036         * ever see.
2037         *
2038         * Using the same limiting factors as printk_ratelimit()
2039         */
2040        ratelimit_state_init(&dev_priv->perf.oa.spurious_report_rs,
2041                             5 * HZ, 10);
2042        /* Since we use a DRM_NOTE for spurious reports it would be
2043         * inconsistent to let __ratelimit() automatically print a warning for
2044         * throttling.
2045         */
2046        ratelimit_set_flags(&dev_priv->perf.oa.spurious_report_rs,
2047                            RATELIMIT_MSG_ON_RELEASE);
2048
2049        stream->sample_size = sizeof(struct drm_i915_perf_record_header);
2050
2051        format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
2052
2053        stream->sample_flags |= SAMPLE_OA_REPORT;
2054        stream->sample_size += format_size;
2055
2056        dev_priv->perf.oa.oa_buffer.format_size = format_size;
2057        if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
2058                return -EINVAL;
2059
2060        dev_priv->perf.oa.oa_buffer.format =
2061                dev_priv->perf.oa.oa_formats[props->oa_format].format;
2062
2063        dev_priv->perf.oa.periodic = props->oa_periodic;
2064        if (dev_priv->perf.oa.periodic)
2065                dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
2066
2067        if (stream->ctx) {
2068                ret = oa_get_render_ctx_id(stream);
2069                if (ret) {
2070                        DRM_DEBUG("Invalid context id to filter with\n");
2071                        return ret;
2072                }
2073        }
2074
2075        ret = get_oa_config(dev_priv, props->metrics_set, &stream->oa_config);
2076        if (ret) {
2077                DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
2078                goto err_config;
2079        }
2080
2081        /* PRM - observability performance counters:
2082         *
2083         *   OACONTROL, performance counter enable, note:
2084         *
2085         *   "When this bit is set, in order to have coherent counts,
2086         *   RC6 power state and trunk clock gating must be disabled.
2087         *   This can be achieved by programming MMIO registers as
2088         *   0xA094=0 and 0xA090[31]=1"
2089         *
2090         *   In our case we are expecting that taking pm + FORCEWAKE
2091         *   references will effectively disable RC6.
2092         */
2093        stream->wakeref = intel_runtime_pm_get(dev_priv);
2094        intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
2095
2096        ret = alloc_oa_buffer(dev_priv);
2097        if (ret)
2098                goto err_oa_buf_alloc;
2099
2100        ret = i915_mutex_lock_interruptible(&dev_priv->drm);
2101        if (ret)
2102                goto err_lock;
2103
2104        stream->ops = &i915_oa_stream_ops;
2105        dev_priv->perf.oa.exclusive_stream = stream;
2106
2107        ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
2108        if (ret) {
2109                DRM_DEBUG("Unable to enable metric set\n");
2110                goto err_enable;
2111        }
2112
2113        mutex_unlock(&dev_priv->drm.struct_mutex);
2114
2115        return 0;
2116
2117err_enable:
2118        dev_priv->perf.oa.exclusive_stream = NULL;
2119        dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
2120        mutex_unlock(&dev_priv->drm.struct_mutex);
2121
2122err_lock:
2123        free_oa_buffer(dev_priv);
2124
2125err_oa_buf_alloc:
2126        put_oa_config(dev_priv, stream->oa_config);
2127
2128        intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
2129        intel_runtime_pm_put(dev_priv, stream->wakeref);
2130
2131err_config:
2132        if (stream->ctx)
2133                oa_put_render_ctx_id(stream);
2134
2135        return ret;
2136}
2137
2138void i915_oa_init_reg_state(struct intel_engine_cs *engine,
2139                            struct intel_context *ce,
2140                            u32 *regs)
2141{
2142        struct i915_perf_stream *stream;
2143
2144        if (engine->class != RENDER_CLASS)
2145                return;
2146
2147        stream = engine->i915->perf.oa.exclusive_stream;
2148        if (stream)
2149                gen8_update_reg_state_unlocked(ce, regs, stream->oa_config);
2150}
2151
2152/**
2153 * i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
2154 * @stream: An i915 perf stream
2155 * @file: An i915 perf stream file
2156 * @buf: destination buffer given by userspace
2157 * @count: the number of bytes userspace wants to read
2158 * @ppos: (inout) file seek position (unused)
2159 *
2160 * Besides wrapping &i915_perf_stream_ops->read this provides a common place to
2161 * ensure that if we've successfully copied any data then reporting that takes
2162 * precedence over any internal error status, so the data isn't lost.
2163 *
2164 * For example ret will be -ENOSPC whenever there is more buffered data than
2165 * can be copied to userspace, but that's only interesting if we weren't able
2166 * to copy some data because it implies the userspace buffer is too small to
2167 * receive a single record (and we never split records).
2168 *
2169 * Another case with ret == -EFAULT is more of a grey area since it would seem
2170 * like bad form for userspace to ask us to overrun its buffer, but the user
2171 * knows best:
2172 *
2173 *   http://yarchive.net/comp/linux/partial_reads_writes.html
2174 *
2175 * Returns: The number of bytes copied or a negative error code on failure.
2176 */
2177static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
2178                                     struct file *file,
2179                                     char __user *buf,
2180                                     size_t count,
2181                                     loff_t *ppos)
2182{
2183        /* Note we keep the offset (aka bytes read) separate from any
2184         * error status so that the final check for whether we return
2185         * the bytes read with a higher precedence than any error (see
2186         * comment below) doesn't need to be handled/duplicated in
2187         * stream->ops->read() implementations.
2188         */
2189        size_t offset = 0;
2190        int ret = stream->ops->read(stream, buf, count, &offset);
2191
2192        return offset ?: (ret ?: -EAGAIN);
2193}
2194
2195/**
2196 * i915_perf_read - handles read() FOP for i915 perf stream FDs
2197 * @file: An i915 perf stream file
2198 * @buf: destination buffer given by userspace
2199 * @count: the number of bytes userspace wants to read
2200 * @ppos: (inout) file seek position (unused)
2201 *
2202 * The entry point for handling a read() on a stream file descriptor from
2203 * userspace. Most of the work is left to i915_perf_read_locked() and
2204 * &i915_perf_stream_ops->read, but to save stream implementations (of which
2205 * we might have multiple later) from duplicating it, we handle blocking reads here.
2206 *
2207 * We can also consistently treat trying to read from a disabled stream
2208 * as an IO error so implementations can assume the stream is enabled
2209 * while reading.
2210 *
2211 * Returns: The number of bytes copied or a negative error code on failure.
2212 */
2213static ssize_t i915_perf_read(struct file *file,
2214                              char __user *buf,
2215                              size_t count,
2216                              loff_t *ppos)
2217{
2218        struct i915_perf_stream *stream = file->private_data;
2219        struct drm_i915_private *dev_priv = stream->dev_priv;
2220        ssize_t ret;
2221
2222        /* To ensure it's handled consistently we simply treat all reads of a
2223         * disabled stream as an error. In particular it might otherwise lead
2224         * to a deadlock for blocking file descriptors...
2225         */
2226        if (!stream->enabled)
2227                return -EIO;
2228
2229        if (!(file->f_flags & O_NONBLOCK)) {
2230                /* There's the small chance of false positives from
2231                 * stream->ops->wait_unlocked.
2232                 *
2233                 * E.g. with single context filtering, since we only wait until
2234                 * the oabuffer has >= 1 report, we don't immediately know whether
2235                 * any reports really belong to the current context.
2236                 */
2237                do {
2238                        ret = stream->ops->wait_unlocked(stream);
2239                        if (ret)
2240                                return ret;
2241
2242                        mutex_lock(&dev_priv->perf.lock);
2243                        ret = i915_perf_read_locked(stream, file,
2244                                                    buf, count, ppos);
2245                        mutex_unlock(&dev_priv->perf.lock);
2246                } while (ret == -EAGAIN);
2247        } else {
2248                mutex_lock(&dev_priv->perf.lock);
2249                ret = i915_perf_read_locked(stream, file, buf, count, ppos);
2250                mutex_unlock(&dev_priv->perf.lock);
2251        }
2252
2253        /* We allow the poll checking to sometimes report false positive EPOLLIN
2254         * events where we might actually report EAGAIN on read() if there's
2255         * not really any data available. In this situation though we don't
2256         * want to enter a busy loop between poll() reporting a EPOLLIN event
2257         * and read() returning -EAGAIN. Clearing the oa.pollin state here
2258         * effectively ensures we back off until the next hrtimer callback
2259         * before reporting another EPOLLIN event.
2260         */
2261        if (ret >= 0 || ret == -EAGAIN) {
2262                /* Maybe make ->pollin per-stream state if we support multiple
2263                 * concurrent streams in the future.
2264                 */
2265                dev_priv->perf.oa.pollin = false;
2266        }
2267
2268        return ret;
2269}
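
/*
 * Illustration only (a hypothetical userspace snippet, not part of this
 * file): given the retry loop above, -EAGAIN never reaches a blocking
 * reader, so a plain read() simply sleeps until a sample is available:
 *
 *     ssize_t n = read(stream_fd, buf, sizeof(buf));
 *     if (n < 0 && errno == EINTR)
 *             ; // interrupted by a signal; up to the caller to retry
 */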
2270
2271static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
2272{
2273        struct drm_i915_private *dev_priv =
2274                container_of(hrtimer, typeof(*dev_priv),
2275                             perf.oa.poll_check_timer);
2276
2277        if (oa_buffer_check_unlocked(dev_priv)) {
2278                dev_priv->perf.oa.pollin = true;
2279                wake_up(&dev_priv->perf.oa.poll_wq);
2280        }
2281
2282        hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
2283
2284        return HRTIMER_RESTART;
2285}
2286
2287/**
2288 * i915_perf_poll_locked - poll_wait() with a suitable wait queue for stream
2289 * @dev_priv: i915 device instance
2290 * @stream: An i915 perf stream
2291 * @file: An i915 perf stream file
2292 * @wait: poll() state table
2293 *
2294 * For handling userspace polling on an i915 perf stream, this calls through to
2295 * &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that
2296 * will be woken for new stream data.
2297 *
2298 * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
2299 * with any non-file-operation driver hooks.
2300 *
2301 * Returns: any poll events that are ready without sleeping
2302 */
2303static __poll_t i915_perf_poll_locked(struct drm_i915_private *dev_priv,
2304                                          struct i915_perf_stream *stream,
2305                                          struct file *file,
2306                                          poll_table *wait)
2307{
2308        __poll_t events = 0;
2309
2310        stream->ops->poll_wait(stream, file, wait);
2311
2312        /* Note: we don't explicitly check whether there's something to read
2313         * here since this path may be very hot depending on what else
2314         * userspace is polling, or on the timeout in use. We rely solely on
2315         * the hrtimer/oa_poll_check_timer_cb to notify us when there are
2316         * samples to read.
2317         */
2318        if (dev_priv->perf.oa.pollin)
2319                events |= EPOLLIN;
2320
2321        return events;
2322}
2323
2324/**
2325 * i915_perf_poll - call poll_wait() with a suitable wait queue for stream
2326 * @file: An i915 perf stream file
2327 * @wait: poll() state table
2328 *
2329 * For handling userspace polling on an i915 perf stream, this ensures
2330 * poll_wait() gets called with a wait queue that will be woken for new stream
2331 * data.
2332 *
2333 * Note: Implementation deferred to i915_perf_poll_locked()
2334 *
2335 * Returns: any poll events that are ready without sleeping
2336 */
2337static __poll_t i915_perf_poll(struct file *file, poll_table *wait)
2338{
2339        struct i915_perf_stream *stream = file->private_data;
2340        struct drm_i915_private *dev_priv = stream->dev_priv;
2341        __poll_t ret;
2342
2343        mutex_lock(&dev_priv->perf.lock);
2344        ret = i915_perf_poll_locked(dev_priv, stream, file, wait);
2345        mutex_unlock(&dev_priv->perf.lock);
2346
2347        return ret;
2348}
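
/*
 * Illustration only (a hypothetical userspace snippet): waiting for
 * samples with poll() before issuing a read():
 *
 *     struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };
 *
 *     if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *             read(stream_fd, buf, sizeof(buf));
 */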
2349
2350/**
2351 * i915_perf_enable_locked - handle `I915_PERF_IOCTL_ENABLE` ioctl
2352 * @stream: A disabled i915 perf stream
2353 *
2354 * [Re]enables the associated capture of data for this stream.
2355 *
2356 * If a stream was previously enabled then there's currently no intention
2357 * to provide userspace any guarantee about the preservation of previously
2358 * buffered data.
2359 */
2360static void i915_perf_enable_locked(struct i915_perf_stream *stream)
2361{
2362        if (stream->enabled)
2363                return;
2364
2365        /* Allow stream->ops->enable() to refer to this */
2366        stream->enabled = true;
2367
2368        if (stream->ops->enable)
2369                stream->ops->enable(stream);
2370}
2371
2372/**
2373 * i915_perf_disable_locked - handle `I915_PERF_IOCTL_DISABLE` ioctl
2374 * @stream: An enabled i915 perf stream
2375 *
2376 * Disables the associated capture of data for this stream.
2377 *
2378 * The intention is that disabling and re-enabling a stream will ideally be
2379 * cheaper than destroying and re-opening a stream with the same configuration,
2380 * though there are no formal guarantees about what state or buffered data
2381 * must be retained between disabling and re-enabling a stream.
2382 *
2383 * Note: while a stream is disabled it's considered an error for userspace
2384 * to attempt to read from the stream (-EIO).
2385 */
2386static void i915_perf_disable_locked(struct i915_perf_stream *stream)
2387{
2388        if (!stream->enabled)
2389                return;
2390
2391        /* Allow stream->ops->disable() to refer to this */
2392        stream->enabled = false;
2393
2394        if (stream->ops->disable)
2395                stream->ops->disable(stream);
2396}
2397
2398/**
2399 * i915_perf_ioctl_locked - support ioctl() usage with i915 perf stream FDs
2400 * @stream: An i915 perf stream
2401 * @cmd: the ioctl request
2402 * @arg: the ioctl data
2403 *
2404 * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
2405 * with any non-file-operation driver hooks.
2406 *
2407 * Returns: zero on success or a negative error code. Returns -EINVAL for
2408 * an unknown ioctl request.
2409 */
2410static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
2411                                   unsigned int cmd,
2412                                   unsigned long arg)
2413{
2414        switch (cmd) {
2415        case I915_PERF_IOCTL_ENABLE:
2416                i915_perf_enable_locked(stream);
2417                return 0;
2418        case I915_PERF_IOCTL_DISABLE:
2419                i915_perf_disable_locked(stream);
2420                return 0;
2421        }
2422
2423        return -EINVAL;
2424}
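
/*
 * Illustration only (hypothetical userspace usage): both requests take
 * no argument, so from userspace this is simply:
 *
 *     ioctl(stream_fd, I915_PERF_IOCTL_ENABLE, 0);
 *     ...
 *     ioctl(stream_fd, I915_PERF_IOCTL_DISABLE, 0);
 */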
2425
2426/**
2427 * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
2428 * @file: An i915 perf stream file
2429 * @cmd: the ioctl request
2430 * @arg: the ioctl data
2431 *
2432 * Implementation deferred to i915_perf_ioctl_locked().
2433 *
2434 * Returns: zero on success or a negative error code. Returns -EINVAL for
2435 * an unknown ioctl request.
2436 */
2437static long i915_perf_ioctl(struct file *file,
2438                            unsigned int cmd,
2439                            unsigned long arg)
2440{
2441        struct i915_perf_stream *stream = file->private_data;
2442        struct drm_i915_private *dev_priv = stream->dev_priv;
2443        long ret;
2444
2445        mutex_lock(&dev_priv->perf.lock);
2446        ret = i915_perf_ioctl_locked(stream, cmd, arg);
2447        mutex_unlock(&dev_priv->perf.lock);
2448
2449        return ret;
2450}
2451
2452/**
2453 * i915_perf_destroy_locked - destroy an i915 perf stream
2454 * @stream: An i915 perf stream
2455 *
2456 * Frees all resources associated with the given i915 perf @stream, disabling
2457 * any associated data capture in the process.
2458 *
2459 * Note: The &drm_i915_private->perf.lock mutex has been taken to serialize
2460 * with any non-file-operation driver hooks.
2461 */
2462static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
2463{
2464        if (stream->enabled)
2465                i915_perf_disable_locked(stream);
2466
2467        if (stream->ops->destroy)
2468                stream->ops->destroy(stream);
2469
2470        list_del(&stream->link);
2471
2472        if (stream->ctx)
2473                i915_gem_context_put(stream->ctx);
2474
2475        kfree(stream);
2476}
2477
2478/**
2479 * i915_perf_release - handles userspace close() of a stream file
2480 * @inode: anonymous inode associated with file
2481 * @file: An i915 perf stream file
2482 *
2483 * Cleans up any resources associated with an open i915 perf stream file.
2484 *
2485 * NB: close() can't really fail from the userspace point of view.
2486 *
2487 * Returns: zero on success or a negative error code.
2488 */
2489static int i915_perf_release(struct inode *inode, struct file *file)
2490{
2491        struct i915_perf_stream *stream = file->private_data;
2492        struct drm_i915_private *dev_priv = stream->dev_priv;
2493
2494        mutex_lock(&dev_priv->perf.lock);
2495        i915_perf_destroy_locked(stream);
2496        mutex_unlock(&dev_priv->perf.lock);
2497
2498        return 0;
2499}
2500
2501
2502static const struct file_operations fops = {
2503        .owner          = THIS_MODULE,
2504        .llseek         = no_llseek,
2505        .release        = i915_perf_release,
2506        .poll           = i915_perf_poll,
2507        .read           = i915_perf_read,
2508        .unlocked_ioctl = i915_perf_ioctl,
2509        /* Our ioctls have no arguments, so it's safe to use the same function
2510         * to handle 32-bit compatibility.
2511         */
2512        .compat_ioctl   = i915_perf_ioctl,
2513};
2514
2515
2516/**
2517 * i915_perf_open_ioctl_locked - DRM ioctl() for userspace to open a stream FD
2518 * @dev_priv: i915 device instance
2519 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
2520 * @props: individually validated u64 property value pairs
2521 * @file: drm file
2522 *
2523 * See i915_perf_ioctl_open() for interface details.
2524 *
2525 * Implements further stream config validation and stream initialization on
2526 * behalf of i915_perf_open_ioctl() with the &drm_i915_private->perf.lock mutex
2527 * taken to serialize with any non-file-operation driver hooks.
2528 *
2529 * Note: at this point the @props have only been validated in isolation and
2530 * it's still necessary to validate that the combination of properties makes
2531 * sense.
2532 *
2533 * In the case where userspace is interested in OA unit metrics then further
2534 * config validation and stream initialization details will be handled by
2535 * i915_oa_stream_init(). The code here should only validate config state that
2536 * will be relevant to all stream types / backends.
2537 *
2538 * Returns: zero on success or a negative error code.
2539 */
2540static int
2541i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
2542                            struct drm_i915_perf_open_param *param,
2543                            struct perf_open_properties *props,
2544                            struct drm_file *file)
2545{
2546        struct i915_gem_context *specific_ctx = NULL;
2547        struct i915_perf_stream *stream = NULL;
2548        unsigned long f_flags = 0;
2549        bool privileged_op = true;
2550        int stream_fd;
2551        int ret;
2552
2553        if (props->single_context) {
2554                u32 ctx_handle = props->ctx_handle;
2555                struct drm_i915_file_private *file_priv = file->driver_priv;
2556
2557                specific_ctx = i915_gem_context_lookup(file_priv, ctx_handle);
2558                if (!specific_ctx) {
2559                        DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
2560                                  ctx_handle);
2561                        ret = -ENOENT;
2562                        goto err;
2563                }
2564        }
2565
2566        /*
2567         * On Haswell the OA unit supports clock gating off for a specific
2568         * context and in this mode there's no visibility of metrics for the
2569         * rest of the system, which we consider acceptable for a
2570         * non-privileged client.
2571         *
2572         * For Gen8+ the OA unit no longer supports clock gating off for a
2573         * specific context and the kernel can't securely stop the counters
2574         * from updating as system-wide / global values. Even though we can
2575         * filter reports based on the included context ID we can't block
2576         * clients from seeing the raw / global counter values via
2577         * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
2578         * enable the OA unit by default.
2579         */
2580        if (IS_HASWELL(dev_priv) && specific_ctx)
2581                privileged_op = false;
2582
2583        /* Similar to perf's kernel.perf_paranoid_cpu sysctl option
2584         * we check a dev.i915.perf_stream_paranoid sysctl option
2585         * to determine if it's ok to access system wide OA counters
2586         * without CAP_SYS_ADMIN privileges.
2587         */
2588        if (privileged_op &&
2589            i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
2590                DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
2591                ret = -EACCES;
2592                goto err_ctx;
2593        }
2594
2595        stream = kzalloc(sizeof(*stream), GFP_KERNEL);
2596        if (!stream) {
2597                ret = -ENOMEM;
2598                goto err_ctx;
2599        }
2600
2601        stream->dev_priv = dev_priv;
2602        stream->ctx = specific_ctx;
2603
2604        ret = i915_oa_stream_init(stream, param, props);
2605        if (ret)
2606                goto err_alloc;
2607
2608        /* We avoid simply assigning stream->sample_flags = props->sample_flags
2609         * so that _stream_init can check the combination of sample flags more
2610         * thoroughly; still, this is the expected result at this point.
2611         */
2612        if (WARN_ON(stream->sample_flags != props->sample_flags)) {
2613                ret = -ENODEV;
2614                goto err_flags;
2615        }
2616
2617        list_add(&stream->link, &dev_priv->perf.streams);
2618
2619        if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
2620                f_flags |= O_CLOEXEC;
2621        if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
2622                f_flags |= O_NONBLOCK;
2623
2624        stream_fd = anon_inode_getfd("[i915_perf]", &fops, stream, f_flags);
2625        if (stream_fd < 0) {
2626                ret = stream_fd;
2627                goto err_open;
2628        }
2629
2630        if (!(param->flags & I915_PERF_FLAG_DISABLED))
2631                i915_perf_enable_locked(stream);
2632
2633        return stream_fd;
2634
2635err_open:
2636        list_del(&stream->link);
2637err_flags:
2638        if (stream->ops->destroy)
2639                stream->ops->destroy(stream);
2640err_alloc:
2641        kfree(stream);
2642err_ctx:
2643        if (specific_ctx)
2644                i915_gem_context_put(specific_ctx);
2645err:
2646        return ret;
2647}
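
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * driver): the fd returned above is consumed with a plain read() loop over
 * records framed by struct drm_i915_perf_record_header. Each record's size
 * field covers the header itself, so the loop just advances by it; the
 * 64KiB buffer size is an arbitrary assumption.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <drm/i915_drm.h>
 *
 *	static void read_oa_samples(int stream_fd)
 *	{
 *		uint8_t buf[64 * 1024];
 *		ssize_t len = read(stream_fd, buf, sizeof(buf));
 *		ssize_t offset = 0;
 *
 *		if (len <= 0)
 *			return;
 *
 *		while (offset < len) {
 *			const struct drm_i915_perf_record_header *header =
 *				(const void *)(buf + offset);
 *
 *			if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
 *				printf("sample of %u bytes\n", header->size);
 *
 *			offset += header->size;
 *		}
 *	}
 */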
2648
2649static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
2650{
2651        return div64_u64(1000000000ULL * (2ULL << exponent),
2652                         1000ULL * RUNTIME_INFO(dev_priv)->cs_timestamp_frequency_khz);
2653}
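
/*
 * Worked example (editorial note): the period programmed by an exponent is
 * 2^(exponent + 1) timestamp ticks. On Haswell the command streamer
 * timestamp runs at 12.5MHz (cs_timestamp_frequency_khz == 12500, i.e.
 * 80ns per tick), so:
 *
 *	exponent 0  ->  2 ticks     ->  160ns (the HSW minimum noted below)
 *	exponent 16 ->  2^17 ticks  ->  ~10.5ms
 *	exponent 31 ->  2^32 ticks  ->  ~5.7 minutes
 */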
2654
2655/**
2656 * read_properties_unlocked - validate + copy userspace stream open properties
2657 * @dev_priv: i915 device instance
2658 * @uprops: The array of u64 key value pairs given by userspace
2659 * @n_props: The number of key value pairs expected in @uprops
2660 * @props: The stream configuration built up while validating properties
2661 *
2662 * Note this function only validates properties in isolation; it doesn't
2663 * validate that the combination of properties makes sense or that all
2664 * properties necessary for a particular kind of stream have been set.
2665 *
2666 * Note that there currently aren't any ordering requirements for properties so
2667 * we shouldn't validate or assume anything about ordering here. This doesn't
2668 * rule out defining new properties with ordering requirements in the future.
2669 */
2670static int read_properties_unlocked(struct drm_i915_private *dev_priv,
2671                                    u64 __user *uprops,
2672                                    u32 n_props,
2673                                    struct perf_open_properties *props)
2674{
2675        u64 __user *uprop = uprops;
2676        u32 i;
2677
2678        memset(props, 0, sizeof(struct perf_open_properties));
2679
2680        if (!n_props) {
2681                DRM_DEBUG("No i915 perf properties given\n");
2682                return -EINVAL;
2683        }
2684
2685        /* Considering that ID = 0 is reserved and assuming that we don't
2686         * (currently) expect any configurations to ever specify duplicate
2687         * values for a particular property ID then the last _PROP_MAX value is
2688         * one greater than the maximum number of properties we expect to get
2689         * from userspace.
2690         */
2691        if (n_props >= DRM_I915_PERF_PROP_MAX) {
2692                DRM_DEBUG("More i915 perf properties specified than exist\n");
2693                return -EINVAL;
2694        }
2695
2696        for (i = 0; i < n_props; i++) {
2697                u64 oa_period, oa_freq_hz;
2698                u64 id, value;
2699                int ret;
2700
2701                ret = get_user(id, uprop);
2702                if (ret)
2703                        return ret;
2704
2705                ret = get_user(value, uprop + 1);
2706                if (ret)
2707                        return ret;
2708
2709                if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
2710                        DRM_DEBUG("Unknown i915 perf property ID\n");
2711                        return -EINVAL;
2712                }
2713
2714                switch ((enum drm_i915_perf_property_id)id) {
2715                case DRM_I915_PERF_PROP_CTX_HANDLE:
2716                        props->single_context = 1;
2717                        props->ctx_handle = value;
2718                        break;
2719                case DRM_I915_PERF_PROP_SAMPLE_OA:
2720                        if (value)
2721                                props->sample_flags |= SAMPLE_OA_REPORT;
2722                        break;
2723                case DRM_I915_PERF_PROP_OA_METRICS_SET:
2724                        if (value == 0) {
2725                                DRM_DEBUG("Unknown OA metric set ID\n");
2726                                return -EINVAL;
2727                        }
2728                        props->metrics_set = value;
2729                        break;
2730                case DRM_I915_PERF_PROP_OA_FORMAT:
2731                        if (value == 0 || value >= I915_OA_FORMAT_MAX) {
2732                                DRM_DEBUG("Out-of-range OA report format %llu\n",
2733                                          value);
2734                                return -EINVAL;
2735                        }
2736                        if (!dev_priv->perf.oa.oa_formats[value].size) {
2737                                DRM_DEBUG("Unsupported OA report format %llu\n",
2738                                          value);
2739                                return -EINVAL;
2740                        }
2741                        props->oa_format = value;
2742                        break;
2743                case DRM_I915_PERF_PROP_OA_EXPONENT:
2744                        if (value > OA_EXPONENT_MAX) {
2745                                DRM_DEBUG("OA timer exponent too high (> %u)\n",
2746                                         OA_EXPONENT_MAX);
2747                                return -EINVAL;
2748                        }
2749
2750                        /* Theoretically we can program the OA unit to sample
2751                         * e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns
2752                         * for BXT. We don't allow such high sampling
2753                         * frequencies by default unless root.
2754                         */
2755
2756                        BUILD_BUG_ON(sizeof(oa_period) != 8);
2757                        oa_period = oa_exponent_to_ns(dev_priv, value);
2758
2759                        /* This check is primarily to ensure that oa_period <=
2760                         * UINT32_MAX (before passing to do_div which only
2761                         * accepts a u32 denominator), but we can also skip
2762                         * checking anything < 1Hz which implicitly can't be
2763                         * limited via an integer oa_max_sample_rate.
2764                         */
2765                        if (oa_period <= NSEC_PER_SEC) {
2766                                u64 tmp = NSEC_PER_SEC;
2767                                do_div(tmp, oa_period);
2768                                oa_freq_hz = tmp;
2769                        } else
2770                                oa_freq_hz = 0;
2771
2772                        if (oa_freq_hz > i915_oa_max_sample_rate &&
2773                            !capable(CAP_SYS_ADMIN)) {
2774                                DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
2775                                          i915_oa_max_sample_rate);
2776                                return -EACCES;
2777                        }
2778
2779                        props->oa_periodic = true;
2780                        props->oa_period_exponent = value;
2781                        break;
2782                case DRM_I915_PERF_PROP_MAX:
2783                        MISSING_CASE(id);
2784                        return -EINVAL;
2785                }
2786
2787                uprop += 2;
2788        }
2789
2790        return 0;
2791}
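
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * driver): the @uprops blob parsed above is just a flat u64 array of
 * (key, value) pairs. The metrics set ID of 1 is an assumption; real IDs
 * come from sysfs or from DRM_IOCTL_I915_PERF_ADD_CONFIG.
 *
 *	#include <stdint.h>
 *	#include <drm/i915_drm.h>
 *
 *	uint64_t properties[] = {
 *		DRM_I915_PERF_PROP_SAMPLE_OA, 1,
 *		DRM_I915_PERF_PROP_OA_METRICS_SET, 1,
 *		DRM_I915_PERF_PROP_OA_FORMAT,
 *			I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 *		DRM_I915_PERF_PROP_OA_EXPONENT, 16,
 *	};
 */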
2792
2793/**
2794 * i915_perf_open_ioctl - DRM ioctl() for userspace to open a stream FD
2795 * @dev: drm device
2796 * @data: ioctl data copied from userspace (unvalidated)
2797 * @file: drm file
2798 *
2799 * Validates the stream open parameters given by userspace including flags
2800 * and an array of u64 key, value pair properties.
2801 *
2802 * Very little is assumed up front about the nature of the stream being
2803 * opened (for instance we don't assume it's for periodic OA unit metrics). An
2804 * i915-perf stream is expected to be a suitable interface for other forms of
2805 * buffered data written by the GPU besides periodic OA metrics.
2806 *
2807 * Note we copy the properties from userspace outside of the i915 perf
2808 * mutex to avoid an awkward lockdep with mmap_sem.
2809 *
2810 * Most of the implementation details are handled by
2811 * i915_perf_open_ioctl_locked() after taking the &drm_i915_private->perf.lock
2812 * mutex for serializing with any non-file-operation driver hooks.
2813 *
2814 * Return: A newly opened i915 Perf stream file descriptor or negative
2815 * error code on failure.
2816 */
2817int i915_perf_open_ioctl(struct drm_device *dev, void *data,
2818                         struct drm_file *file)
2819{
2820        struct drm_i915_private *dev_priv = dev->dev_private;
2821        struct drm_i915_perf_open_param *param = data;
2822        struct perf_open_properties props;
2823        u32 known_open_flags;
2824        int ret;
2825
2826        if (!dev_priv->perf.initialized) {
2827                DRM_DEBUG("i915 perf interface not available for this system\n");
2828                return -ENOTSUPP;
2829        }
2830
2831        known_open_flags = I915_PERF_FLAG_FD_CLOEXEC |
2832                           I915_PERF_FLAG_FD_NONBLOCK |
2833                           I915_PERF_FLAG_DISABLED;
2834        if (param->flags & ~known_open_flags) {
2835                DRM_DEBUG("Unknown drm_i915_perf_open_param flag\n");
2836                return -EINVAL;
2837        }
2838
2839        ret = read_properties_unlocked(dev_priv,
2840                                       u64_to_user_ptr(param->properties_ptr),
2841                                       param->num_properties,
2842                                       &props);
2843        if (ret)
2844                return ret;
2845
2846        mutex_lock(&dev_priv->perf.lock);
2847        ret = i915_perf_open_ioctl_locked(dev_priv, param, &props, file);
2848        mutex_unlock(&dev_priv->perf.lock);
2849
2850        return ret;
2851}
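
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * driver): opening a stream with the properties[] array from the sketch
 * after read_properties_unlocked(), assuming drm_fd is an open DRM device
 * fd. Each property is a 16-byte (key, value) pair, hence the division.
 *
 *	#include <stdint.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	struct drm_i915_perf_open_param param = {
 *		.flags = I915_PERF_FLAG_FD_CLOEXEC |
 *			 I915_PERF_FLAG_FD_NONBLOCK,
 *		.num_properties = sizeof(properties) / 16,
 *		.properties_ptr = (uintptr_t)properties,
 *	};
 *	int stream_fd = ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
 */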
2852
2853/**
2854 * i915_perf_register - exposes i915-perf to userspace
2855 * @dev_priv: i915 device instance
2856 *
2857 * In particular OA metric sets are advertised under a sysfs metrics/
2858 * directory allowing userspace to enumerate valid IDs that can be
2859 * used to open an i915-perf stream.
2860 */
2861void i915_perf_register(struct drm_i915_private *dev_priv)
2862{
2863        int ret;
2864
2865        if (!dev_priv->perf.initialized)
2866                return;
2867
2868        /* To be sure we're synchronized with an attempted
2869         * i915_perf_open_ioctl(), since we register only after the
2870         * driver has already been exposed to userspace.
2871         */
2872        mutex_lock(&dev_priv->perf.lock);
2873
2874        dev_priv->perf.metrics_kobj =
2875                kobject_create_and_add("metrics",
2876                                       &dev_priv->drm.primary->kdev->kobj);
2877        if (!dev_priv->perf.metrics_kobj)
2878                goto exit;
2879
2880        sysfs_attr_init(&dev_priv->perf.oa.test_config.sysfs_metric_id.attr);
2881
2882        if (INTEL_GEN(dev_priv) >= 11) {
2883                i915_perf_load_test_config_icl(dev_priv);
2884        } else if (IS_CANNONLAKE(dev_priv)) {
2885                i915_perf_load_test_config_cnl(dev_priv);
2886        } else if (IS_COFFEELAKE(dev_priv)) {
2887                if (IS_CFL_GT2(dev_priv))
2888                        i915_perf_load_test_config_cflgt2(dev_priv);
2889                else if (IS_CFL_GT3(dev_priv))
2890                        i915_perf_load_test_config_cflgt3(dev_priv);
2891        } else if (IS_GEMINILAKE(dev_priv)) {
2892                i915_perf_load_test_config_glk(dev_priv);
2893        } else if (IS_KABYLAKE(dev_priv)) {
2894                if (IS_KBL_GT2(dev_priv))
2895                        i915_perf_load_test_config_kblgt2(dev_priv);
2896                else if (IS_KBL_GT3(dev_priv))
2897                        i915_perf_load_test_config_kblgt3(dev_priv);
2898        } else if (IS_BROXTON(dev_priv)) {
2899                i915_perf_load_test_config_bxt(dev_priv);
2900        } else if (IS_SKYLAKE(dev_priv)) {
2901                if (IS_SKL_GT2(dev_priv))
2902                        i915_perf_load_test_config_sklgt2(dev_priv);
2903                else if (IS_SKL_GT3(dev_priv))
2904                        i915_perf_load_test_config_sklgt3(dev_priv);
2905                else if (IS_SKL_GT4(dev_priv))
2906                        i915_perf_load_test_config_sklgt4(dev_priv);
2907        } else if (IS_CHERRYVIEW(dev_priv)) {
2908                i915_perf_load_test_config_chv(dev_priv);
2909        } else if (IS_BROADWELL(dev_priv)) {
2910                i915_perf_load_test_config_bdw(dev_priv);
2911        } else if (IS_HASWELL(dev_priv)) {
2912                i915_perf_load_test_config_hsw(dev_priv);
2913        }
2914
2915        if (dev_priv->perf.oa.test_config.id == 0)
2916                goto sysfs_error;
2917
2918        ret = sysfs_create_group(dev_priv->perf.metrics_kobj,
2919                                 &dev_priv->perf.oa.test_config.sysfs_metric);
2920        if (ret)
2921                goto sysfs_error;
2922
2923        atomic_set(&dev_priv->perf.oa.test_config.ref_count, 1);
2924
2925        goto exit;
2926
2927sysfs_error:
2928        kobject_put(dev_priv->perf.metrics_kobj);
2929        dev_priv->perf.metrics_kobj = NULL;
2930
2931exit:
2932        mutex_unlock(&dev_priv->perf.lock);
2933}
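
/*
 * Editorial note: because the metrics kobject is parented to the primary
 * DRM device, the test config registered above surfaces to userspace as
 * e.g.
 *
 *	/sys/class/drm/card0/metrics/<uuid>/id
 *
 * where reading "id" gives the integer to pass as
 * DRM_I915_PERF_PROP_OA_METRICS_SET.
 */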
2934
2935/**
2936 * i915_perf_unregister - hide i915-perf from userspace
2937 * @dev_priv: i915 device instance
2938 *
2939 * i915-perf state cleanup is split up into an 'unregister' and
2940 * 'deinit' phase where the interface is first hidden from
2941 * userspace by i915_perf_unregister() before cleaning up
2942 * remaining state in i915_perf_fini().
2943 */
2944void i915_perf_unregister(struct drm_i915_private *dev_priv)
2945{
2946        if (!dev_priv->perf.metrics_kobj)
2947                return;
2948
2949        sysfs_remove_group(dev_priv->perf.metrics_kobj,
2950                           &dev_priv->perf.oa.test_config.sysfs_metric);
2951
2952        kobject_put(dev_priv->perf.metrics_kobj);
2953        dev_priv->perf.metrics_kobj = NULL;
2954}
2955
2956static bool gen8_is_valid_flex_addr(struct drm_i915_private *dev_priv, u32 addr)
2957{
2958        static const i915_reg_t flex_eu_regs[] = {
2959                EU_PERF_CNTL0,
2960                EU_PERF_CNTL1,
2961                EU_PERF_CNTL2,
2962                EU_PERF_CNTL3,
2963                EU_PERF_CNTL4,
2964                EU_PERF_CNTL5,
2965                EU_PERF_CNTL6,
2966        };
2967        int i;
2968
2969        for (i = 0; i < ARRAY_SIZE(flex_eu_regs); i++) {
2970                if (i915_mmio_reg_offset(flex_eu_regs[i]) == addr)
2971                        return true;
2972        }
2973        return false;
2974}
2975
2976static bool gen7_is_valid_b_counter_addr(struct drm_i915_private *dev_priv, u32 addr)
2977{
2978        return (addr >= i915_mmio_reg_offset(OASTARTTRIG1) &&
2979                addr <= i915_mmio_reg_offset(OASTARTTRIG8)) ||
2980                (addr >= i915_mmio_reg_offset(OAREPORTTRIG1) &&
2981                 addr <= i915_mmio_reg_offset(OAREPORTTRIG8)) ||
2982                (addr >= i915_mmio_reg_offset(OACEC0_0) &&
2983                 addr <= i915_mmio_reg_offset(OACEC7_1));
2984}
2985
2986static bool gen7_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
2987{
2988        return addr == i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) ||
2989                (addr >= i915_mmio_reg_offset(MICRO_BP0_0) &&
2990                 addr <= i915_mmio_reg_offset(NOA_WRITE)) ||
2991                (addr >= i915_mmio_reg_offset(OA_PERFCNT1_LO) &&
2992                 addr <= i915_mmio_reg_offset(OA_PERFCNT2_HI)) ||
2993                (addr >= i915_mmio_reg_offset(OA_PERFMATRIX_LO) &&
2994                 addr <= i915_mmio_reg_offset(OA_PERFMATRIX_HI));
2995}
2996
2997static bool gen8_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
2998{
2999        return gen7_is_valid_mux_addr(dev_priv, addr) ||
3000                addr == i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) ||
3001                (addr >= i915_mmio_reg_offset(RPM_CONFIG0) &&
3002                 addr <= i915_mmio_reg_offset(NOA_CONFIG(8)));
3003}
3004
3005static bool gen10_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3006{
3007        return gen8_is_valid_mux_addr(dev_priv, addr) ||
3008                addr == i915_mmio_reg_offset(GEN10_NOA_WRITE_HIGH) ||
3009                (addr >= i915_mmio_reg_offset(OA_PERFCNT3_LO) &&
3010                 addr <= i915_mmio_reg_offset(OA_PERFCNT4_HI));
3011}
3012
3013static bool hsw_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3014{
3015        return gen7_is_valid_mux_addr(dev_priv, addr) ||
3016                (addr >= 0x25100 && addr <= 0x2FF90) ||
3017                (addr >= i915_mmio_reg_offset(HSW_MBVID2_NOA0) &&
3018                 addr <= i915_mmio_reg_offset(HSW_MBVID2_NOA9)) ||
3019                addr == i915_mmio_reg_offset(HSW_MBVID2_MISR0);
3020}
3021
3022static bool chv_is_valid_mux_addr(struct drm_i915_private *dev_priv, u32 addr)
3023{
3024        return gen7_is_valid_mux_addr(dev_priv, addr) ||
3025                (addr >= 0x182300 && addr <= 0x1823A4);
3026}
3027
3028static u32 mask_reg_value(u32 reg, u32 val)
3029{
3030        /* HALF_SLICE_CHICKEN2 is programmed with the
3031         * WaDisableSTUnitPowerOptimization workaround. Make sure the value
3032         * programmed by userspace doesn't change this.
3033         */
3034        if (i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) == reg)
3035                val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
3036
3037        /* WAIT_FOR_RC6_EXIT has only one bit fulfilling the function
3038         * indicated by its name and a bunch of selection fields used by OA
3039         * configs.
3040         */
3041        if (i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) == reg)
3042                val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);
3043
3044        return val;
3045}
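
/*
 * Editorial note: both registers above are "masked" registers, where the
 * upper 16 bits of a write select which of the lower 16 bits actually get
 * updated; _MASKED_BIT_ENABLE(bit) expands to (bit << 16 | bit). Clearing
 * both halves therefore stops a userspace config from touching the bit at
 * all, rather than merely writing a zero to it.
 */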
3046
3047static struct i915_oa_reg *alloc_oa_regs(struct drm_i915_private *dev_priv,
3048                                         bool (*is_valid)(struct drm_i915_private *dev_priv, u32 addr),
3049                                         u32 __user *regs,
3050                                         u32 n_regs)
3051{
3052        struct i915_oa_reg *oa_regs;
3053        int err;
3054        u32 i;
3055
3056        if (!n_regs)
3057                return NULL;
3058
3059        if (!access_ok(regs, n_regs * sizeof(u32) * 2))
3060                return ERR_PTR(-EFAULT);
3061
3062        /* No is_valid function means we're not allowing any register to be programmed. */
3063        GEM_BUG_ON(!is_valid);
3064        if (!is_valid)
3065                return ERR_PTR(-EINVAL);
3066
3067        oa_regs = kmalloc_array(n_regs, sizeof(*oa_regs), GFP_KERNEL);
3068        if (!oa_regs)
3069                return ERR_PTR(-ENOMEM);
3070
3071        for (i = 0; i < n_regs; i++) {
3072                u32 addr, value;
3073
3074                err = get_user(addr, regs);
3075                if (err)
3076                        goto addr_err;
3077
3078                if (!is_valid(dev_priv, addr)) {
3079                        DRM_DEBUG("Invalid oa_reg address: %X\n", addr);
3080                        err = -EINVAL;
3081                        goto addr_err;
3082                }
3083
3084                err = get_user(value, regs + 1);
3085                if (err)
3086                        goto addr_err;
3087
3088                oa_regs[i].addr = _MMIO(addr);
3089                oa_regs[i].value = mask_reg_value(addr, value);
3090
3091                regs += 2;
3092        }
3093
3094        return oa_regs;
3095
3096addr_err:
3097        kfree(oa_regs);
3098        return ERR_PTR(err);
3099}
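
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * driver): the @regs blob consumed above is a flat u32 array of
 * (mmio address, value) pairs. The writes here target NOA_WRITE (0x9888)
 * with placeholder values, not a meaningful NOA programming.
 *
 *	#include <stdint.h>
 *
 *	uint32_t mux_regs[] = {
 *		0x9888, 0x10800000,
 *		0x9888, 0x00000004,
 *	};
 *	uint32_t n_mux_regs = sizeof(mux_regs) / (2 * sizeof(uint32_t));
 */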
3100
3101static ssize_t show_dynamic_id(struct device *dev,
3102                               struct device_attribute *attr,
3103                               char *buf)
3104{
3105        struct i915_oa_config *oa_config =
3106                container_of(attr, typeof(*oa_config), sysfs_metric_id);
3107
3108        return sprintf(buf, "%d\n", oa_config->id);
3109}
3110
3111static int create_dynamic_oa_sysfs_entry(struct drm_i915_private *dev_priv,
3112                                         struct i915_oa_config *oa_config)
3113{
3114        sysfs_attr_init(&oa_config->sysfs_metric_id.attr);
3115        oa_config->sysfs_metric_id.attr.name = "id";
3116        oa_config->sysfs_metric_id.attr.mode = S_IRUGO;
3117        oa_config->sysfs_metric_id.show = show_dynamic_id;
3118        oa_config->sysfs_metric_id.store = NULL;
3119
3120        oa_config->attrs[0] = &oa_config->sysfs_metric_id.attr;
3121        oa_config->attrs[1] = NULL;
3122
3123        oa_config->sysfs_metric.name = oa_config->uuid;
3124        oa_config->sysfs_metric.attrs = oa_config->attrs;
3125
3126        return sysfs_create_group(dev_priv->perf.metrics_kobj,
3127                                  &oa_config->sysfs_metric);
3128}
3129
3130/**
3131 * i915_perf_add_config_ioctl - DRM ioctl() for userspace to add a new OA config
3132 * @dev: drm device
3133 * @data: ioctl data (pointer to struct drm_i915_perf_oa_config) copied from
3134 *        userspace (unvalidated)
3135 * @file: drm file
3136 *
3137 * Validates the submitted OA registers to be saved into a new OA config that
3138 * can then be used for programming the OA unit and its NOA network.
3139 *
3140 * Returns: A newly allocated config number to be used with the perf open ioctl
3141 * or a negative error code on failure.
3142 */
3143int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
3144                               struct drm_file *file)
3145{
3146        struct drm_i915_private *dev_priv = dev->dev_private;
3147        struct drm_i915_perf_oa_config *args = data;
3148        struct i915_oa_config *oa_config, *tmp;
3149        int err, id;
3150
3151        if (!dev_priv->perf.initialized) {
3152                DRM_DEBUG("i915 perf interface not available for this system\n");
3153                return -ENOTSUPP;
3154        }
3155
3156        if (!dev_priv->perf.metrics_kobj) {
3157                DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
3158                return -EINVAL;
3159        }
3160
3161        if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
3162                DRM_DEBUG("Insufficient privileges to add i915 OA config\n");
3163                return -EACCES;
3164        }
3165
3166        if ((!args->mux_regs_ptr || !args->n_mux_regs) &&
3167            (!args->boolean_regs_ptr || !args->n_boolean_regs) &&
3168            (!args->flex_regs_ptr || !args->n_flex_regs)) {
3169                DRM_DEBUG("No OA registers given\n");
3170                return -EINVAL;
3171        }
3172
3173        oa_config = kzalloc(sizeof(*oa_config), GFP_KERNEL);
3174        if (!oa_config) {
3175                DRM_DEBUG("Failed to allocate memory for the OA config\n");
3176                return -ENOMEM;
3177        }
3178
3179        atomic_set(&oa_config->ref_count, 1);
3180
3181        if (!uuid_is_valid(args->uuid)) {
3182                DRM_DEBUG("Invalid uuid format for OA config\n");
3183                err = -EINVAL;
3184                goto reg_err;
3185        }
3186
3187        /* Last character in oa_config->uuid will be 0 because oa_config was
3188         * allocated with kzalloc().
3189         */
3190        memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid));
3191
3192        oa_config->mux_regs_len = args->n_mux_regs;
3193        oa_config->mux_regs =
3194                alloc_oa_regs(dev_priv,
3195                              dev_priv->perf.oa.ops.is_valid_mux_reg,
3196                              u64_to_user_ptr(args->mux_regs_ptr),
3197                              args->n_mux_regs);
3198
3199        if (IS_ERR(oa_config->mux_regs)) {
3200                DRM_DEBUG("Failed to create OA config for mux_regs\n");
3201                err = PTR_ERR(oa_config->mux_regs);
3202                goto reg_err;
3203        }
3204
3205        oa_config->b_counter_regs_len = args->n_boolean_regs;
3206        oa_config->b_counter_regs =
3207                alloc_oa_regs(dev_priv,
3208                              dev_priv->perf.oa.ops.is_valid_b_counter_reg,
3209                              u64_to_user_ptr(args->boolean_regs_ptr),
3210                              args->n_boolean_regs);
3211
3212        if (IS_ERR(oa_config->b_counter_regs)) {
3213                DRM_DEBUG("Failed to create OA config for b_counter_regs\n");
3214                err = PTR_ERR(oa_config->b_counter_regs);
3215                goto reg_err;
3216        }
3217
3218        if (INTEL_GEN(dev_priv) < 8) {
3219                if (args->n_flex_regs != 0) {
3220                        err = -EINVAL;
3221                        goto reg_err;
3222                }
3223        } else {
3224                oa_config->flex_regs_len = args->n_flex_regs;
3225                oa_config->flex_regs =
3226                        alloc_oa_regs(dev_priv,
3227                                      dev_priv->perf.oa.ops.is_valid_flex_reg,
3228                                      u64_to_user_ptr(args->flex_regs_ptr),
3229                                      args->n_flex_regs);
3230
3231                if (IS_ERR(oa_config->flex_regs)) {
3232                        DRM_DEBUG("Failed to create OA config for flex_regs\n");
3233                        err = PTR_ERR(oa_config->flex_regs);
3234                        goto reg_err;
3235                }
3236        }
3237
3238        err = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
3239        if (err)
3240                goto reg_err;
3241
3242        /* We shouldn't have too many configs, so this iteration shouldn't be
3243         * too costly.
3244         */
3245        idr_for_each_entry(&dev_priv->perf.metrics_idr, tmp, id) {
3246                if (!strcmp(tmp->uuid, oa_config->uuid)) {
3247                        DRM_DEBUG("OA config already exists with this uuid\n");
3248                        err = -EADDRINUSE;
3249                        goto sysfs_err;
3250                }
3251        }
3252
3253        err = create_dynamic_oa_sysfs_entry(dev_priv, oa_config);
3254        if (err) {
3255                DRM_DEBUG("Failed to create sysfs entry for OA config\n");
3256                goto sysfs_err;
3257        }
3258
3259        /* Config id 0 is invalid, id 1 for kernel stored test config. */
3260        oa_config->id = idr_alloc(&dev_priv->perf.metrics_idr,
3261                                  oa_config, 2,
3262                                  0, GFP_KERNEL);
3263        if (oa_config->id < 0) {
3264                DRM_DEBUG("Failed to create sysfs entry for OA config\n");
3265                err = oa_config->id;
3266                goto sysfs_err;
3267        }
3268
3269        mutex_unlock(&dev_priv->perf.metrics_lock);
3270
3271        DRM_DEBUG("Added config %s id=%i\n", oa_config->uuid, oa_config->id);
3272
3273        return oa_config->id;
3274
3275sysfs_err:
3276        mutex_unlock(&dev_priv->perf.metrics_lock);
3277reg_err:
3278        put_oa_config(dev_priv, oa_config);
3279        DRM_DEBUG("Failed to add new OA config\n");
3280        return err;
3281}
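
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * driver): registering the mux_regs pairs from the alloc_oa_regs() sketch
 * under a caller-chosen UUID (the string below is an arbitrary example).
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <drm/i915_drm.h>
 *
 *	struct drm_i915_perf_oa_config config;
 *	int config_id;
 *
 *	memset(&config, 0, sizeof(config));
 *	memcpy(config.uuid, "01234567-0123-0123-0123-0123456789ab",
 *	       sizeof(config.uuid));
 *	config.n_mux_regs = n_mux_regs;
 *	config.mux_regs_ptr = (uintptr_t)mux_regs;
 *
 *	config_id = ioctl(drm_fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
 */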
3282
3283/**
3284 * i915_perf_remove_config_ioctl - DRM ioctl() for userspace to remove an OA config
3285 * @dev: drm device
3286 * @data: ioctl data (pointer to u64 integer) copied from userspace
3287 * @file: drm file
3288 *
3289 * Configs can be removed while being used; they will stop appearing in sysfs
3290 * and their content will be freed when the stream using the config is closed.
3291 *
3292 * Returns: 0 on success or a negative error code on failure.
3293 */
3294int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
3295                                  struct drm_file *file)
3296{
3297        struct drm_i915_private *dev_priv = dev->dev_private;
3298        u64 *arg = data;
3299        struct i915_oa_config *oa_config;
3300        int ret;
3301
3302        if (!dev_priv->perf.initialized) {
3303                DRM_DEBUG("i915 perf interface not available for this system\n");
3304                return -ENOTSUPP;
3305        }
3306
3307        if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
3308                DRM_DEBUG("Insufficient privileges to remove i915 OA config\n");
3309                return -EACCES;
3310        }
3311
3312        ret = mutex_lock_interruptible(&dev_priv->perf.metrics_lock);
3313        if (ret)
3314                goto lock_err;
3315
3316        oa_config = idr_find(&dev_priv->perf.metrics_idr, *arg);
3317        if (!oa_config) {
3318                DRM_DEBUG("Failed to remove unknown OA config\n");
3319                ret = -ENOENT;
3320                goto config_err;
3321        }
3322
3323        GEM_BUG_ON(*arg != oa_config->id);
3324
3325        sysfs_remove_group(dev_priv->perf.metrics_kobj,
3326                           &oa_config->sysfs_metric);
3327
3328        idr_remove(&dev_priv->perf.metrics_idr, *arg);
3329
3330        DRM_DEBUG("Removed config %s id=%i\n", oa_config->uuid, oa_config->id);
3331
3332        put_oa_config(dev_priv, oa_config);
3333
3334config_err:
3335        mutex_unlock(&dev_priv->perf.metrics_lock);
3336lock_err:
3337        return ret;
3338}
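
/*
 * Illustrative userspace sketch (editorial addition, not part of the
 * driver): removal takes the u64 ID returned by the add ioctl, passed by
 * pointer.
 *
 *	uint64_t id = config_id;
 *
 *	ioctl(drm_fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &id);
 */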
3339
3340static struct ctl_table oa_table[] = {
3341        {
3342         .procname = "perf_stream_paranoid",
3343         .data = &i915_perf_stream_paranoid,
3344         .maxlen = sizeof(i915_perf_stream_paranoid),
3345         .mode = 0644,
3346         .proc_handler = proc_dointvec_minmax,
3347         .extra1 = &zero,
3348         .extra2 = &one,
3349         },
3350        {
3351         .procname = "oa_max_sample_rate",
3352         .data = &i915_oa_max_sample_rate,
3353         .maxlen = sizeof(i915_oa_max_sample_rate),
3354         .mode = 0644,
3355         .proc_handler = proc_dointvec_minmax,
3356         .extra1 = &zero,
3357         .extra2 = &oa_sample_rate_hard_limit,
3358         },
3359        {}
3360};
3361
3362static struct ctl_table i915_root[] = {
3363        {
3364         .procname = "i915",
3365         .maxlen = 0,
3366         .mode = 0555,
3367         .child = oa_table,
3368         },
3369        {}
3370};
3371
3372static struct ctl_table dev_root[] = {
3373        {
3374         .procname = "dev",
3375         .maxlen = 0,
3376         .mode = 0555,
3377         .child = i915_root,
3378         },
3379        {}
3380};
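
/*
 * Editorial note: once i915_perf_init() below registers dev_root, these
 * knobs appear as:
 *
 *	/proc/sys/dev/i915/perf_stream_paranoid (dev.i915.perf_stream_paranoid)
 *	/proc/sys/dev/i915/oa_max_sample_rate   (dev.i915.oa_max_sample_rate)
 */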
3381
3382/**
3383 * i915_perf_init - initialize i915-perf state on module load
3384 * @dev_priv: i915 device instance
3385 *
3386 * Initializes i915-perf state without exposing anything to userspace.
3387 *
3388 * Note: i915-perf initialization is split into an 'init' and 'register'
3389 * phase with i915_perf_register() exposing state to userspace.
3390 */
3391void i915_perf_init(struct drm_i915_private *dev_priv)
3392{
3393        if (IS_HASWELL(dev_priv)) {
3394                dev_priv->perf.oa.ops.is_valid_b_counter_reg =
3395                        gen7_is_valid_b_counter_addr;
3396                dev_priv->perf.oa.ops.is_valid_mux_reg =
3397                        hsw_is_valid_mux_addr;
3398                dev_priv->perf.oa.ops.is_valid_flex_reg = NULL;
3399                dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
3400                dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
3401                dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
3402                dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
3403                dev_priv->perf.oa.ops.read = gen7_oa_read;
3404                dev_priv->perf.oa.ops.oa_hw_tail_read =
3405                        gen7_oa_hw_tail_read;
3406
3407                dev_priv->perf.oa.oa_formats = hsw_oa_formats;
3408        } else if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
3409                /* Note that although we could theoretically also support the
3410                 * legacy ringbuffer mode on BDW (and earlier iterations of
3411                 * this driver, before upstreaming did this) it didn't seem
3412                 * worth the complexity to maintain now that BDW+ enable
3413                 * execlist mode by default.
3414                 */
3415                dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
3416
3417                dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
3418                dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
3419                dev_priv->perf.oa.ops.read = gen8_oa_read;
3420                dev_priv->perf.oa.ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
3421
3422                if (IS_GEN_RANGE(dev_priv, 8, 9)) {
3423                        dev_priv->perf.oa.ops.is_valid_b_counter_reg =
3424                                gen7_is_valid_b_counter_addr;
3425                        dev_priv->perf.oa.ops.is_valid_mux_reg =
3426                                gen8_is_valid_mux_addr;
3427                        dev_priv->perf.oa.ops.is_valid_flex_reg =
3428                                gen8_is_valid_flex_addr;
3429
3430                        if (IS_CHERRYVIEW(dev_priv)) {
3431                                dev_priv->perf.oa.ops.is_valid_mux_reg =
3432                                        chv_is_valid_mux_addr;
3433                        }
3434
3435                        dev_priv->perf.oa.ops.enable_metric_set = gen8_enable_metric_set;
3436                        dev_priv->perf.oa.ops.disable_metric_set = gen8_disable_metric_set;
3437
3438                        if (IS_GEN(dev_priv, 8)) {
3439                                dev_priv->perf.oa.ctx_oactxctrl_offset = 0x120;
3440                                dev_priv->perf.oa.ctx_flexeu0_offset = 0x2ce;
3441
3442                                dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
3443                        } else {
3444                                dev_priv->perf.oa.ctx_oactxctrl_offset = 0x128;
3445                                dev_priv->perf.oa.ctx_flexeu0_offset = 0x3de;
3446
3447                                dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
3448                        }
3449                } else if (IS_GEN_RANGE(dev_priv, 10, 11)) {
3450                        dev_priv->perf.oa.ops.is_valid_b_counter_reg =
3451                                gen7_is_valid_b_counter_addr;
3452                        dev_priv->perf.oa.ops.is_valid_mux_reg =
3453                                gen10_is_valid_mux_addr;
3454                        dev_priv->perf.oa.ops.is_valid_flex_reg =
3455                                gen8_is_valid_flex_addr;
3456
3457                        dev_priv->perf.oa.ops.enable_metric_set = gen8_enable_metric_set;
3458                        dev_priv->perf.oa.ops.disable_metric_set = gen10_disable_metric_set;
3459
3460                        dev_priv->perf.oa.ctx_oactxctrl_offset = 0x128;
3461                        dev_priv->perf.oa.ctx_flexeu0_offset = 0x3de;
3462
3463                        dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
3464                }
3465        }
3466
3467        if (dev_priv->perf.oa.ops.enable_metric_set) {
3468                hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
3469                                CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3470                dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
3471                init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
3472
3473                INIT_LIST_HEAD(&dev_priv->perf.streams);
3474                mutex_init(&dev_priv->perf.lock);
3475                spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
3476
3477                oa_sample_rate_hard_limit = 1000 *
3478                        (RUNTIME_INFO(dev_priv)->cs_timestamp_frequency_khz / 2);
3479                dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
3480
3481                mutex_init(&dev_priv->perf.metrics_lock);
3482                idr_init(&dev_priv->perf.metrics_idr);
3483
3484                dev_priv->perf.initialized = true;
3485        }
3486}
3487
3488static int destroy_config(int id, void *p, void *data)
3489{
3490        struct drm_i915_private *dev_priv = data;
3491        struct i915_oa_config *oa_config = p;
3492
3493        put_oa_config(dev_priv, oa_config);
3494
3495        return 0;
3496}
3497
3498/**
3499 * i915_perf_fini - Counterpart to i915_perf_init()
3500 * @dev_priv: i915 device instance
3501 */
3502void i915_perf_fini(struct drm_i915_private *dev_priv)
3503{
3504        if (!dev_priv->perf.initialized)
3505                return;
3506
3507        idr_for_each(&dev_priv->perf.metrics_idr, destroy_config, dev_priv);
3508        idr_destroy(&dev_priv->perf.metrics_idr);
3509
3510        unregister_sysctl_table(dev_priv->perf.sysctl_header);
3511
3512        memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
3513
3514        dev_priv->perf.initialized = false;
3515}
3516