linux/drivers/gpu/drm/i915/i915_perf.c
   1/*
   2 * Copyright © 2015-2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * Authors:
  24 *   Robert Bragg <robert@sixbynine.org>
  25 */
  26
  27
  28/**
  29 * DOC: i915 Perf Overview
  30 *
  31 * Gen graphics supports a large number of performance counters that can help
  32 * driver and application developers understand and optimize their use of the
  33 * GPU.
  34 *
  35 * This i915 perf interface enables userspace to configure and open a file
  36 * descriptor representing a stream of GPU metrics which can then be read() as
  37 * a stream of sample records.
  38 *
  39 * The interface is particularly suited to exposing buffered metrics that are
  40 * captured by DMA from the GPU, unsynchronized with and unrelated to the CPU.
  41 *
  42 * Streams representing a single context are accessible to applications with a
  43 * corresponding drm file descriptor, such that OpenGL can use the interface
  44 * without special privileges. Access to system-wide metrics requires root
  45 * privileges by default, unless changed via the dev.i915.perf_stream_paranoid
  46 * sysctl option.
  47 *
  48 */
  49
  50/**
  51 * DOC: i915 Perf History and Comparison with Core Perf
  52 *
  53 * The interface was initially inspired by the core Perf infrastructure but
  54 * some notable differences are:
  55 *
  56 * i915 perf file descriptors represent a "stream" instead of an "event"; where
  57 * a perf event primarily corresponds to a single 64bit value, while a stream
  58 * might sample sets of tightly-coupled counters, depending on the
  59 * configuration.  For example the Gen OA unit isn't designed to support
  60 * orthogonal configurations of individual counters; it's configured for a set
  61 * of related counters. Samples for an i915 perf stream capturing OA metrics
  62 * will include a set of counter values packed in a compact HW specific format.
  63 * The OA unit supports a number of different packing formats which can be
  64 * selected by the user opening the stream. Perf has support for grouping
  65 * events, but each event in the group is configured, validated and
  66 * authenticated individually with separate system calls.
  67 *
  68 * i915 perf stream configurations are provided as an array of u64 (key,value)
  69 * pairs, instead of a fixed struct with multiple miscellaneous config members,
  70 * interleaved with event-type specific members.
  71 *
  72 * i915 perf doesn't support exposing metrics via an mmap'd circular buffer.
  73 * The supported metrics are being written to memory by the GPU unsynchronized
  74 * with the CPU, using HW specific packing formats for counter sets. Sometimes
  75 * the constraints on HW configuration require reports to be filtered before it
  76 * would be acceptable to expose them to unprivileged applications - to hide
  77 * the metrics of other processes/contexts. For these use cases a read() based
  78 * interface is a good fit, and provides an opportunity to filter data as it
  79 * gets copied from the GPU mapped buffers to userspace buffers.
  80 *
  81 *
  82 * Issues hit with first prototype based on Core Perf
  83 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  84 *
  85 * The first prototype of this driver was based on the core perf
  86 * infrastructure, and while we did make that mostly work, with some changes to
  87 * perf, we found we were breaking or working around too many assumptions baked
  88 * into perf's currently cpu centric design.
  89 *
  90 * In the end we didn't see a clear benefit to making perf's implementation and
  91 * interface more complex by changing design assumptions while we knew we still
  92 * wouldn't be able to use any existing perf based userspace tools.
  93 *
  94 * Also considering the Gen specific nature of the Observability hardware and
  95 * how userspace will sometimes need to combine i915 perf OA metrics with
  96 * side-band OA data captured via MI_REPORT_PERF_COUNT commands, we're
  97 * expecting the interface to be used by a platform specific userspace such as
  98 * OpenGL or tools. This is to say, we aren't inherently missing out on having
  99 * a standard vendor/architecture agnostic interface by not using perf.
 100 *
 101 *
 102 * For posterity, in case we might re-visit trying to adapt core perf to be
 103 * better suited to exposing i915 metrics, these were the main pain points we
 104 * hit:
 105 *
 106 * - The perf based OA PMU driver broke some significant design assumptions:
 107 *
 108 *   Existing perf pmus are used for profiling work on a cpu and we were
 109 *   introducing the idea of _IS_DEVICE pmus with different security
 110 *   implications, the need to fake cpu-related data (such as user/kernel
 111 *   registers) to fit with perf's current design, and adding _DEVICE records
 112 *   as a way to forward device-specific status records.
 113 *
 114 *   The OA unit writes reports of counters into a circular buffer, without
 115 *   involvement from the CPU, making our PMU driver the first of its kind.
 116 *
 117 *   Given the way we were periodically forwarding data from the GPU-mapped OA
 118 *   buffer to perf's buffer, those bursts of sample writes looked to perf like
 119 *   we were sampling too fast and so we had to subvert its throttling checks.
 120 *
 121 *   Perf supports groups of counters and allows those to be read via
 122 *   transactions internally but transactions currently seem designed to be
 123 *   explicitly initiated from the cpu (say in response to a userspace read())
 124 *   and while we could pull a report out of the OA buffer we can't
 125 *   trigger a report from the cpu on demand.
 126 *
 127 *   Related to being report based, the OA counters are configured in HW as a
 128 *   set while perf generally expects counter configurations to be orthogonal.
 129 *   Although counters can be associated with a group leader as they are
 130 *   opened, there's no clear precedent for being able to provide group-wide
 131 *   configuration attributes (for example we want to let userspace choose the
 132 *   OA unit report format used to capture all counters in a set, or specify a
 133 *   GPU context to filter metrics on). We avoided using perf's grouping
 134 *   feature and forwarded OA reports to userspace via perf's 'raw' sample
 135 *   field. This suited our userspace well considering how coupled the counters
 136 *   are when dealing with normalizing. It would be inconvenient to split
 137 *   counters up into separate events, only to require userspace to recombine
 138 *   them. For Mesa it's also convenient to be forwarded raw, periodic reports
 139 *   for combining with the side-band raw reports it captures using
 140 *   MI_REPORT_PERF_COUNT commands.
 141 *
 142 *   - As a side note on perf's grouping feature: there was also some concern
 143 *     that using PERF_FORMAT_GROUP as a way to pack together counter values
 144 *     would quite drastically inflate our sample sizes, which would likely
 145 *     lower the effective sampling resolutions we could use when the available
 146 *     memory bandwidth is limited.
 147 *
 148 *     With the OA unit's report formats, counters are packed together as 32
 149 *     or 40bit values, with the largest report size being 256 bytes.
 150 *
 151 *     PERF_FORMAT_GROUP values are 64bit, but there doesn't appear to be a
 152 *     documented ordering to the values, implying PERF_FORMAT_ID must also be
 153 *     used to add a 64bit ID before each value; giving 16 bytes per counter.
 154 *
 155 *   Related to counter orthogonality, we can't time-share the OA unit, while
 156 *   event scheduling is a central design idea within perf for allowing
 157 *   userspace to open + enable more events than can be configured in HW at any
 158 *   one time.  The OA unit is not designed to allow re-configuration while in
 159 *   use. We can't reconfigure the OA unit without losing internal OA unit
 160 *   state which we can't access explicitly to save and restore. Reconfiguring
 161 *   the OA unit is also relatively slow, involving ~100 register writes. From
 162 *   userspace Mesa also depends on a stable OA configuration when emitting
 163 *   MI_REPORT_PERF_COUNT commands and importantly the OA unit can't be
 164 *   disabled while there are outstanding MI_RPC commands lest we hang the
 165 *   command streamer.
 166 *
 167 *   The contents of sample records aren't extensible by device drivers (i.e.
 168 *   the sample_type bits). As an example; Sourab Gupta had been looking to
 169 *   attach GPU timestamps to our OA samples. We were shoehorning OA reports
 170 *   into sample records by using the 'raw' field, but it's tricky to pack more
 171 *   than one thing into this field because events/core.c currently only lets a
 172 *   pmu give a single raw data pointer plus len which will be copied into the
 173 *   ring buffer. To include more than the OA report we'd have to copy the
 174 *   report into an intermediate larger buffer. I'd been considering allowing a
 175 *   vector of data+len values to be specified for copying the raw data, but
 176 *   it felt like a kludge to be using the raw field for this purpose.
 177 *
 178 * - It felt like our perf based PMU was making some technical compromises
 179 *   just for the sake of using perf:
 180 *
 181 *   perf_event_open() requires events to either relate to a pid or a specific
 182 *   cpu core, while our device pmu related to neither.  Events opened with a
 183 *   pid will be automatically enabled/disabled according to the scheduling of
 184 *   that process - so not appropriate for us. When an event is related to a
 185 *   cpu id, perf ensures pmu methods will be invoked via an inter-processor
 186 *   interrupt on that core. To avoid invasive changes, our userspace opened OA
 187 *   perf events for a specific cpu. This was workable but it meant the
 188 *   majority of the OA driver ran in atomic context, including all OA report
 189 *   forwarding, which wasn't really necessary in our case and seemed to make
 190 *   our locking requirements somewhat complex as we handled the interaction
 191 *   with the rest of the i915 driver.
 192 */
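
/*
 * Purely as an illustration of the (key, value) property interface described
 * above, a minimal userspace sketch of opening a system-wide OA stream might
 * look roughly like the following. The property and flag names come from the
 * uapi header (include/uapi/drm/i915_drm.h); the chosen metrics set ID,
 * report format and exponent are illustrative assumptions only, and drm_fd is
 * an already-open DRM device fd.
 *
 *	uint64_t properties[] = {
 *		DRM_I915_PERF_PROP_SAMPLE_OA, 1,
 *		DRM_I915_PERF_PROP_OA_METRICS_SET, 1,
 *		DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 *		DRM_I915_PERF_PROP_OA_EXPONENT, 16,
 *	};
 *	struct drm_i915_perf_open_param param = {
 *		.flags = I915_PERF_FLAG_FD_CLOEXEC,
 *		.num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
 *		.properties_ptr = (uint64_t)(uintptr_t)properties,
 *	};
 *	int stream_fd = ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
 *
 * On success the ioctl returns a new file descriptor which can then be
 * read()/poll()ed for sample records.
 */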
 193
 194#include <linux/anon_inodes.h>
 195#include <linux/sizes.h>
 196#include <linux/uuid.h>
 197
 198#include "gem/i915_gem_context.h"
 199#include "gt/intel_engine_pm.h"
 200#include "gt/intel_engine_user.h"
 201#include "gt/intel_gt.h"
 202#include "gt/intel_lrc_reg.h"
 203#include "gt/intel_ring.h"
 204
 205#include "i915_drv.h"
 206#include "i915_perf.h"
 207#include "oa/i915_oa_hsw.h"
 208#include "oa/i915_oa_bdw.h"
 209#include "oa/i915_oa_chv.h"
 210#include "oa/i915_oa_sklgt2.h"
 211#include "oa/i915_oa_sklgt3.h"
 212#include "oa/i915_oa_sklgt4.h"
 213#include "oa/i915_oa_bxt.h"
 214#include "oa/i915_oa_kblgt2.h"
 215#include "oa/i915_oa_kblgt3.h"
 216#include "oa/i915_oa_glk.h"
 217#include "oa/i915_oa_cflgt2.h"
 218#include "oa/i915_oa_cflgt3.h"
 219#include "oa/i915_oa_cnl.h"
 220#include "oa/i915_oa_icl.h"
 221#include "oa/i915_oa_tgl.h"
 222
 223/* HW requires this to be a power of two, between 128k and 16M, though the
 224 * driver is currently designed assuming the largest 16M size is used such
 225 * that the overflow cases are unlikely in normal operation.
 226 */
 227#define OA_BUFFER_SIZE          SZ_16M
 228
 229#define OA_TAKEN(tail, head)    ((tail - head) & (OA_BUFFER_SIZE - 1))
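
/*
 * OA_TAKEN() relies on OA_BUFFER_SIZE being a power of two so that the mask
 * handles wrap-around of the circular buffer. For example, with the 16M
 * buffer, a head of 0xffff00 (256 bytes before the end) and a tail that has
 * wrapped to 0x100 gives:
 *
 *	OA_TAKEN(0x100, 0xffff00) = (0x100 - 0xffff00) & 0xffffff = 0x200
 *
 * i.e. 512 bytes available: 256 up to the end of the buffer plus 256 from
 * the start.
 */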
 230
 231/**
 232 * DOC: OA Tail Pointer Race
 233 *
 234 * There's a HW race condition between OA unit tail pointer register updates and
 235 * writes to memory whereby the tail pointer can sometimes get ahead of what's
 236 * been written out to the OA buffer so far (in terms of what's visible to the
 237 * CPU).
 238 *
 239 * Although this can be observed explicitly while copying reports to userspace
 240 * by checking for a zeroed report-id field in tail reports, we want to account
 241 * for this earlier, as part of the oa_buffer_check to avoid lots of redundant
 242 * read() attempts.
 243 *
 244 * In effect we define a tail pointer for reading that lags the real tail
 245 * pointer by at least %OA_TAIL_MARGIN_NSEC nanoseconds, which gives enough
 246 * time for the corresponding reports to become visible to the CPU.
 247 *
 248 * To manage this we actually track two tail pointers:
 249 *  1) An 'aging' tail with an associated timestamp that is tracked until we
 250 *     can trust the corresponding data is visible to the CPU; at which point
 251 *     it is considered 'aged'.
 252 *  2) An 'aged' tail that can be used for read()ing.
 253 *
 254 * The two separate pointers let us decouple read()s from tail pointer aging.
 255 *
 256 * The tail pointers are checked and updated at a limited rate within a hrtimer
 257 * callback (the same callback that is used for delivering EPOLLIN events)
 258 *
 259 * Initially the tails are marked invalid with %INVALID_TAIL_PTR which
 260 * indicates that an updated tail pointer is needed.
 261 *
 262 * Most of the implementation details for this workaround are in
 263 * oa_buffer_check_unlocked() and _append_oa_reports()
 264 *
 265 * Note for posterity: previously the driver used to define an effective tail
 266 * pointer that lagged the real pointer by a 'tail margin' measured in bytes
 267 * derived from %OA_TAIL_MARGIN_NSEC and the configured sampling frequency.
 268 * This was flawed considering that the OA unit may also automatically generate
 269 * non-periodic reports (such as on context switch) or the OA unit may be
 270 * enabled without any periodic sampling.
 271 */
 272#define OA_TAIL_MARGIN_NSEC     100000ULL
 273#define INVALID_TAIL_PTR        0xffffffff
 274
 275/* frequency for checking whether the OA unit has written new reports to the
 276 * circular OA buffer...
 277 */
 278#define POLL_FREQUENCY 200
 279#define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY)
 280
 281/* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */
 282static u32 i915_perf_stream_paranoid = true;
 283
 284/* The maximum exponent the hardware accepts is 63 (essentially it selects one
 285 * of the 64bit timestamp bits to trigger reports from) but there's currently
 286 * no known use case for sampling as infrequently as once per 47 thousand years.
 287 *
 288 * Since the timestamps included in OA reports are only 32bits it seems
 289 * reasonable to limit the OA exponent where it's still possible to account for
 290 * overflow in OA report timestamps.
 291 */
 292#define OA_EXPONENT_MAX 31
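
/*
 * The exponent selects which timestamp bit toggling triggers a periodic
 * report, so (taking the ~12.5MHz timestamp frequency of Haswell as an
 * example) the sampling period works out roughly as:
 *
 *	period_ns = (2^(exponent + 1)) * NSEC_PER_SEC / timestamp_frequency_hz
 *
 * giving 160ns for exponent 0 and about 343 seconds for OA_EXPONENT_MAX (31),
 * which is also roughly the point at which the 32bit report timestamps wrap.
 */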
 293
 294#define INVALID_CTX_ID 0xffffffff
 295
 296/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
 297#define OAREPORT_REASON_MASK           0x3f
 298#define OAREPORT_REASON_MASK_EXTENDED  0x7f
 299#define OAREPORT_REASON_SHIFT          19
 300#define OAREPORT_REASON_TIMER          (1<<0)
 301#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
 302#define OAREPORT_REASON_CLK_RATIO      (1<<5)
 303
 304
 305/* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
 306 *
 307 * The highest sampling frequency we can theoretically program the OA unit
 308 * with is always half the timestamp frequency: e.g. 6.25MHz for Haswell.
 309 *
 310 * Initialized just before we register the sysctl parameter.
 311 */
 312static int oa_sample_rate_hard_limit;
 313
 314/* Theoretically we can program the OA unit to sample every 160ns but don't
 315 * allow that by default unless root...
 316 *
 317 * The default threshold of 100000Hz is based on perf's similar
 318 * kernel.perf_event_max_sample_rate sysctl parameter.
 319 */
 320static u32 i915_oa_max_sample_rate = 100000;
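
/*
 * A simplified sketch (not the actual property validation code, which lives
 * in read_properties_unlocked()) of how these two limits interact when a
 * requested OA exponent is checked: the requested period is converted to a
 * sampling frequency, and anything above the soft limit is reserved for
 * privileged (CAP_SYS_ADMIN) callers by default.
 *
 *	u64 oa_freq_hz = div64_u64(NSEC_PER_SEC, oa_period_ns);
 *
 *	if (oa_freq_hz > i915_oa_max_sample_rate && !capable(CAP_SYS_ADMIN))
 *		return -EACCES;
 */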
 321
 322/* XXX: beware if future OA HW adds new report formats that the current
 323 * code assumes all reports have a power-of-two size and ~(size - 1) can
 324 * be used as a mask to align the OA tail pointer.
 325 */
 326static const struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
 327        [I915_OA_FORMAT_A13]        = { 0, 64 },
 328        [I915_OA_FORMAT_A29]        = { 1, 128 },
 329        [I915_OA_FORMAT_A13_B8_C8]  = { 2, 128 },
 330        /* A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size */
 331        [I915_OA_FORMAT_B4_C8]      = { 4, 64 },
 332        [I915_OA_FORMAT_A45_B8_C8]  = { 5, 256 },
 333        [I915_OA_FORMAT_B4_C8_A16]  = { 6, 128 },
 334        [I915_OA_FORMAT_C4_B8]      = { 7, 64 },
 335};
 336
 337static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 338        [I915_OA_FORMAT_A12]                = { 0, 64 },
 339        [I915_OA_FORMAT_A12_B8_C8]          = { 2, 128 },
 340        [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
 341        [I915_OA_FORMAT_C4_B8]              = { 7, 64 },
 342};
 343
 344static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
 345        [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
 346};
 347
 348#define SAMPLE_OA_REPORT      (1<<0)
 349
 350/**
 351 * struct perf_open_properties - for validated properties given to open a stream
 352 * @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
 353 * @single_context: Whether a single or all gpu contexts should be monitored
 354 * @hold_preemption: Whether preemption is disabled for the filtered
 355 *                   context
 356 * @ctx_handle: A gem ctx handle for use with @single_context
 357 * @metrics_set: An ID for an OA unit metric set advertised via sysfs
 358 * @oa_format: An OA unit HW report format
 359 * @oa_periodic: Whether to enable periodic OA unit sampling
 360 * @oa_period_exponent: The OA unit sampling period is derived from this
 361 * @engine: The engine (typically rcs0) being monitored by the OA unit
 362 *
 363 * As read_properties_unlocked() enumerates and validates the properties given
 364 * to open a stream of metrics, the configuration is built up in this
 365 * structure, which starts out zero initialized.
 366 */
 367struct perf_open_properties {
 368        u32 sample_flags;
 369
 370        u64 single_context:1;
 371        u64 hold_preemption:1;
 372        u64 ctx_handle;
 373
 374        /* OA sampling state */
 375        int metrics_set;
 376        int oa_format;
 377        bool oa_periodic;
 378        int oa_period_exponent;
 379
 380        struct intel_engine_cs *engine;
 381};
 382
 383struct i915_oa_config_bo {
 384        struct llist_node node;
 385
 386        struct i915_oa_config *oa_config;
 387        struct i915_vma *vma;
 388};
 389
 390static struct ctl_table_header *sysctl_header;
 391
 392static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
 393
 394void i915_oa_config_release(struct kref *ref)
 395{
 396        struct i915_oa_config *oa_config =
 397                container_of(ref, typeof(*oa_config), ref);
 398
 399        kfree(oa_config->flex_regs);
 400        kfree(oa_config->b_counter_regs);
 401        kfree(oa_config->mux_regs);
 402
 403        kfree_rcu(oa_config, rcu);
 404}
 405
 406struct i915_oa_config *
 407i915_perf_get_oa_config(struct i915_perf *perf, int metrics_set)
 408{
 409        struct i915_oa_config *oa_config;
 410
 411        rcu_read_lock();
 412        if (metrics_set == 1)
 413                oa_config = &perf->test_config;
 414        else
 415                oa_config = idr_find(&perf->metrics_idr, metrics_set);
 416        if (oa_config)
 417                oa_config = i915_oa_config_get(oa_config);
 418        rcu_read_unlock();
 419
 420        return oa_config;
 421}
 422
 423static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
 424{
 425        i915_oa_config_put(oa_bo->oa_config);
 426        i915_vma_put(oa_bo->vma);
 427        kfree(oa_bo);
 428}
 429
 430static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
 431{
 432        struct intel_uncore *uncore = stream->uncore;
 433
 434        return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
 435               GEN12_OAG_OATAILPTR_MASK;
 436}
 437
 438static u32 gen8_oa_hw_tail_read(struct i915_perf_stream *stream)
 439{
 440        struct intel_uncore *uncore = stream->uncore;
 441
 442        return intel_uncore_read(uncore, GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
 443}
 444
 445static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
 446{
 447        struct intel_uncore *uncore = stream->uncore;
 448        u32 oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
 449
 450        return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
 451}
 452
 453/**
 454 * oa_buffer_check_unlocked - check for data and update tail ptr state
 455 * @stream: i915 stream instance
 456 *
 457 * This is either called via fops (for blocking reads in user ctx) or the poll
 458 * check hrtimer (atomic ctx) to check the OA buffer tail pointer and
 459 * determine whether there is data available for userspace to read.
 460 *
 461 * This function is central to providing a workaround for the OA unit tail
 462 * pointer having a race with respect to what data is visible to the CPU.
 463 * It is responsible for reading tail pointers from the hardware and giving
 464 * the pointers time to 'age' before they are made available for reading.
 465 * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
 466 *
 467 * Besides returning true when there is data available to read() this function
 468 * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
 469 * and .aged_tail_idx state used for reading.
 470 *
 471 * Note: It's safe to read OA config state here unlocked, assuming that this is
 472 * only called while the stream is enabled, while the global OA configuration
 473 * can't be modified.
 474 *
 475 * Returns: %true if the OA buffer contains data, else %false
 476 */
 477static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 478{
 479        int report_size = stream->oa_buffer.format_size;
 480        unsigned long flags;
 481        unsigned int aged_idx;
 482        u32 head, hw_tail, aged_tail, aging_tail;
 483        u64 now;
 484
 485        /* We have to consider the (unlikely) possibility that read() errors
 486         * could result in an OA buffer reset which might reset the head,
 487         * tails[] and aged_tail state.
 488         */
 489        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 490
 491        /* NB: The head we observe here might effectively be a little out of
 492         * date (between head and tails[aged_idx].offset) if there is currently
 493         * a read() in progress.
 494         */
 495        head = stream->oa_buffer.head;
 496
 497        aged_idx = stream->oa_buffer.aged_tail_idx;
 498        aged_tail = stream->oa_buffer.tails[aged_idx].offset;
 499        aging_tail = stream->oa_buffer.tails[!aged_idx].offset;
 500
 501        hw_tail = stream->perf->ops.oa_hw_tail_read(stream);
 502
 503        /* The tail pointer increases in 64 byte increments,
 504         * not in report_size steps...
 505         */
 506        hw_tail &= ~(report_size - 1);
 507
 508        now = ktime_get_mono_fast_ns();
 509
 510        /* Update the aged tail
 511         *
 512         * Flip the tail pointer available for read()s once the aging tail is
 513         * old enough to trust that the corresponding data will be visible to
 514         * the CPU...
 515         *
 516         * Do this before updating the aging pointer in case we may be able to
 517         * immediately start aging a new pointer too (if new data has become
 518         * available) without needing to wait for a later hrtimer callback.
 519         */
 520        if (aging_tail != INVALID_TAIL_PTR &&
 521            ((now - stream->oa_buffer.aging_timestamp) >
 522             OA_TAIL_MARGIN_NSEC)) {
 523
 524                aged_idx ^= 1;
 525                stream->oa_buffer.aged_tail_idx = aged_idx;
 526
 527                aged_tail = aging_tail;
 528
 529                /* Mark that we need a new pointer to start aging... */
 530                stream->oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
 531                aging_tail = INVALID_TAIL_PTR;
 532        }
 533
 534        /* Update the aging tail
 535         *
 536         * We throttle aging tail updates until we have a new tail that
 537         * represents >= one report more data than is already available for
 538         * reading. This ensures there will be enough data for a successful
 539         * read once this new pointer has aged and ensures we will give the new
 540         * pointer time to age.
 541         */
 542        if (aging_tail == INVALID_TAIL_PTR &&
 543            (aged_tail == INVALID_TAIL_PTR ||
 544             OA_TAKEN(hw_tail, aged_tail) >= report_size)) {
 545                struct i915_vma *vma = stream->oa_buffer.vma;
 546                u32 gtt_offset = i915_ggtt_offset(vma);
 547
 548                /* Be paranoid and do a bounds check on the pointer read back
 549                 * from hardware, just in case some spurious hardware condition
 550                 * could put the tail out of bounds...
 551                 */
 552                if (hw_tail >= gtt_offset &&
 553                    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
 554                        stream->oa_buffer.tails[!aged_idx].offset =
 555                                aging_tail = hw_tail;
 556                        stream->oa_buffer.aging_timestamp = now;
 557                } else {
 558                        DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %x\n",
 559                                  hw_tail);
 560                }
 561        }
 562
 563        spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 564
 565        return aged_tail == INVALID_TAIL_PTR ?
 566                false : OA_TAKEN(aged_tail, head) >= report_size;
 567}
 568
 569/**
 570 * append_oa_status - Appends a status record to a userspace read() buffer.
 571 * @stream: An i915-perf stream opened for OA metrics
 572 * @buf: destination buffer given by userspace
 573 * @count: the number of bytes userspace wants to read
 574 * @offset: (inout): the current position for writing into @buf
 575 * @type: The kind of status to report to userspace
 576 *
 577 * Writes a status record (such as `DRM_I915_PERF_RECORD_OA_REPORT_LOST`)
 578 * into the userspace read() buffer.
 579 *
 580 * The @buf @offset will only be updated on success.
 581 *
 582 * Returns: 0 on success, negative error code on failure.
 583 */
 584static int append_oa_status(struct i915_perf_stream *stream,
 585                            char __user *buf,
 586                            size_t count,
 587                            size_t *offset,
 588                            enum drm_i915_perf_record_type type)
 589{
 590        struct drm_i915_perf_record_header header = { type, 0, sizeof(header) };
 591
 592        if ((count - *offset) < header.size)
 593                return -ENOSPC;
 594
 595        if (copy_to_user(buf + *offset, &header, sizeof(header)))
 596                return -EFAULT;
 597
 598        (*offset) += header.size;
 599
 600        return 0;
 601}
 602
 603/**
 604 * append_oa_sample - Copies single OA report into userspace read() buffer.
 605 * @stream: An i915-perf stream opened for OA metrics
 606 * @buf: destination buffer given by userspace
 607 * @count: the number of bytes userspace wants to read
 608 * @offset: (inout): the current position for writing into @buf
 609 * @report: A single OA report to (optionally) include as part of the sample
 610 *
 611 * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
 612 * properties when opening a stream, tracked as `stream->sample_flags`. This
 613 * function copies the requested components of a single sample to the given
 614 * read() @buf.
 615 *
 616 * The @buf @offset will only be updated on success.
 617 *
 618 * Returns: 0 on success, negative error code on failure.
 619 */
 620static int append_oa_sample(struct i915_perf_stream *stream,
 621                            char __user *buf,
 622                            size_t count,
 623                            size_t *offset,
 624                            const u8 *report)
 625{
 626        int report_size = stream->oa_buffer.format_size;
 627        struct drm_i915_perf_record_header header;
 628        u32 sample_flags = stream->sample_flags;
 629
 630        header.type = DRM_I915_PERF_RECORD_SAMPLE;
 631        header.pad = 0;
 632        header.size = stream->sample_size;
 633
 634        if ((count - *offset) < header.size)
 635                return -ENOSPC;
 636
 637        buf += *offset;
 638        if (copy_to_user(buf, &header, sizeof(header)))
 639                return -EFAULT;
 640        buf += sizeof(header);
 641
 642        if (sample_flags & SAMPLE_OA_REPORT) {
 643                if (copy_to_user(buf, report, report_size))
 644                        return -EFAULT;
 645        }
 646
 647        (*offset) += header.size;
 648
 649        return 0;
 650}
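
/*
 * For reference, the byte stream these append_* helpers produce is what
 * userspace sees from read(): a sequence of drm_i915_perf_record_header
 * structs, each followed by its payload (for sample records, the raw OA
 * report). A minimal, illustrative consumer loop (error handling omitted,
 * handle_oa_report() is a hypothetical callback) might look like:
 *
 *	uint8_t buf[64 * 1024];
 *	ssize_t len = read(stream_fd, buf, sizeof(buf));
 *	size_t offset = 0;
 *
 *	while (len > 0 && offset < (size_t)len) {
 *		struct drm_i915_perf_record_header header;
 *
 *		memcpy(&header, buf + offset, sizeof(header));
 *		if (header.type == DRM_I915_PERF_RECORD_SAMPLE)
 *			handle_oa_report(buf + offset + sizeof(header));
 *		offset += header.size;
 *	}
 *
 * Records are never split by the driver, so each buffer returned by read()
 * contains only whole records.
 */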
 651
 652/**
 653 * gen8_append_oa_reports - Copies all buffered OA reports into userspace read() buffer.
 654 * @stream: An i915-perf stream opened for OA metrics
 655 * @buf: destination buffer given by userspace
 656 * @count: the number of bytes userspace wants to read
 657 * @offset: (inout): the current position for writing into @buf
 658 *
 659 * Notably any error condition resulting in a short read (-%ENOSPC or
 660 * -%EFAULT) will be returned even though one or more records may
 661 * have been successfully copied. In this case it's up to the caller
 662 * to decide if the error should be squashed before returning to
 663 * userspace.
 664 *
 665 * Note: reports are consumed from the head, and appended to the
 666 * tail, so the tail chases the head?... If you think that's mad
 667 * and back-to-front you're not alone, but this follows the
 668 * Gen PRM naming convention.
 669 *
 670 * Returns: 0 on success, negative error code on failure.
 671 */
 672static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 673                                  char __user *buf,
 674                                  size_t count,
 675                                  size_t *offset)
 676{
 677        struct intel_uncore *uncore = stream->uncore;
 678        int report_size = stream->oa_buffer.format_size;
 679        u8 *oa_buf_base = stream->oa_buffer.vaddr;
 680        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
 681        u32 mask = (OA_BUFFER_SIZE - 1);
 682        size_t start_offset = *offset;
 683        unsigned long flags;
 684        unsigned int aged_tail_idx;
 685        u32 head, tail;
 686        u32 taken;
 687        int ret = 0;
 688
 689        if (WARN_ON(!stream->enabled))
 690                return -EIO;
 691
 692        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 693
 694        head = stream->oa_buffer.head;
 695        aged_tail_idx = stream->oa_buffer.aged_tail_idx;
 696        tail = stream->oa_buffer.tails[aged_tail_idx].offset;
 697
 698        spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 699
 700        /*
 701         * An invalid tail pointer here means we're still waiting for the poll
 702         * hrtimer callback to give us a pointer
 703         */
 704        if (tail == INVALID_TAIL_PTR)
 705                return -EAGAIN;
 706
 707        /*
 708         * NB: oa_buffer.head/tail include the gtt_offset which we don't want
 709         * while indexing relative to oa_buf_base.
 710         */
 711        head -= gtt_offset;
 712        tail -= gtt_offset;
 713
 714        /*
 715         * An out of bounds or misaligned head or tail pointer implies a driver
 716         * bug since we validate + align the tail pointers we read from the
 717         * hardware and we are in full control of the head pointer which should
 718         * only be incremented by multiples of the report size (notably also
 719         * all a power of two).
 720         */
 721        if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
 722                      tail > OA_BUFFER_SIZE || tail % report_size,
 723                      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 724                      head, tail))
 725                return -EIO;
 726
 727
 728        for (/* none */;
 729             (taken = OA_TAKEN(tail, head));
 730             head = (head + report_size) & mask) {
 731                u8 *report = oa_buf_base + head;
 732                u32 *report32 = (void *)report;
 733                u32 ctx_id;
 734                u32 reason;
 735
 736                /*
 737                 * All the report sizes factor neatly into the buffer
 738                 * size so we never expect to see a report split
 739                 * between the beginning and end of the buffer.
 740                 *
 741                 * Given the initial alignment check a misalignment
 742                 * here would imply a driver bug that would result
 743                 * in an overrun.
 744                 */
 745                if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
 746                        DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
 747                        break;
 748                }
 749
 750                /*
 751                 * The reason field includes flags identifying what
 752                 * triggered this specific report (mostly timer
 753                 * triggered or e.g. due to a context switch).
 754                 *
 755                 * This field is never expected to be zero so we can
 756                 * check that the report isn't invalid before copying
 757                 * it to userspace...
 758                 */
 759                reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
 760                          (IS_GEN(stream->perf->i915, 12) ?
 761                           OAREPORT_REASON_MASK_EXTENDED :
 762                           OAREPORT_REASON_MASK));
 763                if (reason == 0) {
 764                        if (__ratelimit(&stream->perf->spurious_report_rs))
 765                                DRM_NOTE("Skipping spurious, invalid OA report\n");
 766                        continue;
 767                }
 768
 769                ctx_id = report32[2] & stream->specific_ctx_id_mask;
 770
 771                /*
 772                 * Squash whatever is in the CTX_ID field if it's marked as
 773                 * invalid to be sure we avoid false-positive, single-context
 774                 * filtering below...
 775                 *
 776                 * Note that we don't clear the valid_ctx_bit so userspace can
 777                 * understand that the ID has been squashed by the kernel.
 778                 */
 779                if (!(report32[0] & stream->perf->gen8_valid_ctx_bit) &&
 780                    INTEL_GEN(stream->perf->i915) <= 11)
 781                        ctx_id = report32[2] = INVALID_CTX_ID;
 782
 783                /*
 784                 * NB: For Gen 8 the OA unit no longer supports clock gating
 785                 * off for a specific context and the kernel can't securely
 786                 * stop the counters from updating as system-wide / global
 787                 * values.
 788                 *
 789                 * Automatic reports now include a context ID so reports can be
 790                 * filtered on the cpu but it's not worth trying to
 791                 * automatically subtract/hide counter progress for other
 792                 * contexts while filtering since we can't stop userspace
 793                 * issuing MI_REPORT_PERF_COUNT commands which would still
 794                 * provide a side-band view of the real values.
 795                 *
 796                 * To allow userspace (such as Mesa/GL_INTEL_performance_query)
 797                 * to normalize counters for a single filtered context, it
 798                 * needs to be forwarded bookend context-switch reports so that it
 799                 * can track switches in between MI_REPORT_PERF_COUNT commands
 800                 * and can itself subtract/ignore the progress of counters
 801                 * associated with other contexts. Note that the hardware
 802                 * automatically triggers reports when switching to a new
 803                 * context which are tagged with the ID of the newly active
 804                 * context. To avoid the complexity (and likely fragility) of
 805                 * reading ahead while parsing reports to try and minimize
 806                 * forwarding redundant context switch reports (i.e. between
 807                 * other, unrelated contexts) we simply elect to forward them
 808                 * all.
 809                 *
 810                 * We don't rely solely on the reason field to identify context
 811                 * switches since it's not uncommon for periodic samples to
 812                 * identify a switch before any 'context switch' report.
 813                 */
 814                if (!stream->perf->exclusive_stream->ctx ||
 815                    stream->specific_ctx_id == ctx_id ||
 816                    stream->oa_buffer.last_ctx_id == stream->specific_ctx_id ||
 817                    reason & OAREPORT_REASON_CTX_SWITCH) {
 818
 819                        /*
 820                         * While filtering for a single context we avoid
 821                         * leaking the IDs of other contexts.
 822                         */
 823                        if (stream->perf->exclusive_stream->ctx &&
 824                            stream->specific_ctx_id != ctx_id) {
 825                                report32[2] = INVALID_CTX_ID;
 826                        }
 827
 828                        ret = append_oa_sample(stream, buf, count, offset,
 829                                               report);
 830                        if (ret)
 831                                break;
 832
 833                        stream->oa_buffer.last_ctx_id = ctx_id;
 834                }
 835
 836                /*
 837                 * The above reason field sanity check is based on
 838                 * the assumption that the OA buffer is initially
 839                 * zeroed and we reset the field after copying so the
 840                 * check is still meaningful once old reports start
 841                 * being overwritten.
 842                 */
 843                report32[0] = 0;
 844        }
 845
 846        if (start_offset != *offset) {
 847                i915_reg_t oaheadptr;
 848
 849                oaheadptr = IS_GEN(stream->perf->i915, 12) ?
 850                            GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
 851
 852                spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 853
 854                /*
 855                 * We removed the gtt_offset for the copy loop above, indexing
 856                 * relative to oa_buf_base so put back here...
 857                 */
 858                head += gtt_offset;
 859                intel_uncore_write(uncore, oaheadptr,
 860                                   head & GEN12_OAG_OAHEADPTR_MASK);
 861                stream->oa_buffer.head = head;
 862
 863                spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 864        }
 865
 866        return ret;
 867}
 868
 869/**
 870 * gen8_oa_read - copy status records then buffered OA reports
 871 * @stream: An i915-perf stream opened for OA metrics
 872 * @buf: destination buffer given by userspace
 873 * @count: the number of bytes userspace wants to read
 874 * @offset: (inout): the current position for writing into @buf
 875 *
 876 * Checks OA unit status registers and if necessary appends corresponding
 877 * status records for userspace (such as for a buffer full condition) and then
 878 * initiates appending any buffered OA reports.
 879 *
 880 * Updates @offset according to the number of bytes successfully copied into
 881 * the userspace buffer.
 882 *
 883 * NB: some data may be successfully copied to the userspace buffer
 884 * even if an error is returned, and this is reflected in the
 885 * updated @offset.
 886 *
 887 * Returns: zero on success or a negative error code
 888 */
 889static int gen8_oa_read(struct i915_perf_stream *stream,
 890                        char __user *buf,
 891                        size_t count,
 892                        size_t *offset)
 893{
 894        struct intel_uncore *uncore = stream->uncore;
 895        u32 oastatus;
 896        i915_reg_t oastatus_reg;
 897        int ret;
 898
 899        if (WARN_ON(!stream->oa_buffer.vaddr))
 900                return -EIO;
 901
 902        oastatus_reg = IS_GEN(stream->perf->i915, 12) ?
 903                       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
 904
 905        oastatus = intel_uncore_read(uncore, oastatus_reg);
 906
 907        /*
 908         * We treat OABUFFER_OVERFLOW as a significant error:
 909         *
 910         * Although theoretically we could handle this more gracefully
 911         * sometimes, some Gens don't correctly suppress certain
 912         * automatically triggered reports in this condition and so we
 913         * have to assume that old reports are now being trampled
 914         * over.
 915         *
 916         * Considering that we don't currently give userspace control
 917         * over the OA buffer size and always configure a large 16MB
 918         * buffer, a buffer overflow likely indicates
 919         * that something has gone quite badly wrong.
 920         */
 921        if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
 922                ret = append_oa_status(stream, buf, count, offset,
 923                                       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
 924                if (ret)
 925                        return ret;
 926
 927                DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
 928                          stream->period_exponent);
 929
 930                stream->perf->ops.oa_disable(stream);
 931                stream->perf->ops.oa_enable(stream);
 932
 933                /*
 934                 * Note: .oa_enable() is expected to re-init the oabuffer and
 935                 * reset GEN8_OASTATUS for us
 936                 */
 937                oastatus = intel_uncore_read(uncore, oastatus_reg);
 938        }
 939
 940        if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
 941                ret = append_oa_status(stream, buf, count, offset,
 942                                       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
 943                if (ret)
 944                        return ret;
 945                intel_uncore_write(uncore, oastatus_reg,
 946                                   oastatus & ~GEN8_OASTATUS_REPORT_LOST);
 947        }
 948
 949        return gen8_append_oa_reports(stream, buf, count, offset);
 950}
 951
 952/**
 953 * gen7_append_oa_reports - Copies all buffered OA reports into userspace read() buffer.
 954 * @stream: An i915-perf stream opened for OA metrics
 955 * @buf: destination buffer given by userspace
 956 * @count: the number of bytes userspace wants to read
 957 * @offset: (inout): the current position for writing into @buf
 958 *
 959 * Notably any error condition resulting in a short read (-%ENOSPC or
 960 * -%EFAULT) will be returned even though one or more records may
 961 * have been successfully copied. In this case it's up to the caller
 962 * to decide if the error should be squashed before returning to
 963 * userspace.
 964 *
 965 * Note: reports are consumed from the head, and appended to the
 966 * tail, so the tail chases the head?... If you think that's mad
 967 * and back-to-front you're not alone, but this follows the
 968 * Gen PRM naming convention.
 969 *
 970 * Returns: 0 on success, negative error code on failure.
 971 */
 972static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 973                                  char __user *buf,
 974                                  size_t count,
 975                                  size_t *offset)
 976{
 977        struct intel_uncore *uncore = stream->uncore;
 978        int report_size = stream->oa_buffer.format_size;
 979        u8 *oa_buf_base = stream->oa_buffer.vaddr;
 980        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
 981        u32 mask = (OA_BUFFER_SIZE - 1);
 982        size_t start_offset = *offset;
 983        unsigned long flags;
 984        unsigned int aged_tail_idx;
 985        u32 head, tail;
 986        u32 taken;
 987        int ret = 0;
 988
 989        if (WARN_ON(!stream->enabled))
 990                return -EIO;
 991
 992        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 993
 994        head = stream->oa_buffer.head;
 995        aged_tail_idx = stream->oa_buffer.aged_tail_idx;
 996        tail = stream->oa_buffer.tails[aged_tail_idx].offset;
 997
 998        spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 999
1000        /* An invalid tail pointer here means we're still waiting for the poll
1001         * hrtimer callback to give us a pointer
1002         */
1003        if (tail == INVALID_TAIL_PTR)
1004                return -EAGAIN;
1005
1006        /* NB: oa_buffer.head/tail include the gtt_offset which we don't want
1007         * while indexing relative to oa_buf_base.
1008         */
1009        head -= gtt_offset;
1010        tail -= gtt_offset;
1011
1012        /* An out of bounds or misaligned head or tail pointer implies a driver
1013         * bug since we validate + align the tail pointers we read from the
1014         * hardware and we are in full control of the head pointer which should
1015         * only be incremented by multiples of the report size (notably also
1016         * all a power of two).
1017         */
1018        if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
1019                      tail > OA_BUFFER_SIZE || tail % report_size,
1020                      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
1021                      head, tail))
1022                return -EIO;
1023
1024
1025        for (/* none */;
1026             (taken = OA_TAKEN(tail, head));
1027             head = (head + report_size) & mask) {
1028                u8 *report = oa_buf_base + head;
1029                u32 *report32 = (void *)report;
1030
1031                /* All the report sizes factor neatly into the buffer
1032                 * size so we never expect to see a report split
1033                 * between the beginning and end of the buffer.
1034                 *
1035                 * Given the initial alignment check a misalignment
1036                 * here would imply a driver bug that would result
1037                 * in an overrun.
1038                 */
1039                if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
1040                        DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
1041                        break;
1042                }
1043
1044                /* The report-ID field for periodic samples includes
1045                 * some undocumented flags related to what triggered
1046                 * the report and is never expected to be zero so we
1047                 * can check that the report isn't invalid before
1048                 * copying it to userspace...
1049                 */
1050                if (report32[0] == 0) {
1051                        if (__ratelimit(&stream->perf->spurious_report_rs))
1052                                DRM_NOTE("Skipping spurious, invalid OA report\n");
1053                        continue;
1054                }
1055
1056                ret = append_oa_sample(stream, buf, count, offset, report);
1057                if (ret)
1058                        break;
1059
1060                /* The above report-id field sanity check is based on
1061                 * the assumption that the OA buffer is initially
1062                 * zeroed and we reset the field after copying so the
1063                 * check is still meaningful once old reports start
1064                 * being overwritten.
1065                 */
1066                report32[0] = 0;
1067        }
1068
1069        if (start_offset != *offset) {
1070                spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1071
1072                /* We removed the gtt_offset for the copy loop above, indexing
1073                 * relative to oa_buf_base so put back here...
1074                 */
1075                head += gtt_offset;
1076
1077                intel_uncore_write(uncore, GEN7_OASTATUS2,
1078                                   (head & GEN7_OASTATUS2_HEAD_MASK) |
1079                                   GEN7_OASTATUS2_MEM_SELECT_GGTT);
1080                stream->oa_buffer.head = head;
1081
1082                spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
1083        }
1084
1085        return ret;
1086}
1087
1088/**
1089 * gen7_oa_read - copy status records then buffered OA reports
1090 * @stream: An i915-perf stream opened for OA metrics
1091 * @buf: destination buffer given by userspace
1092 * @count: the number of bytes userspace wants to read
1093 * @offset: (inout): the current position for writing into @buf
1094 *
1095 * Checks Gen 7 specific OA unit status registers and if necessary appends
1096 * corresponding status records for userspace (such as for a buffer full
1097 * condition) and then initiates appending any buffered OA reports.
1098 *
1099 * Updates @offset according to the number of bytes successfully copied into
1100 * the userspace buffer.
1101 *
1102 * Returns: zero on success or a negative error code
1103 */
1104static int gen7_oa_read(struct i915_perf_stream *stream,
1105                        char __user *buf,
1106                        size_t count,
1107                        size_t *offset)
1108{
1109        struct intel_uncore *uncore = stream->uncore;
1110        u32 oastatus1;
1111        int ret;
1112
1113        if (WARN_ON(!stream->oa_buffer.vaddr))
1114                return -EIO;
1115
1116        oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
1117
1118        /* XXX: On Haswell we don't have a safe way to clear oastatus1
1119         * bits while the OA unit is enabled (while the tail pointer
1120         * may be updated asynchronously) so we ignore status bits
1121         * that have already been reported to userspace.
1122         */
1123        oastatus1 &= ~stream->perf->gen7_latched_oastatus1;
1124
1125        /* We treat OABUFFER_OVERFLOW as a significant error:
1126         *
1127         * - The status can be interpreted to mean that the buffer is
1128         *   currently full (with a higher precedence than OA_TAKEN()
1129         *   which will start to report a near-empty buffer after an
1130         *   overflow) but it's awkward that we can't clear the status
1131         *   on Haswell, so without a reset we won't be able to catch
1132         *   the state again.
1133         *
1134         * - Since it also implies the HW has started overwriting old
1135         *   reports it may also affect our sanity checks for invalid
1136         *   reports when copying to userspace that assume new reports
1137         *   are being written to cleared memory.
1138         *
1139         * - In the future we may want to introduce a flight recorder
1140         *   mode where the driver will automatically maintain a safe
1141         *   guard band between head/tail, avoiding this overflow
1142         *   condition, but we avoid the added driver complexity for
1143         *   now.
1144         */
1145        if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
1146                ret = append_oa_status(stream, buf, count, offset,
1147                                       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
1148                if (ret)
1149                        return ret;
1150
1151                DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
1152                          stream->period_exponent);
1153
1154                stream->perf->ops.oa_disable(stream);
1155                stream->perf->ops.oa_enable(stream);
1156
1157                oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
1158        }
1159
1160        if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
1161                ret = append_oa_status(stream, buf, count, offset,
1162                                       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
1163                if (ret)
1164                        return ret;
1165                stream->perf->gen7_latched_oastatus1 |=
1166                        GEN7_OASTATUS1_REPORT_LOST;
1167        }
1168
1169        return gen7_append_oa_reports(stream, buf, count, offset);
1170}
1171
1172/**
1173 * i915_oa_wait_unlocked - handles blocking IO until OA data available
1174 * @stream: An i915-perf stream opened for OA metrics
1175 *
1176 * Called when userspace tries to read() from a blocking stream FD opened
1177 * for OA metrics. It waits until the hrtimer callback finds a non-empty
1178 * OA buffer and wakes us.
1179 *
1180 * Note: it's acceptable to have this return with some false positives
1181 * since any subsequent read handling will return -EAGAIN if there isn't
1182 * really data ready for userspace yet.
1183 *
1184 * Returns: zero on success or a negative error code
1185 */
1186static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
1187{
1188        /* We would wait indefinitely if periodic sampling is not enabled */
1189        if (!stream->periodic)
1190                return -EIO;
1191
1192        return wait_event_interruptible(stream->poll_wq,
1193                                        oa_buffer_check_unlocked(stream));
1194}
1195
1196/**
1197 * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
1198 * @stream: An i915-perf stream opened for OA metrics
1199 * @file: An i915 perf stream file
1200 * @wait: poll() state table
1201 *
1202 * For handling userspace polling on an i915 perf stream opened for OA metrics,
1203 * this starts a poll_wait with the wait queue that our hrtimer callback wakes
1204 * when it sees data ready to read in the circular OA buffer.
1205 */
1206static void i915_oa_poll_wait(struct i915_perf_stream *stream,
1207                              struct file *file,
1208                              poll_table *wait)
1209{
1210        poll_wait(file, &stream->poll_wq, wait);
1211}
1212
1213/**
1214 * i915_oa_read - just calls through to &i915_oa_ops->read
1215 * @stream: An i915-perf stream opened for OA metrics
1216 * @buf: destination buffer given by userspace
1217 * @count: the number of bytes userspace wants to read
1218 * @offset: (inout): the current position for writing into @buf
1219 *
1220 * Updates @offset according to the number of bytes successfully copied into
1221 * the userspace buffer.
1222 *
1223 * Returns: zero on success or a negative error code
1224 */
1225static int i915_oa_read(struct i915_perf_stream *stream,
1226                        char __user *buf,
1227                        size_t count,
1228                        size_t *offset)
1229{
1230        return stream->perf->ops.read(stream, buf, count, offset);
1231}
1232
1233static struct intel_context *oa_pin_context(struct i915_perf_stream *stream)
1234{
1235        struct i915_gem_engines_iter it;
1236        struct i915_gem_context *ctx = stream->ctx;
1237        struct intel_context *ce;
1238        int err;
1239
1240        for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
1241                if (ce->engine != stream->engine) /* first match! */
1242                        continue;
1243
1244                /*
1245                 * As the ID is the gtt offset of the context's vma we
1246                 * pin the vma to ensure the ID remains fixed.
1247                 */
1248                err = intel_context_pin(ce);
1249                if (err == 0) {
1250                        stream->pinned_ctx = ce;
1251                        break;
1252                }
1253        }
1254        i915_gem_context_unlock_engines(ctx);
1255
1256        return stream->pinned_ctx;
1257}
1258
1259/**
1260 * oa_get_render_ctx_id - determine and hold ctx hw id
1261 * @stream: An i915-perf stream opened for OA metrics
1262 *
1263 * Determine the render context hw id, and ensure it remains fixed for the
1264 * lifetime of the stream. This ensures that we don't have to worry about
1265 * updating the context ID in OACONTROL on the fly.
1266 *
1267 * Returns: zero on success or a negative error code
1268 */
1269static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
1270{
1271        struct intel_context *ce;
1272
1273        ce = oa_pin_context(stream);
1274        if (IS_ERR(ce))
1275                return PTR_ERR(ce);
1276
1277        switch (INTEL_GEN(ce->engine->i915)) {
1278        case 7: {
1279                /*
1280                 * On Haswell we don't do any post processing of the reports
1281                 * and don't need to use the mask.
1282                 */
1283                stream->specific_ctx_id = i915_ggtt_offset(ce->state);
1284                stream->specific_ctx_id_mask = 0;
1285                break;
1286        }
1287
1288        case 8:
1289        case 9:
1290        case 10:
1291                if (intel_engine_in_execlists_submission_mode(ce->engine)) {
1292                        stream->specific_ctx_id_mask =
1293                                (1U << GEN8_CTX_ID_WIDTH) - 1;
1294                        stream->specific_ctx_id = stream->specific_ctx_id_mask;
1295                } else {
1296                        /*
1297                         * When using GuC, the context descriptor we write in
1298                         * i915 is read by GuC and rewritten before it's
1299                         * actually written into the hardware. The LRCA is
1300                         * what is put into the context id field of the
1301                         * context descriptor by GuC. Because it's aligned to
1302                         * a page, the lower 12bits are always 0 and
1303                         * dropped by GuC. They won't be part of the context
1304                         * ID in the OA reports, so squash those lower bits.
1305                         */
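                        /*
                         * Illustration (example numbers only): a page
                         * aligned LRCA of 0x00012000 in the lower 32bits
                         * of the descriptor gives a ctx_id of 0x12 after
                         * the shift below.
                         */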
1306                        stream->specific_ctx_id =
1307                                lower_32_bits(ce->lrc_desc) >> 12;
1308
1309                        /*
1310                         * GuC uses the top bit to signal proxy submission, so
1311                         * ignore that bit.
1312                         */
1313                        stream->specific_ctx_id_mask =
1314                                (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
1315                }
1316                break;
1317
1318        case 11:
1319        case 12: {
1320                stream->specific_ctx_id_mask =
1321                        ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
1322                stream->specific_ctx_id = stream->specific_ctx_id_mask;
1323                break;
1324        }
1325
1326        default:
1327                MISSING_CASE(INTEL_GEN(ce->engine->i915));
1328        }
1329
1330        ce->tag = stream->specific_ctx_id_mask;
1331
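        /*
         * Summary of how these values get used: on gen8+ the report-append
         * path (gen8_append_oa_reports(), earlier in this file) attributes a
         * report to this stream when its ctx-id field ANDed with
         * specific_ctx_id_mask equals specific_ctx_id; on Haswell the OA unit
         * filters in hardware via OACONTROL instead.
         */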
1332        DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
1333                         stream->specific_ctx_id,
1334                         stream->specific_ctx_id_mask);
1335
1336        return 0;
1337}
1338
1339/**
1340 * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id releases hold
1341 * @stream: An i915-perf stream opened for OA metrics
1342 *
1343 * If anything was done in oa_get_render_ctx_id() to ensure the context HW ID
1344 * would remain valid for the lifetime of the stream, it can be undone here.
1345 */
1346static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
1347{
1348        struct intel_context *ce;
1349
1350        ce = fetch_and_zero(&stream->pinned_ctx);
1351        if (ce) {
1352                ce->tag = 0; /* recomputed on next submission after parking */
1353                intel_context_unpin(ce);
1354        }
1355
1356        stream->specific_ctx_id = INVALID_CTX_ID;
1357        stream->specific_ctx_id_mask = 0;
1358}
1359
1360static void
1361free_oa_buffer(struct i915_perf_stream *stream)
1362{
1363        i915_vma_unpin_and_release(&stream->oa_buffer.vma,
1364                                   I915_VMA_RELEASE_MAP);
1365
1366        stream->oa_buffer.vaddr = NULL;
1367}
1368
1369static void
1370free_oa_configs(struct i915_perf_stream *stream)
1371{
1372        struct i915_oa_config_bo *oa_bo, *tmp;
1373
1374        i915_oa_config_put(stream->oa_config);
1375        llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
1376                free_oa_config_bo(oa_bo);
1377}
1378
1379static void
1380free_noa_wait(struct i915_perf_stream *stream)
1381{
1382        i915_vma_unpin_and_release(&stream->noa_wait, 0);
1383}
1384
1385static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
1386{
1387        struct i915_perf *perf = stream->perf;
1388
1389        BUG_ON(stream != perf->exclusive_stream);
1390
1391        /*
1392         * Unset exclusive_stream first, it will be checked while disabling
1393         * the metric set on gen8+.
1394         */
1395        perf->exclusive_stream = NULL;
1396        perf->ops.disable_metric_set(stream);
1397
1398        free_oa_buffer(stream);
1399
1400        intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
1401        intel_engine_pm_put(stream->engine);
1402
1403        if (stream->ctx)
1404                oa_put_render_ctx_id(stream);
1405
1406        free_oa_configs(stream);
1407        free_noa_wait(stream);
1408
1409        if (perf->spurious_report_rs.missed) {
1410                DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
1411                         perf->spurious_report_rs.missed);
1412        }
1413}
1414
1415static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
1416{
1417        struct intel_uncore *uncore = stream->uncore;
1418        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
1419        unsigned long flags;
1420
1421        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1422
1423        /* Pre-DevBDW: OABUFFER must be set with counters off,
1424         * before OASTATUS1, but after OASTATUS2
1425         */
1426        intel_uncore_write(uncore, GEN7_OASTATUS2, /* head */
1427                           gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT);
1428        stream->oa_buffer.head = gtt_offset;
1429
1430        intel_uncore_write(uncore, GEN7_OABUFFER, gtt_offset);
1431
1432        intel_uncore_write(uncore, GEN7_OASTATUS1, /* tail */
1433                           gtt_offset | OABUFFER_SIZE_16M);
1434
1435        /* Mark that we need updated tail pointers to read from... */
1436        stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1437        stream->oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1438
1439        spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
1440
1441        /* On Haswell we have to track which OASTATUS1 flags we've
1442         * already seen since they can't be cleared while periodic
1443         * sampling is enabled.
1444         */
1445        stream->perf->gen7_latched_oastatus1 = 0;
1446
1447        /* NB: although the OA buffer will initially be allocated
1448         * zeroed via shmfs (and so this memset is redundant when
1449         * first allocating), we may re-init the OA buffer, either
1450         * when re-enabling a stream or in error/reset paths.
1451         *
1452         * The reason we clear the buffer for each re-init is for the
1453         * sanity check in gen7_append_oa_reports() that looks at the
1454         * report-id field to make sure it's non-zero which relies on
1455         * the assumption that new reports are being written to zeroed
1456         * memory...
1457         */
1458        memset(stream->oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
1459
1460        stream->pollin = false;
1461}
1462
1463static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
1464{
1465        struct intel_uncore *uncore = stream->uncore;
1466        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
1467        unsigned long flags;
1468
1469        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1470
1471        intel_uncore_write(uncore, GEN8_OASTATUS, 0);
1472        intel_uncore_write(uncore, GEN8_OAHEADPTR, gtt_offset);
1473        stream->oa_buffer.head = gtt_offset;
1474
1475        intel_uncore_write(uncore, GEN8_OABUFFER_UDW, 0);
1476
1477        /*
1478         * PRM says:
1479         *
1480         *  "This MMIO must be set before the OATAILPTR
1481         *  register and after the OAHEADPTR register. This is
1482         *  to enable proper functionality of the overflow
1483         *  bit."
1484         */
1485        intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
1486                   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
1487        intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
1488
1489        /* Mark that we need updated tail pointers to read from... */
1490        stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1491        stream->oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1492
1493        /*
1494         * Reset state used to recognise context switches, affecting which
1495         * reports we will forward to userspace while filtering for a single
1496         * context.
1497         */
1498        stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
1499
1500        spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
1501
1502        /*
1503         * NB: although the OA buffer will initially be allocated
1504         * zeroed via shmfs (and so this memset is redundant when
1505         * first allocating), we may re-init the OA buffer, either
1506         * when re-enabling a stream or in error/reset paths.
1507         *
1508         * The reason we clear the buffer for each re-init is for the
1509         * sanity check in gen8_append_oa_reports() that looks at the
1510         * reason field to make sure it's non-zero which relies on
1511         * the assumption that new reports are being written to zeroed
1512         * memory...
1513         */
1514        memset(stream->oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
1515
1516        stream->pollin = false;
1517}
1518
1519static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
1520{
1521        struct intel_uncore *uncore = stream->uncore;
1522        u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
1523        unsigned long flags;
1524
1525        spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1526
1527        intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
1528        intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
1529                           gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
1530        stream->oa_buffer.head = gtt_offset;
1531
1532        /*
1533         * PRM says:
1534         *
1535         *  "This MMIO must be set before the OATAILPTR
1536         *  register and after the OAHEADPTR register. This is
1537         *  to enable proper functionality of the overflow
1538         *  bit."
1539         */
1540        intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
1541                           OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
1542        intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
1543                           gtt_offset & GEN12_OAG_OATAILPTR_MASK);
1544
1545        /* Mark that we need updated tail pointers to read from... */
1546        stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
1547        stream->oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
1548
1549        /*
1550         * Reset state used to recognise context switches, affecting which
1551         * reports we will forward to userspace while filtering for a single
1552         * context.
1553         */
1554        stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
1555
1556        spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
1557
1558        /*
1559         * NB: although the OA buffer will initially be allocated
1560         * zeroed via shmfs (and so this memset is redundant when
1561         * first allocating), we may re-init the OA buffer, either
1562         * when re-enabling a stream or in error/reset paths.
1563         *
1564         * The reason we clear the buffer for each re-init is for the
1565         * sanity check in gen8_append_oa_reports() that looks at the
1566         * reason field to make sure it's non-zero which relies on
1567         * the assumption that new reports are being written to zeroed
1568         * memory...
1569         */
1570        memset(stream->oa_buffer.vaddr, 0,
1571               stream->oa_buffer.vma->size);
1572
1573        stream->pollin = false;
1574}
1575
1576static int alloc_oa_buffer(struct i915_perf_stream *stream)
1577{
1578        struct drm_i915_gem_object *bo;
1579        struct i915_vma *vma;
1580        int ret;
1581
1582        if (WARN_ON(stream->oa_buffer.vma))
1583                return -ENODEV;
1584
1585        BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
1586        BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
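        /*
         * Note: the size must be a power of two because read offsets are
         * wrapped with an (OA_BUFFER_SIZE - 1) mask, and the 128K-16M bounds
         * checked above reflect the sizes the hardware's OABUFFER size field
         * can be programmed with.
         */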
1587
1588        bo = i915_gem_object_create_shmem(stream->perf->i915, OA_BUFFER_SIZE);
1589        if (IS_ERR(bo)) {
1590                DRM_ERROR("Failed to allocate OA buffer\n");
1591                return PTR_ERR(bo);
1592        }
1593
1594        i915_gem_object_set_cache_coherency(bo, I915_CACHE_LLC);
1595
1596        /* PreHSW required 512K alignment, HSW requires 16M */
1597        vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
1598        if (IS_ERR(vma)) {
1599                ret = PTR_ERR(vma);
1600                goto err_unref;
1601        }
1602        stream->oa_buffer.vma = vma;
1603
1604        stream->oa_buffer.vaddr =
1605                i915_gem_object_pin_map(bo, I915_MAP_WB);
1606        if (IS_ERR(stream->oa_buffer.vaddr)) {
1607                ret = PTR_ERR(stream->oa_buffer.vaddr);
1608                goto err_unpin;
1609        }
1610
1611        return 0;
1612
1613err_unpin:
1614        __i915_vma_unpin(vma);
1615
1616err_unref:
1617        i915_gem_object_put(bo);
1618
1619        stream->oa_buffer.vaddr = NULL;
1620        stream->oa_buffer.vma = NULL;
1621
1622        return ret;
1623}
1624
1625static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
1626                                  bool save, i915_reg_t reg, u32 offset,
1627                                  u32 dword_count)
1628{
1629        u32 cmd;
1630        u32 d;
1631
1632        cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
1633        if (INTEL_GEN(stream->perf->i915) >= 8)
1634                cmd++;
1635
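        /*
         * On gen8+ the SRM/LRM variants take a 64bit address, so the length
         * field of the command is one larger (the cmd++ above) and the
         * trailing zero dword emitted below is the upper address half; on
         * older gens that extra zero is parsed as a harmless MI_NOOP.
         */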
1636        for (d = 0; d < dword_count; d++) {
1637                *cs++ = cmd;
1638                *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
1639                *cs++ = intel_gt_scratch_offset(stream->engine->gt,
1640                                                offset) + 4 * d;
1641                *cs++ = 0;
1642        }
1643
1644        return cs;
1645}
1646
1647static int alloc_noa_wait(struct i915_perf_stream *stream)
1648{
1649        struct drm_i915_private *i915 = stream->perf->i915;
1650        struct drm_i915_gem_object *bo;
1651        struct i915_vma *vma;
1652        const u64 delay_ticks = 0xffffffffffffffff -
1653                DIV64_U64_ROUND_UP(
1654                        atomic64_read(&stream->perf->noa_programming_delay) *
1655                        RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
1656                        1000000ull);
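        /*
         * Illustration (example numbers only): with a 12MHz CS timestamp
         * (cs_timestamp_frequency_khz == 12000) and a 500us programming
         * delay, the delay is 12000 * 500000 / 1000000 = 6000 ticks, so
         * delay_ticks == (1 << 64) - 1 - 6000; adding the measured elapsed
         * ticks to it carries exactly when the delay has expired, which is
         * what the MI_MATH/MI_PREDICATE sequence below tests.
         */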
1657        const u32 base = stream->engine->mmio_base;
1658#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
1659        u32 *batch, *ts0, *cs, *jump;
1660        int ret, i;
1661        enum {
1662                START_TS,
1663                NOW_TS,
1664                DELTA_TS,
1665                JUMP_PREDICATE,
1666                DELTA_TARGET,
1667                N_CS_GPR
1668        };
1669
1670        bo = i915_gem_object_create_internal(i915, 4096);
1671        if (IS_ERR(bo)) {
1672                DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
1673                return PTR_ERR(bo);
1674        }
1675
1676        /*
1677         * We pin in GGTT because multiple OA config BOs will jump into this
1678         * buffer, so its address needs to stay fixed for the lifetime of the
1679         * i915/perf stream.
1680         */
1681        vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 0, PIN_HIGH);
1682        if (IS_ERR(vma)) {
1683                ret = PTR_ERR(vma);
1684                goto err_unref;
1685        }
1686
1687        batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
1688        if (IS_ERR(batch)) {
1689                ret = PTR_ERR(batch);
1690                goto err_unpin;
1691        }
1692
1693        /* Save registers. */
1694        for (i = 0; i < N_CS_GPR; i++)
1695                cs = save_restore_register(
1696                        stream, cs, true /* save */, CS_GPR(i),
1697                        INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
1698        cs = save_restore_register(
1699                stream, cs, true /* save */, MI_PREDICATE_RESULT_1,
1700                INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
1701
1702        /* First timestamp snapshot location. */
1703        ts0 = cs;
1704
1705        /*
1706         * Initial snapshot of the timestamp register to implement the wait.
1707         * We work with 32bit values, so clear out the top 32bits of the
1708         * register because the ALU operates on 64bit values.
1709         */
1710        *cs++ = MI_LOAD_REGISTER_IMM(1);
1711        *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + 4;
1712        *cs++ = 0;
1713        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
1714        *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
1715        *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));
1716
1717        /*
1718         * This is the location we're going to jump back into until the
1719         * required amount of time has passed.
1720         */
1721        jump = cs;
1722
1723        /*
1724         * Take another snapshot of the timestamp register. Take care to
1725         * clear the top 32bits of CS_GPR(NOW_TS) as we're using it for
1726         * other operations below.
1727         */
1728        *cs++ = MI_LOAD_REGISTER_IMM(1);
1729        *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + 4;
1730        *cs++ = 0;
1731        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
1732        *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
1733        *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));
1734
1735        /*
1736         * Do a diff between the 2 timestamps and store the result back into
1737         * CS_GPR(1).
1738         */
1739        *cs++ = MI_MATH(5);
1740        *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
1741        *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
1742        *cs++ = MI_MATH_SUB;
1743        *cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
1744        *cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
1745
1746        /*
1747         * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
1748         * timestamp has rolled over the 32bits) into the predicate register
1749         * to be used for the predicated jump.
1750         */
1751        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
1752        *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
1753        *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
1754
1755        /* Restart from the beginning if we had timestamps roll over. */
1756        *cs++ = (INTEL_GEN(i915) < 8 ?
1757                 MI_BATCH_BUFFER_START :
1758                 MI_BATCH_BUFFER_START_GEN8) |
1759                MI_BATCH_PREDICATE;
1760        *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
1761        *cs++ = 0;
1762
1763        /*
1764         * Now take the diff between the two previous timestamps and add it to:
1765         *      ((1 << 64) - 1) - delay (expressed in timestamp ticks)
1766         *
1767         * When the Carry Flag contains 1 this means the elapsed time is
1768         * longer than the expected delay, and we can exit the wait loop.
1769         */
1770        *cs++ = MI_LOAD_REGISTER_IMM(2);
1771        *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
1772        *cs++ = lower_32_bits(delay_ticks);
1773        *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + 4;
1774        *cs++ = upper_32_bits(delay_ticks);
1775
1776        *cs++ = MI_MATH(4);
1777        *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
1778        *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
1779        *cs++ = MI_MATH_ADD;
1780        *cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
1781
1782        *cs++ = MI_ARB_CHECK;
1783
1784        /*
1785         * Transfer the result into the predicate register to be used for the
1786         * predicated jump.
1787         */
1788        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
1789        *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
1790        *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
1791
1792        /* Predicate the jump. */
1793        *cs++ = (INTEL_GEN(i915) < 8 ?
1794                 MI_BATCH_BUFFER_START :
1795                 MI_BATCH_BUFFER_START_GEN8) |
1796                MI_BATCH_PREDICATE;
1797        *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
1798        *cs++ = 0;
1799
1800        /* Restore registers. */
1801        for (i = 0; i < N_CS_GPR; i++)
1802                cs = save_restore_register(
1803                        stream, cs, false /* restore */, CS_GPR(i),
1804                        INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
1805        cs = save_restore_register(
1806                stream, cs, false /* restore */, MI_PREDICATE_RESULT_1,
1807                INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
1808
1809        /* And return to the ring. */
1810        *cs++ = MI_BATCH_BUFFER_END;
1811
1812        GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));
1813
1814        i915_gem_object_flush_map(bo);
1815        i915_gem_object_unpin_map(bo);
1816
1817        stream->noa_wait = vma;
1818        return 0;
1819
1820err_unpin:
1821        i915_vma_unpin_and_release(&vma, 0);
1822err_unref:
1823        i915_gem_object_put(bo);
1824        return ret;
1825}
1826
1827static u32 *write_cs_mi_lri(u32 *cs,
1828                            const struct i915_oa_reg *reg_data,
1829                            u32 n_regs)
1830{
1831        u32 i;
1832
1833        for (i = 0; i < n_regs; i++) {
1834                if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
1835                        u32 n_lri = min_t(u32,
1836                                          n_regs - i,
1837                                          MI_LOAD_REGISTER_IMM_MAX_REGS);
1838
1839                        *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
1840                }
1841                *cs++ = i915_mmio_reg_offset(reg_data[i].addr);
1842                *cs++ = reg_data[i].value;
1843        }
1844
1845        return cs;
1846}
1847
1848static int num_lri_dwords(int num_regs)
1849{
1850        int count = 0;
1851
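        /*
         * For example (assuming MI_LOAD_REGISTER_IMM_MAX_REGS is 126): 200
         * registers need DIV_ROUND_UP(200, 126) = 2 MI_LOAD_REGISTER_IMM
         * headers plus 200 * 2 = 400 (offset, value) payload dwords, i.e.
         * 402 dwords in total.
         */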
1852        if (num_regs > 0) {
1853                count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
1854                count += num_regs * 2;
1855        }
1856
1857        return count;
1858}
1859
1860static struct i915_oa_config_bo *
1861alloc_oa_config_buffer(struct i915_perf_stream *stream,
1862                       struct i915_oa_config *oa_config)
1863{
1864        struct drm_i915_gem_object *obj;
1865        struct i915_oa_config_bo *oa_bo;
1866        size_t config_length = 0;
1867        u32 *cs;
1868        int err;
1869
1870        oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
1871        if (!oa_bo)
1872                return ERR_PTR(-ENOMEM);
1873
1874        config_length += num_lri_dwords(oa_config->mux_regs_len);
1875        config_length += num_lri_dwords(oa_config->b_counter_regs_len);
1876        config_length += num_lri_dwords(oa_config->flex_regs_len);
1877        config_length += 3; /* MI_BATCH_BUFFER_START */
1878        config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
1879
1880        obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
1881        if (IS_ERR(obj)) {
1882                err = PTR_ERR(obj);
1883                goto err_free;
1884        }
1885
1886        cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
1887        if (IS_ERR(cs)) {
1888                err = PTR_ERR(cs);
1889                goto err_oa_bo;
1890        }
1891
1892        cs = write_cs_mi_lri(cs,
1893                             oa_config->mux_regs,
1894                             oa_config->mux_regs_len);
1895        cs = write_cs_mi_lri(cs,
1896                             oa_config->b_counter_regs,
1897                             oa_config->b_counter_regs_len);
1898        cs = write_cs_mi_lri(cs,
1899                             oa_config->flex_regs,
1900                             oa_config->flex_regs_len);
1901
1902        /* Jump into the active wait. */
1903        *cs++ = (INTEL_GEN(stream->perf->i915) < 8 ?
1904                 MI_BATCH_BUFFER_START :
1905                 MI_BATCH_BUFFER_START_GEN8);
1906        *cs++ = i915_ggtt_offset(stream->noa_wait);
1907        *cs++ = 0;
1908
1909        i915_gem_object_flush_map(obj);
1910        i915_gem_object_unpin_map(obj);
1911
1912        oa_bo->vma = i915_vma_instance(obj,
1913                                       &stream->engine->gt->ggtt->vm,
1914                                       NULL);
1915        if (IS_ERR(oa_bo->vma)) {
1916                err = PTR_ERR(oa_bo->vma);
1917                goto err_oa_bo;
1918        }
1919
1920        oa_bo->oa_config = i915_oa_config_get(oa_config);
1921        llist_add(&oa_bo->node, &stream->oa_config_bos);
1922
1923        return oa_bo;
1924
1925err_oa_bo:
1926        i915_gem_object_put(obj);
1927err_free:
1928        kfree(oa_bo);
1929        return ERR_PTR(err);
1930}
1931
1932static struct i915_vma *
1933get_oa_vma(struct i915_perf_stream *stream, struct i915_oa_config *oa_config)
1934{
1935        struct i915_oa_config_bo *oa_bo;
1936
1937        /*
1938         * Look for the buffer in the already allocated BOs attached
1939         * to the stream.
1940         */
1941        llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
1942                if (oa_bo->oa_config == oa_config &&
1943                    memcmp(oa_bo->oa_config->uuid,
1944                           oa_config->uuid,
1945                           sizeof(oa_config->uuid)) == 0)
1946                        goto out;
1947        }
1948
1949        oa_bo = alloc_oa_config_buffer(stream, oa_config);
1950        if (IS_ERR(oa_bo))
1951                return ERR_CAST(oa_bo);
1952
1953out:
1954        return i915_vma_get(oa_bo->vma);
1955}
1956
1957static struct i915_request *
1958emit_oa_config(struct i915_perf_stream *stream,
1959               struct i915_oa_config *oa_config,
1960               struct intel_context *ce)
1961{
1962        struct i915_request *rq;
1963        struct i915_vma *vma;
1964        int err;
1965
1966        vma = get_oa_vma(stream, oa_config);
1967        if (IS_ERR(vma))
1968                return ERR_CAST(vma);
1969
1970        err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1971        if (err)
1972                goto err_vma_put;
1973
1974        intel_engine_pm_get(ce->engine);
1975        rq = i915_request_create(ce);
1976        intel_engine_pm_put(ce->engine);
1977        if (IS_ERR(rq)) {
1978                err = PTR_ERR(rq);
1979                goto err_vma_unpin;
1980        }
1981
1982        i915_vma_lock(vma);
1983        err = i915_request_await_object(rq, vma->obj, 0);
1984        if (!err)
1985                err = i915_vma_move_to_active(vma, rq, 0);
1986        i915_vma_unlock(vma);
1987        if (err)
1988                goto err_add_request;
1989
1990        err = rq->engine->emit_bb_start(rq,
1991                                        vma->node.start, 0,
1992                                        I915_DISPATCH_SECURE);
1993        if (err)
1994                goto err_add_request;
1995
1996        i915_request_get(rq);
1997err_add_request:
1998        i915_request_add(rq);
1999err_vma_unpin:
2000        i915_vma_unpin(vma);
2001err_vma_put:
2002        i915_vma_put(vma);
2003        return err ? ERR_PTR(err) : rq;
2004}
2005
2006static struct intel_context *oa_context(struct i915_perf_stream *stream)
2007{
2008        return stream->pinned_ctx ?: stream->engine->kernel_context;
2009}
2010
2011static struct i915_request *
2012hsw_enable_metric_set(struct i915_perf_stream *stream)
2013{
2014        struct intel_uncore *uncore = stream->uncore;
2015
2016        /*
2017         * PRM:
2018         *
2019         * OA unit is using “crclk” for its functionality. When trunk
2020         * level clock gating takes place, OA clock would be gated,
2021         * unable to count the events from non-render clock domain.
2022         * Render clock gating must be disabled when OA is enabled to
2023         * count the events from non-render domain. Unit level clock
2024         * gating for RCS should also be disabled.
2025         */
2026        intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
2027                         GEN7_DOP_CLOCK_GATE_ENABLE, 0);
2028        intel_uncore_rmw(uncore, GEN6_UCGCTL1,
2029                         0, GEN6_CSUNIT_CLOCK_GATE_DISABLE);
2030
2031        return emit_oa_config(stream, stream->oa_config, oa_context(stream));
2032}
2033
2034static void hsw_disable_metric_set(struct i915_perf_stream *stream)
2035{
2036        struct intel_uncore *uncore = stream->uncore;
2037
2038        intel_uncore_rmw(uncore, GEN6_UCGCTL1,
2039                         GEN6_CSUNIT_CLOCK_GATE_DISABLE, 0);
2040        intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
2041                         0, GEN7_DOP_CLOCK_GATE_ENABLE);
2042
2043        intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
2044}
2045
2046static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
2047                              i915_reg_t reg)
2048{
2049        u32 mmio = i915_mmio_reg_offset(reg);
2050        int i;
2051
2052        /*
2053         * This arbitrary default will select the 'EU FPU0 Pipeline
2054         * Active' event. In the future it's anticipated that there
2055         * will be an explicit 'No Event' we can select, but not yet...
2056         */
2057        if (!oa_config)
2058                return 0;
2059
2060        for (i = 0; i < oa_config->flex_regs_len; i++) {
2061                if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio)
2062                        return oa_config->flex_regs[i].value;
2063        }
2064
2065        return 0;
2066}
2067/*
2068 * NB: It must always remain pointer safe to run this even if the OA unit
2069 * has been disabled.
2070 *
2071 * It's fine to put out-of-date values into these per-context registers
2072 * in the case that the OA unit has been disabled.
2073 */
2074static void
2075gen8_update_reg_state_unlocked(const struct intel_context *ce,
2076                               const struct i915_perf_stream *stream)
2077{
2078        u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
2079        u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
2080        /* The MMIO offsets for Flex EU registers aren't contiguous */
2081        i915_reg_t flex_regs[] = {
2082                EU_PERF_CNTL0,
2083                EU_PERF_CNTL1,
2084                EU_PERF_CNTL2,
2085                EU_PERF_CNTL3,
2086                EU_PERF_CNTL4,
2087                EU_PERF_CNTL5,
2088                EU_PERF_CNTL6,
2089        };
2090        u32 *reg_state = ce->lrc_reg_state;
2091        int i;
2092
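        /*
         * The register state is laid out as LRI-style (MMIO offset, value)
         * dword pairs; ctx_oactxctrl/ctx_flexeu0 index the offset dword of
         * each pair, hence the "+ 1" (and "* 2 + 1") below to land on the
         * value dword (CTX_R_PWR_CLK_STATE already includes that + 1).
         */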
2093        reg_state[ctx_oactxctrl + 1] =
2094                (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
2095                (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
2096                GEN8_OA_COUNTER_RESUME;
2097
2098        for (i = 0; i < ARRAY_SIZE(flex_regs); i++)
2099                reg_state[ctx_flexeu0 + i * 2 + 1] =
2100                        oa_config_flex_reg(stream->oa_config, flex_regs[i]);
2101
2102        reg_state[CTX_R_PWR_CLK_STATE] =
2103                intel_sseu_make_rpcs(ce->engine->i915, &ce->sseu);
2104}
2105
2106struct flex {
2107        i915_reg_t reg;
2108        u32 offset;
2109        u32 value;
2110};
2111
2112static int
2113gen8_store_flex(struct i915_request *rq,
2114                struct intel_context *ce,
2115                const struct flex *flex, unsigned int count)
2116{
2117        u32 offset;
2118        u32 *cs;
2119
2120        cs = intel_ring_begin(rq, 4 * count);
2121        if (IS_ERR(cs))
2122                return PTR_ERR(cs);
2123
2124        offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE;
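        /*
         * Write the new values straight into the saved register state (the
         * LRC_STATE page of the target context's image) with
         * MI_STORE_DWORD_IMM, so the update also reaches contexts that
         * aren't currently running; gen8_load_flex() below instead programs
         * the registers of the executing context with LRIs.
         */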
2125        do {
2126                *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2127                *cs++ = offset + flex->offset * sizeof(u32);
2128                *cs++ = 0;
2129                *cs++ = flex->value;
2130        } while (flex++, --count);
2131
2132        intel_ring_advance(rq, cs);
2133
2134        return 0;
2135}
2136
2137static int
2138gen8_load_flex(struct i915_request *rq,
2139               struct intel_context *ce,
2140               const struct flex *flex, unsigned int count)
2141{
2142        u32 *cs;
2143
2144        GEM_BUG_ON(!count || count > 63);
2145
2146        cs = intel_ring_begin(rq, 2 * count + 2);
2147        if (IS_ERR(cs))
2148                return PTR_ERR(cs);
2149
2150        *cs++ = MI_LOAD_REGISTER_IMM(count);
2151        do {
2152                *cs++ = i915_mmio_reg_offset(flex->reg);
2153                *cs++ = flex->value;
2154        } while (flex++, --count);
2155        *cs++ = MI_NOOP;
2156
2157        intel_ring_advance(rq, cs);
2158
2159        return 0;
2160}
2161
2162static int gen8_modify_context(struct intel_context *ce,
2163                               const struct flex *flex, unsigned int count)
2164{
2165        struct i915_request *rq;
2166        int err;
2167
2168        rq = intel_engine_create_kernel_request(ce->engine);
2169        if (IS_ERR(rq))
2170                return PTR_ERR(rq);
2171
2172        /* Serialise with the remote context */
2173        err = intel_context_prepare_remote_request(ce, rq);
2174        if (err == 0)
2175                err = gen8_store_flex(rq, ce, flex, count);
2176
2177        i915_request_add(rq);
2178        return err;
2179}
2180
2181static int gen8_modify_self(struct intel_context *ce,
2182                            const struct flex *flex, unsigned int count)
2183{
2184        struct i915_request *rq;
2185        int err;
2186
2187        rq = i915_request_create(ce);
2188        if (IS_ERR(rq))
2189                return PTR_ERR(rq);
2190
2191        err = gen8_load_flex(rq, ce, flex, count);
2192
2193        i915_request_add(rq);
2194        return err;
2195}
2196
2197static int gen8_configure_context(struct i915_gem_context *ctx,
2198                                  struct flex *flex, unsigned int count)
2199{
2200        struct i915_gem_engines_iter it;
2201        struct intel_context *ce;
2202        int err = 0;
2203
2204        for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
2205                GEM_BUG_ON(ce == ce->engine->kernel_context);
2206
2207                if (ce->engine->class != RENDER_CLASS)
2208                        continue;
2209
2210                /* Otherwise OA settings will be set upon first use */
2211                if (!intel_context_pin_if_active(ce))
2212                        continue;
2213
2214                flex->value = intel_sseu_make_rpcs(ctx->i915, &ce->sseu);
2215                err = gen8_modify_context(ce, flex, count);
2216
2217                intel_context_unpin(ce);
2218                if (err)
2219                        break;
2220        }
2221        i915_gem_context_unlock_engines(ctx);
2222
2223        return err;
2224}
2225
2226static int gen12_configure_oar_context(struct i915_perf_stream *stream, bool enable)
2227{
2228        int err;
2229        struct intel_context *ce = stream->pinned_ctx;
2230        u32 format = stream->oa_buffer.format;
2231        struct flex regs_context[] = {
2232                {
2233                        GEN8_OACTXCONTROL,
2234                        stream->perf->ctx_oactxctrl_offset + 1,
2235                        enable ? GEN8_OA_COUNTER_RESUME : 0,
2236                },
2237        };
2238        /* Offsets in regs_lri are not used since this configuration is only
2239         * applied using LRI. Initialize the correct offsets for posterity.
2240         */
2241#define GEN12_OAR_OACONTROL_OFFSET 0x5B0
2242        struct flex regs_lri[] = {
2243                {
2244                        GEN12_OAR_OACONTROL,
2245                        GEN12_OAR_OACONTROL_OFFSET + 1,
2246                        (format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
2247                        (enable ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0)
2248                },
2249                {
2250                        RING_CONTEXT_CONTROL(ce->engine->mmio_base),
2251                        CTX_CONTEXT_CONTROL,
2252                        _MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE,
2253                                      enable ?
2254                                      GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE :
2255                                      0)
2256                },
2257        };
2258
2259        /* Modify the context image of the pinned context with regs_context */
2260        err = intel_context_lock_pinned(ce);
2261        if (err)
2262                return err;
2263
2264        err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context));
2265        intel_context_unlock_pinned(ce);
2266        if (err)
2267                return err;
2268
2269        /* Apply regs_lri using LRI with pinned context */
2270        return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri));
2271}
2272
2273/*
2274 * Manages updating the per-context aspects of the OA stream
2275 * configuration across all contexts.
2276 *
2277 * The awkward consideration here is that OACTXCONTROL controls the
2278 * exponent for periodic sampling which is primarily used for system
2279 * wide profiling where we'd like a consistent sampling period even in
2280 * the face of context switches.
2281 *
2282 * Our approach of updating the register state context (as opposed to
2283 * say using a workaround batch buffer) ensures that the hardware
2284 * won't automatically reload an out-of-date timer exponent even
2285 * transiently before a WA BB could be parsed.
2286 *
2287 * This function needs to:
2288 * - Ensure the currently running context's per-context OA state is
2289 *   updated
2290 * - Ensure that all existing contexts will have the correct per-context
2291 *   OA state if they are scheduled for use.
2292 * - Ensure any new contexts will be initialized with the correct
2293 *   per-context OA state.
2294 *
2295 * Note: it's only the RCS/Render context that has any OA state.
2296 * Note: the first flex register passed must always be R_PWR_CLK_STATE
2297 */
2298static int oa_configure_all_contexts(struct i915_perf_stream *stream,
2299                                     struct flex *regs,
2300                                     size_t num_regs)
2301{
2302        struct drm_i915_private *i915 = stream->perf->i915;
2303        struct intel_engine_cs *engine;
2304        struct i915_gem_context *ctx, *cn;
2305        int err;
2306
2307        lockdep_assert_held(&stream->perf->lock);
2308
2309        /*
2310         * The OA register config is set up through the context image. This image
2311         * might be written to by the GPU on context switch (in particular on
2312         * lite-restore). This means we can't safely update a context's image
2313         * if this context is scheduled/submitted to run on the GPU.
2314         *
2315         * We could emit the OA register config through the batch buffer but
2316         * this might leave a small interval of time where the OA unit is
2317         * configured at an invalid sampling period.
2318         *
2319         * Note that since we emit all requests from a single ring, there
2320         * is still an implicit global barrier here that may cause a high
2321         * priority context to wait for an otherwise independent low priority
2322         * context. Contexts idle at the time of reconfiguration are not
2323         * trapped behind the barrier.
2324         */
2325        spin_lock(&i915->gem.contexts.lock);
2326        list_for_each_entry_safe(ctx, cn, &i915->gem.contexts.list, link) {
2327                if (!kref_get_unless_zero(&ctx->ref))
2328                        continue;
2329
2330                spin_unlock(&i915->gem.contexts.lock);
2331
2332                err = gen8_configure_context(ctx, regs, num_regs);
2333                if (err) {
2334                        i915_gem_context_put(ctx);
2335                        return err;
2336                }
2337
2338                spin_lock(&i915->gem.contexts.lock);
2339                list_safe_reset_next(ctx, cn, link);
2340                i915_gem_context_put(ctx);
2341        }
2342        spin_unlock(&i915->gem.contexts.lock);
2343
2344        /*
2345         * After updating all other contexts, we need to modify ourselves.
2346         * If we don't modify the kernel_context, we do not get events while
2347         * idle.
2348         */
2349        for_each_uabi_engine(engine, i915) {
2350                struct intel_context *ce = engine->kernel_context;
2351
2352                if (engine->class != RENDER_CLASS)
2353                        continue;
2354
2355                regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu);
2356
2357                err = gen8_modify_self(ce, regs, num_regs);
2358                if (err)
2359                        return err;
2360        }
2361
2362        return 0;
2363}
2364
2365static int gen12_configure_all_contexts(struct i915_perf_stream *stream,
2366                                        const struct i915_oa_config *oa_config)
2367{
2368        struct flex regs[] = {
2369                {
2370                        GEN8_R_PWR_CLK_STATE,
2371                        CTX_R_PWR_CLK_STATE,
2372                },
2373        };
2374
2375        return oa_configure_all_contexts(stream, regs, ARRAY_SIZE(regs));
2376}
2377
2378static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
2379                                      const struct i915_oa_config *oa_config)
2380{
2381        /* The MMIO offsets for Flex EU registers aren't contiguous */
2382        const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
2383#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
2384        struct flex regs[] = {
2385                {
2386                        GEN8_R_PWR_CLK_STATE,
2387                        CTX_R_PWR_CLK_STATE,
2388                },
2389                {
2390                        GEN8_OACTXCONTROL,
2391                        stream->perf->ctx_oactxctrl_offset + 1,
2392                },
2393                { EU_PERF_CNTL0, ctx_flexeuN(0) },
2394                { EU_PERF_CNTL1, ctx_flexeuN(1) },
2395                { EU_PERF_CNTL2, ctx_flexeuN(2) },
2396                { EU_PERF_CNTL3, ctx_flexeuN(3) },
2397                { EU_PERF_CNTL4, ctx_flexeuN(4) },
2398                { EU_PERF_CNTL5, ctx_flexeuN(5) },
2399                { EU_PERF_CNTL6, ctx_flexeuN(6) },
2400        };
2401#undef ctx_flexeuN
2402        int i;
2403
2404        regs[1].value =
2405                (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
2406                (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
2407                GEN8_OA_COUNTER_RESUME;
2408
2409        for (i = 2; i < ARRAY_SIZE(regs); i++)
2410                regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
2411
2412        return oa_configure_all_contexts(stream, regs, ARRAY_SIZE(regs));
2413}
2414
2415static struct i915_request *
2416gen8_enable_metric_set(struct i915_perf_stream *stream)
2417{
2418        struct intel_uncore *uncore = stream->uncore;
2419        struct i915_oa_config *oa_config = stream->oa_config;
2420        int ret;
2421
2422        /*
2423         * We disable slice/unslice clock ratio change reports on SKL since
2424         * they are too noisy. The HW generates a lot of redundant reports
2425         * where the ratio hasn't really changed, causing a lot of redundant
2426         * work for processes and increasing the chances we'll hit buffer
2427         * overruns.
2428         *
2429         * Although we don't currently use the 'disable overrun' OABUFFER
2430         * feature it's worth noting that clock ratio reports have to be
2431         * disabled before considering use of that feature since the HW doesn't
2432         * correctly block these reports.
2433         *
2434         * Currently none of the high-level metrics we have depend on knowing
2435         * this ratio to normalize.
2436         *
2437         * Note: This register is not power context saved and restored, but
2438         * that's OK considering that we disable RC6 while the OA unit is
2439         * enabled.
2440         *
2441         * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
2442         * be read back from automatically triggered reports, as part of the
2443         * RPT_ID field.
2444         */
2445        if (IS_GEN_RANGE(stream->perf->i915, 9, 11)) {
2446                intel_uncore_write(uncore, GEN8_OA_DEBUG,
2447                                   _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
2448                                                      GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
2449        }
2450
2451        /*
2452         * Update all contexts prior to writing the mux configurations as we need
2453         * to make sure all slices/subslices are ON before writing to NOA
2454         * registers.
2455         */
2456        ret = lrc_configure_all_contexts(stream, oa_config);
2457        if (ret)
2458                return ERR_PTR(ret);
2459
2460        return emit_oa_config(stream, oa_config, oa_context(stream));
2461}
2462
2463static u32 oag_report_ctx_switches(const struct i915_perf_stream *stream)
2464{
2465        return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS,
2466                             (stream->sample_flags & SAMPLE_OA_REPORT) ?
2467                             0 : GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS);
2468}
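
/*
 * GEN12_OAG_OA_DEBUG is written with masked-bit semantics: the upper 16 bits
 * of the value select which of the lower 16 bits take effect, so the helper
 * above (and the _MASKED_BIT_ENABLE() write below) can flip individual bits
 * without disturbing the rest of the register.
 */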
2469
2470static struct i915_request *
2471gen12_enable_metric_set(struct i915_perf_stream *stream)
2472{
2473        struct intel_uncore *uncore = stream->uncore;
2474        struct i915_oa_config *oa_config = stream->oa_config;
2475        bool periodic = stream->periodic;
2476        u32 period_exponent = stream->period_exponent;
2477        int ret;
2478
2479        intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
2480                           /* Disable clk ratio reports, like previous Gens. */
2481                           _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
2482                                              GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
2483                           /*
2484                            * If the user didn't require OA reports, instruct
2485                            * the hardware not to emit ctx switch reports.
2486                            */
2487                           oag_report_ctx_switches(stream));
2488
2489        intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
2490                           (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
2491                            GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
2492                            (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
2493                            : 0);
2494
2495        /*
2496         * Update all contexts prior to writing the mux configurations as we need
2497         * to make sure all slices/subslices are ON before writing to NOA
2498         * registers.
2499         */
2500        ret = gen12_configure_all_contexts(stream, oa_config);
2501        if (ret)
2502                return ERR_PTR(ret);
2503
2504        /*
2505         * For Gen12, performance counters are context
2506         * saved/restored. Only enable it for the context that
2507         * requested this.
2508         */
2509        if (stream->ctx) {
2510                ret = gen12_configure_oar_context(stream, true);
2511                if (ret)
2512                        return ERR_PTR(ret);
2513        }
2514
2515        return emit_oa_config(stream, oa_config, oa_context(stream));
2516}
2517
2518static void gen8_disable_metric_set(struct i915_perf_stream *stream)
2519{
2520        struct intel_uncore *uncore = stream->uncore;
2521
2522        /* Reset all contexts' slices/subslices configurations. */
2523        lrc_configure_all_contexts(stream, NULL);
2524
2525        intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
2526}
2527
2528static void gen10_disable_metric_set(struct i915_perf_stream *stream)
2529{
2530        struct intel_uncore *uncore = stream->uncore;
2531
2532        /* Reset all contexts' slices/subslices configurations. */
2533        lrc_configure_all_contexts(stream, NULL);
2534
2535        /* Make sure we disable noa to save power. */
2536        intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
2537}
2538
2539static void gen12_disable_metric_set(struct i915_perf_stream *stream)
2540{
2541        struct intel_uncore *uncore = stream->uncore;
2542
2543        /* Reset all contexts' slices/subslices configurations. */
2544        gen12_configure_all_contexts(stream, NULL);
2545
2546        /* disable the context save/restore of OAR counters */
2547        if (stream->ctx)
2548                gen12_configure_oar_context(stream, false);
2549
2550        /* Make sure we disable noa to save power. */
2551        intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
2552}
2553
2554static void gen7_oa_enable(struct i915_perf_stream *stream)
2555{
2556        struct intel_uncore *uncore = stream->uncore;
2557        struct i915_gem_context *ctx = stream->ctx;
2558        u32 ctx_id = stream->specific_ctx_id;
2559        bool periodic = stream->periodic;
2560        u32 period_exponent = stream->period_exponent;
2561        u32 report_format = stream->oa_buffer.format;
2562
2563        /*
2564         * Reset buf pointers so we don't forward reports from before now.
2565         *
2566         * Think carefully if considering trying to avoid this, since it
2567         * also ensures status flags and the buffer itself are cleared
2568         * in error paths, and we have checks for invalid reports based
2569         * on the assumption that certain fields are written to zeroed
2570         * memory, which this helps maintain.
2571         */
2572        gen7_init_oa_buffer(stream);
2573
2574        intel_uncore_write(uncore, GEN7_OACONTROL,
2575                           (ctx_id & GEN7_OACONTROL_CTX_MASK) |
2576                           (period_exponent <<
2577                            GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
2578                           (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) |
2579                           (report_format << GEN7_OACONTROL_FORMAT_SHIFT) |
2580                           (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
2581                           GEN7_OACONTROL_ENABLE);
2582}
2583
2584static void gen8_oa_enable(struct i915_perf_stream *stream)
2585{
2586        struct intel_uncore *uncore = stream->uncore;
2587        u32 report_format = stream->oa_buffer.format;
2588
2589        /*
2590         * Reset buf pointers so we don't forward reports from before now.
2591         *
2592         * Think carefully if considering trying to avoid this, since it
2593         * also ensures status flags and the buffer itself are cleared
2594         * in error paths, and we have checks for invalid reports based
2595         * on the assumption that certain fields are written to zeroed
2596         * memory, which this helps maintain.
2597         */
2598        gen8_init_oa_buffer(stream);
2599
2600        /*
2601         * Note: we don't rely on the hardware to perform single context
2602         * filtering and instead filter on the cpu based on the context-id
2603         * field of reports
2604         */
2605        intel_uncore_write(uncore, GEN8_OACONTROL,
2606                           (report_format << GEN8_OA_REPORT_FORMAT_SHIFT) |
2607                           GEN8_OA_COUNTER_ENABLE);
2608}
2609
2610static void gen12_oa_enable(struct i915_perf_stream *stream)
2611{
2612        struct intel_uncore *uncore = stream->uncore;
2613        u32 report_format = stream->oa_buffer.format;
2614
2615        /*
2616         * If we don't want OA reports from the OA buffer, then we don't even
2617         * need to program the OAG unit.
2618         */
2619        if (!(stream->sample_flags & SAMPLE_OA_REPORT))
2620                return;
2621
2622        gen12_init_oa_buffer(stream);
2623
2624        intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
2625                           (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
2626                           GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
2627}
2628
2629/**
2630 * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
2631 * @stream: An i915 perf stream opened for OA metrics
2632 *
2633 * [Re]enables hardware periodic sampling according to the period configured
2634 * when opening the stream. This also starts a hrtimer that will periodically
2635 * check for data in the circular OA buffer for notifying userspace (e.g.
2636 * during a read() or poll()).
2637 */
2638static void i915_oa_stream_enable(struct i915_perf_stream *stream)
2639{
2640        stream->perf->ops.oa_enable(stream);
2641
2642        if (stream->periodic)
2643                hrtimer_start(&stream->poll_check_timer,
2644                              ns_to_ktime(POLL_PERIOD),
2645                              HRTIMER_MODE_REL_PINNED);
2646}
2647
2648static void gen7_oa_disable(struct i915_perf_stream *stream)
2649{
2650        struct intel_uncore *uncore = stream->uncore;
2651
2652        intel_uncore_write(uncore, GEN7_OACONTROL, 0);
2653        if (intel_wait_for_register(uncore,
2654                                    GEN7_OACONTROL, GEN7_OACONTROL_ENABLE, 0,
2655                                    50))
2656                DRM_ERROR("wait for OA to be disabled timed out\n");
2657}
2658
2659static void gen8_oa_disable(struct i915_perf_stream *stream)
2660{
2661        struct intel_uncore *uncore = stream->uncore;
2662
2663        intel_uncore_write(uncore, GEN8_OACONTROL, 0);
2664        if (intel_wait_for_register(uncore,
2665                                    GEN8_OACONTROL, GEN8_OA_COUNTER_ENABLE, 0,
2666                                    50))
2667                DRM_ERROR("wait for OA to be disabled timed out\n");
2668}
2669
2670static void gen12_oa_disable(struct i915_perf_stream *stream)
2671{
2672        struct intel_uncore *uncore = stream->uncore;
2673
2674        intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
2675        if (intel_wait_for_register(uncore,
2676                                    GEN12_OAG_OACONTROL,
2677                                    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
2678                                    50))
2679                DRM_ERROR("wait for OA to be disabled timed out\n");
2680}
2681
2682/**
2683 * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
2684 * @stream: An i915 perf stream opened for OA metrics
2685 *
2686 * Stops the OA unit from periodically writing counter reports into the
2687 * circular OA buffer. This also stops the hrtimer that periodically checks for
2688 * data in the circular OA buffer, for notifying userspace.
2689 */
2690static void i915_oa_stream_disable(struct i915_perf_stream *stream)
2691{
2692        stream->perf->ops.oa_disable(stream);
2693
2694        if (stream->periodic)
2695                hrtimer_cancel(&stream->poll_check_timer);
2696}
2697
2698static const struct i915_perf_stream_ops i915_oa_stream_ops = {
2699        .destroy = i915_oa_stream_destroy,
2700        .enable = i915_oa_stream_enable,
2701        .disable = i915_oa_stream_disable,
2702        .wait_unlocked = i915_oa_wait_unlocked,
2703        .poll_wait = i915_oa_poll_wait,
2704        .read = i915_oa_read,
2705};
2706
2707static int i915_perf_stream_enable_sync(struct i915_perf_stream *stream)
2708{
2709        struct i915_request *rq;
2710
2711        rq = stream->perf->ops.enable_metric_set(stream);
2712        if (IS_ERR(rq))
2713                return PTR_ERR(rq);
2714
2715        i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
2716        i915_request_put(rq);
2717
2718        return 0;
2719}
2720
2721/**
2722 * i915_oa_stream_init - validate combined props for OA stream and init
2723 * @stream: An i915 perf stream
2724 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
2725 * @props: The property state that configures stream (individually validated)
2726 *
2727 * While read_properties_unlocked() validates properties in isolation it
2728 * doesn't ensure that the combination necessarily makes sense.
2729 *
2730 * At this point it has been determined that userspace wants a stream of
2731 * OA metrics, but we still need to validate that the combined
2732 * properties make sense together.
2733 *
2734 * If the configuration makes sense then we can allocate memory for
2735 * a circular OA buffer and apply the requested metric set configuration.
2736 *
2737 * Returns: zero on success or a negative error code.
2738 */
2739static int i915_oa_stream_init(struct i915_perf_stream *stream,
2740                               struct drm_i915_perf_open_param *param,
2741                               struct perf_open_properties *props)
2742{
2743        struct i915_perf *perf = stream->perf;
2744        int format_size;
2745        int ret;
2746
2747        if (!props->engine) {
2748                DRM_DEBUG("OA engine not specified\n");
2749                return -EINVAL;
2750        }
2751
2752        /*
2753         * If the sysfs metrics/ directory wasn't registered for some
2754         * reason then don't let userspace try their luck with config
2755         * IDs
2756         */
2757        if (!perf->metrics_kobj) {
2758                DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
2759                return -EINVAL;
2760        }
2761
2762        if (!(props->sample_flags & SAMPLE_OA_REPORT) &&
2763            (INTEL_GEN(perf->i915) < 12 || !stream->ctx)) {
2764                DRM_DEBUG("Only OA report sampling supported\n");
2765                return -EINVAL;
2766        }
2767
2768        if (!perf->ops.enable_metric_set) {
2769                DRM_DEBUG("OA unit not supported\n");
2770                return -ENODEV;
2771        }
2772
2773        /*
2774         * To avoid the complexity of having to accurately filter
2775         * counter reports and marshal to the appropriate client
2776         * we currently only allow exclusive access
2777         */
2778        if (perf->exclusive_stream) {
2779                DRM_DEBUG("OA unit already in use\n");
2780                return -EBUSY;
2781        }
2782
2783        if (!props->oa_format) {
2784                DRM_DEBUG("OA report format not specified\n");
2785                return -EINVAL;
2786        }
2787
2788        stream->engine = props->engine;
2789        stream->uncore = stream->engine->gt->uncore;
2790
2791        stream->sample_size = sizeof(struct drm_i915_perf_record_header);
2792
2793        format_size = perf->oa_formats[props->oa_format].size;
2794
2795        stream->sample_flags = props->sample_flags;
2796        stream->sample_size += format_size;
2797
2798        stream->oa_buffer.format_size = format_size;
2799        if (WARN_ON(stream->oa_buffer.format_size == 0))
2800                return -EINVAL;
2801
2802        stream->hold_preemption = props->hold_preemption;
2803
2804        stream->oa_buffer.format =
2805                perf->oa_formats[props->oa_format].format;
2806
2807        stream->periodic = props->oa_periodic;
2808        if (stream->periodic)
2809                stream->period_exponent = props->oa_period_exponent;
2810
2811        if (stream->ctx) {
2812                ret = oa_get_render_ctx_id(stream);
2813                if (ret) {
2814                        DRM_DEBUG("Invalid context id to filter with\n");
2815                        return ret;
2816                }
2817        }
2818
2819        ret = alloc_noa_wait(stream);
2820        if (ret) {
2821                DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
2822                goto err_noa_wait_alloc;
2823        }
2824
2825        stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
2826        if (!stream->oa_config) {
2827                DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
2828                ret = -EINVAL;
2829                goto err_config;
2830        }
2831
2832        /* PRM - observability performance counters:
2833         *
2834         *   OACONTROL, performance counter enable, note:
2835         *
2836         *   "When this bit is set, in order to have coherent counts,
2837         *   RC6 power state and trunk clock gating must be disabled.
2838         *   This can be achieved by programming MMIO registers as
2839         *   0xA094=0 and 0xA090[31]=1"
2840         *
2841         *   In our case we are expecting that taking pm + FORCEWAKE
2842         *   references will effectively disable RC6.
2843         */
2844        intel_engine_pm_get(stream->engine);
2845        intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);
2846
2847        ret = alloc_oa_buffer(stream);
2848        if (ret)
2849                goto err_oa_buf_alloc;
2850
2851        stream->ops = &i915_oa_stream_ops;
2852        perf->exclusive_stream = stream;
2853
2854        ret = i915_perf_stream_enable_sync(stream);
2855        if (ret) {
2856                DRM_DEBUG("Unable to enable metric set\n");
2857                goto err_enable;
2858        }
2859
2860        DRM_DEBUG("opening stream oa config uuid=%s\n",
2861                  stream->oa_config->uuid);
2862
2863        hrtimer_init(&stream->poll_check_timer,
2864                     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2865        stream->poll_check_timer.function = oa_poll_check_timer_cb;
2866        init_waitqueue_head(&stream->poll_wq);
2867        spin_lock_init(&stream->oa_buffer.ptr_lock);
2868
2869        return 0;
2870
2871err_enable:
2872        perf->exclusive_stream = NULL;
2873        perf->ops.disable_metric_set(stream);
2874
2875        free_oa_buffer(stream);
2876
2877err_oa_buf_alloc:
2878        free_oa_configs(stream);
2879
2880        intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
2881        intel_engine_pm_put(stream->engine);
2882
2883err_config:
2884        free_noa_wait(stream);
2885
2886err_noa_wait_alloc:
2887        if (stream->ctx)
2888                oa_put_render_ctx_id(stream);
2889
2890        return ret;
2891}
2892
2893void i915_oa_init_reg_state(const struct intel_context *ce,
2894                            const struct intel_engine_cs *engine)
2895{
2896        struct i915_perf_stream *stream;
2897
2898        /* perf.exclusive_stream serialised by lrc_configure_all_contexts() */
2899
2900        if (engine->class != RENDER_CLASS)
2901                return;
2902
2903        stream = engine->i915->perf.exclusive_stream;
2904        /*
2905         * For gen12, only CTX_R_PWR_CLK_STATE needs update, but the caller
2906         * is already doing that, so nothing to be done for gen12 here.
2907         */
2908        if (stream && INTEL_GEN(stream->perf->i915) < 12)
2909                gen8_update_reg_state_unlocked(ce, stream);
2910}
2911
2912/**
2913 * i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
2914 * @stream: An i915 perf stream
2915 * @file: An i915 perf stream file
2916 * @buf: destination buffer given by userspace
2917 * @count: the number of bytes userspace wants to read
2918 * @ppos: (inout) file seek position (unused)
2919 *
2920 * Besides wrapping &i915_perf_stream_ops->read this provides a common place to
2921 * ensure that if we've successfully copied any data then reporting that takes
2922 * precedence over any internal error status, so the data isn't lost.
2923 *
2924 * For example ret will be -ENOSPC whenever there is more buffered data than
2925 * can be copied to userspace, but that's only interesting if we weren't able
2926 * to copy some data because it implies the userspace buffer is too small to
2927 * receive a single record (and we never split records).
2928 *
2929 * Another case with ret == -EFAULT is more of a grey area since it would seem
2930 * like bad form for userspace to ask us to overrun its buffer, but the user
2931 * knows best:
2932 *
2933 *   http://yarchive.net/comp/linux/partial_reads_writes.html
2934 *
2935 * Returns: The number of bytes copied or a negative error code on failure.
2936 */
2937static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
2938                                     struct file *file,
2939                                     char __user *buf,
2940                                     size_t count,
2941                                     loff_t *ppos)
2942{
2943        /* Note we keep the offset (aka bytes read) separate from any
2944         * error status so that the final check for whether we return
2945         * the bytes read with a higher precedence than any error (see
2946         * comment below) doesn't need to be handled/duplicated in
2947         * stream->ops->read() implementations.
2948         */
2949        size_t offset = 0;
2950        int ret = stream->ops->read(stream, buf, count, &offset);
2951
2952        return offset ?: (ret ?: -EAGAIN);
2953}
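
/*
 * An illustrative summary of the precedence implemented above (relying on
 * the GNU "a ?: b" extension):
 *
 *   offset > 0               -> return offset  (copied data beats any error)
 *   offset == 0 && ret != 0  -> return ret     (a real error, e.g. -EFAULT)
 *   offset == 0 && ret == 0  -> return -EAGAIN (nothing available right now)
 *
 * The -EAGAIN case is what lets i915_perf_read() retry for blocking FDs.
 */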
2954
2955/**
2956 * i915_perf_read - handles read() FOP for i915 perf stream FDs
2957 * @file: An i915 perf stream file
2958 * @buf: destination buffer given by userspace
2959 * @count: the number of bytes userspace wants to read
2960 * @ppos: (inout) file seek position (unused)
2961 *
2962 * The entry point for handling a read() on a stream file descriptor from
2963 * userspace. Most of the work is left to i915_perf_read_locked() and
2964 * &i915_perf_stream_ops->read, but blocking reads are handled here so that
2965 * stream implementations (of which we might have multiple later) don't have to.
2966 *
2967 * We can also consistently treat trying to read from a disabled stream
2968 * as an IO error so implementations can assume the stream is enabled
2969 * while reading.
2970 *
2971 * Returns: The number of bytes copied or a negative error code on failure.
2972 */
2973static ssize_t i915_perf_read(struct file *file,
2974                              char __user *buf,
2975                              size_t count,
2976                              loff_t *ppos)
2977{
2978        struct i915_perf_stream *stream = file->private_data;
2979        struct i915_perf *perf = stream->perf;
2980        ssize_t ret;
2981
2982        /* To ensure it's handled consistently we simply treat all reads of a
2983         * disabled stream as an error. In particular it might otherwise lead
2984         * to a deadlock for blocking file descriptors...
2985         */
2986        if (!stream->enabled)
2987                return -EIO;
2988
2989        if (!(file->f_flags & O_NONBLOCK)) {
2990                /* There's the small chance of false positives from
2991                 * stream->ops->wait_unlocked.
2992                 *
2993                 * E.g. with single context filtering, since we only wait until
2994                 * the OA buffer has >= 1 report, we don't immediately know whether
2995                 * any reports really belong to the current context.
2996                 */
2997                do {
2998                        ret = stream->ops->wait_unlocked(stream);
2999                        if (ret)
3000                                return ret;
3001
3002                        mutex_lock(&perf->lock);
3003                        ret = i915_perf_read_locked(stream, file,
3004                                                    buf, count, ppos);
3005                        mutex_unlock(&perf->lock);
3006                } while (ret == -EAGAIN);
3007        } else {
3008                mutex_lock(&perf->lock);
3009                ret = i915_perf_read_locked(stream, file, buf, count, ppos);
3010                mutex_unlock(&perf->lock);
3011        }
3012
3013        /* We allow the poll checking to sometimes report false positive EPOLLIN
3014         * events where we might actually report EAGAIN on read() if there's
3015         * not really any data available. In this situation though we don't
3016         * want to enter a busy loop between poll() reporting a EPOLLIN event
3017         * and read() returning -EAGAIN. Clearing the oa.pollin state here
3018         * effectively ensures we back off until the next hrtimer callback
3019         * before reporting another EPOLLIN event.
3020         */
3021        if (ret >= 0 || ret == -EAGAIN) {
3022                /* Maybe make ->pollin per-stream state if we support multiple
3023                 * concurrent streams in the future.
3024                 */
3025                stream->pollin = false;
3026        }
3027
3028        return ret;
3029}
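
/*
 * A minimal userspace sketch of draining a stream with blocking read()s;
 * the "stream_fd" name and the buffer size below are illustrative and
 * assume a stream already opened via DRM_IOCTL_I915_PERF_OPEN and enabled.
 * Records are never split, so each iteration advances by header->size:
 *
 *   uint8_t buf[16 * 1024];
 *   ssize_t len = read(stream_fd, buf, sizeof(buf));
 *
 *   for (ssize_t offset = 0; len > 0 && offset < len; ) {
 *           const struct drm_i915_perf_record_header *header =
 *                   (const void *)(buf + offset);
 *
 *           if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
 *                   ... the packed OA report follows the header ...
 *           }
 *
 *           offset += header->size;
 *   }
 *
 * A read() from a disabled stream fails with EIO; EAGAIN is only seen on
 * O_NONBLOCK descriptors; and ENOSPC indicates the buffer cannot hold even
 * a single record, as described above.
 */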
3030
3031static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
3032{
3033        struct i915_perf_stream *stream =
3034                container_of(hrtimer, typeof(*stream), poll_check_timer);
3035
3036        if (oa_buffer_check_unlocked(stream)) {
3037                stream->pollin = true;
3038                wake_up(&stream->poll_wq);
3039        }
3040
3041        hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
3042
3043        return HRTIMER_RESTART;
3044}
3045
3046/**
3047 * i915_perf_poll_locked - poll_wait() with a suitable wait queue for stream
3048 * @stream: An i915 perf stream
3049 * @file: An i915 perf stream file
3050 * @wait: poll() state table
3051 *
3052 * For handling userspace polling on an i915 perf stream, this calls through to
3053 * &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that
3054 * will be woken for new stream data.
3055 *
3056 * Note: The &perf->lock mutex has been taken to serialize
3057 * with any non-file-operation driver hooks.
3058 *
3059 * Returns: any poll events that are ready without sleeping
3060 */
3061static __poll_t i915_perf_poll_locked(struct i915_perf_stream *stream,
3062                                      struct file *file,
3063                                      poll_table *wait)
3064{
3065        __poll_t events = 0;
3066
3067        stream->ops->poll_wait(stream, file, wait);
3068
3069        /* Note: we don't explicitly check whether there's something to read
3070         * here since this path may be very hot depending on what else
3071         * userspace is polling, or on the timeout in use. We rely solely on
3072         * the hrtimer/oa_poll_check_timer_cb to notify us when there are
3073         * samples to read.
3074         */
3075        if (stream->pollin)
3076                events |= EPOLLIN;
3077
3078        return events;
3079}
3080
3081/**
3082 * i915_perf_poll - call poll_wait() with a suitable wait queue for stream
3083 * @file: An i915 perf stream file
3084 * @wait: poll() state table
3085 *
3086 * For handling userspace polling on an i915 perf stream, this ensures
3087 * poll_wait() gets called with a wait queue that will be woken for new stream
3088 * data.
3089 *
3090 * Note: Implementation deferred to i915_perf_poll_locked()
3091 *
3092 * Returns: any poll events that are ready without sleeping
3093 */
3094static __poll_t i915_perf_poll(struct file *file, poll_table *wait)
3095{
3096        struct i915_perf_stream *stream = file->private_data;
3097        struct i915_perf *perf = stream->perf;
3098        __poll_t ret;
3099
3100        mutex_lock(&perf->lock);
3101        ret = i915_perf_poll_locked(stream, file, wait);
3102        mutex_unlock(&perf->lock);
3103
3104        return ret;
3105}
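
/*
 * A small sketch of how userspace would typically pair poll() with an
 * O_NONBLOCK stream fd (the names here are illustrative only):
 *
 *   struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };
 *
 *   if (poll(&pfd, 1, timeout_ms) > 0 && (pfd.revents & POLLIN))
 *           len = read(stream_fd, buf, sizeof(buf));
 *
 * Since the EPOLLIN notification can be a false positive, a read()
 * immediately after poll() may still fail with EAGAIN.
 */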
3106
3107/**
3108 * i915_perf_enable_locked - handle `I915_PERF_IOCTL_ENABLE` ioctl
3109 * @stream: A disabled i915 perf stream
3110 *
3111 * [Re]enables the associated capture of data for this stream.
3112 *
3113 * If a stream was previously enabled then there's currently no intention
3114 * to provide userspace any guarantee about the preservation of previously
3115 * buffered data.
3116 */
3117static void i915_perf_enable_locked(struct i915_perf_stream *stream)
3118{
3119        if (stream->enabled)
3120                return;
3121
3122        /* Allow stream->ops->enable() to refer to this */
3123        stream->enabled = true;
3124
3125        if (stream->ops->enable)
3126                stream->ops->enable(stream);
3127
3128        if (stream->hold_preemption)
3129                intel_context_set_nopreempt(stream->pinned_ctx);
3130}
3131
3132/**
3133 * i915_perf_disable_locked - handle `I915_PERF_IOCTL_DISABLE` ioctl
3134 * @stream: An enabled i915 perf stream
3135 *
3136 * Disables the associated capture of data for this stream.
3137 *
3138 * The intention is that disabling and re-enabling a stream will ideally be
3139 * cheaper than destroying and re-opening a stream with the same configuration,
3140 * though there are no formal guarantees about what state or buffered data
3141 * must be retained between disabling and re-enabling a stream.
3142 *
3143 * Note: while a stream is disabled it's considered an error for userspace
3144 * to attempt to read from the stream (-EIO).
3145 */
3146static void i915_perf_disable_locked(struct i915_perf_stream *stream)
3147{
3148        if (!stream->enabled)
3149                return;
3150
3151        /* Allow stream->ops->disable() to refer to this */
3152        stream->enabled = false;
3153
3154        if (stream->hold_preemption)
3155                intel_context_clear_nopreempt(stream->pinned_ctx);
3156
3157        if (stream->ops->disable)
3158                stream->ops->disable(stream);
3159}
3160
3161static long i915_perf_config_locked(struct i915_perf_stream *stream,
3162                                    unsigned long metrics_set)
3163{
3164        struct i915_oa_config *config;
3165        long ret = stream->oa_config->id;
3166
3167        config = i915_perf_get_oa_config(stream->perf, metrics_set);
3168        if (!config)
3169                return -EINVAL;
3170
3171        if (config != stream->oa_config) {
3172                struct i915_request *rq;
3173
3174                /*
3175                 * If OA is bound to a specific context, emit the
3176                 * reconfiguration inline from that context. The update
3177                 * will then be ordered with respect to submission on that
3178                 * context.
3179                 *
3180                 * When set globally, we use a low priority kernel context,
3181                 * so it will effectively take effect when idle.
3182                 */
3183                rq = emit_oa_config(stream, config, oa_context(stream));
3184                if (!IS_ERR(rq)) {
3185                        config = xchg(&stream->oa_config, config);
3186                        i915_request_put(rq);
3187                } else {
3188                        ret = PTR_ERR(rq);
3189                }
3190        }
3191
3192        i915_oa_config_put(config);
3193
3194        return ret;
3195}
3196
3197/**
3198 * i915_perf_ioctl_locked - support ioctl() usage with i915 perf stream FDs
3199 * @stream: An i915 perf stream
3200 * @cmd: the ioctl request
3201 * @arg: the ioctl data
3202 *
3203 * Note: The &perf->lock mutex has been taken to serialize
3204 * with any non-file-operation driver hooks.
3205 *
3206 * Returns: zero on success or a negative error code. Returns -EINVAL for
3207 * an unknown ioctl request.
3208 */
3209static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
3210                                   unsigned int cmd,
3211                                   unsigned long arg)
3212{
3213        switch (cmd) {
3214        case I915_PERF_IOCTL_ENABLE:
3215                i915_perf_enable_locked(stream);
3216                return 0;
3217        case I915_PERF_IOCTL_DISABLE:
3218                i915_perf_disable_locked(stream);
3219                return 0;
3220        case I915_PERF_IOCTL_CONFIG:
3221                return i915_perf_config_locked(stream, arg);
3222        }
3223
3224        return -EINVAL;
3225}
3226
3227/**
3228 * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
3229 * @file: An i915 perf stream file
3230 * @cmd: the ioctl request
3231 * @arg: the ioctl data
3232 *
3233 * Implementation deferred to i915_perf_ioctl_locked().
3234 *
3235 * Returns: zero on success or a negative error code. Returns -EINVAL for
3236 * an unknown ioctl request.
3237 */
3238static long i915_perf_ioctl(struct file *file,
3239                            unsigned int cmd,
3240                            unsigned long arg)
3241{
3242        struct i915_perf_stream *stream = file->private_data;
3243        struct i915_perf *perf = stream->perf;
3244        long ret;
3245
3246        mutex_lock(&perf->lock);
3247        ret = i915_perf_ioctl_locked(stream, cmd, arg);
3248        mutex_unlock(&perf->lock);
3249
3250        return ret;
3251}
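
/*
 * A brief userspace sketch of driving these ioctls on an open stream fd
 * (the fd and metric set id are illustrative):
 *
 *   ioctl(stream_fd, I915_PERF_IOCTL_ENABLE, 0);
 *   ... capture runs, read()/poll() as above ...
 *   ioctl(stream_fd, I915_PERF_IOCTL_DISABLE, 0);
 *
 * To switch to another OA metric set without reopening the stream (the
 * ioctl returns the id of the config that was in use when it was called):
 *
 *   ioctl(stream_fd, I915_PERF_IOCTL_CONFIG, new_metrics_set_id);
 */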
3252
3253/**
3254 * i915_perf_destroy_locked - destroy an i915 perf stream
3255 * @stream: An i915 perf stream
3256 *
3257 * Frees all resources associated with the given i915 perf @stream, disabling
3258 * any associated data capture in the process.
3259 *
3260 * Note: The &perf->lock mutex has been taken to serialize
3261 * with any non-file-operation driver hooks.
3262 */
3263static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
3264{
3265        if (stream->enabled)
3266                i915_perf_disable_locked(stream);
3267
3268        if (stream->ops->destroy)
3269                stream->ops->destroy(stream);
3270
3271        if (stream->ctx)
3272                i915_gem_context_put(stream->ctx);
3273
3274        kfree(stream);
3275}
3276
3277/**
3278 * i915_perf_release - handles userspace close() of a stream file
3279 * @inode: anonymous inode associated with file
3280 * @file: An i915 perf stream file
3281 *
3282 * Cleans up any resources associated with an open i915 perf stream file.
3283 *
3284 * NB: close() can't really fail from the userspace point of view.
3285 *
3286 * Returns: zero on success or a negative error code.
3287 */
3288static int i915_perf_release(struct inode *inode, struct file *file)
3289{
3290        struct i915_perf_stream *stream = file->private_data;
3291        struct i915_perf *perf = stream->perf;
3292
3293        mutex_lock(&perf->lock);
3294        i915_perf_destroy_locked(stream);
3295        mutex_unlock(&perf->lock);
3296
3297        /* Release the reference the perf stream kept on the driver. */
3298        drm_dev_put(&perf->i915->drm);
3299
3300        return 0;
3301}
3302
3303
3304static const struct file_operations fops = {
3305        .owner          = THIS_MODULE,
3306        .llseek         = no_llseek,
3307        .release        = i915_perf_release,
3308        .poll           = i915_perf_poll,
3309        .read           = i915_perf_read,
3310        .unlocked_ioctl = i915_perf_ioctl,
3311        /* Our ioctls don't take any pointer arguments, so it's safe to use
3312         * the same function to handle 32-bit compatibility.
3313         */
3314        .compat_ioctl   = i915_perf_ioctl,
3315};
3316
3317
3318/**
3319 * i915_perf_open_ioctl_locked - DRM ioctl() for userspace to open a stream FD
3320 * @perf: i915 perf instance
3321 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
3322 * @props: individually validated u64 property value pairs
3323 * @file: drm file
3324 *
3325 * See i915_perf_ioctl_open() for interface details.
3326 *
3327 * Implements further stream config validation and stream initialization on
3328 * behalf of i915_perf_open_ioctl() with the &perf->lock mutex
3329 * taken to serialize with any non-file-operation driver hooks.
3330 *
3331 * Note: at this point the @props have only been validated in isolation and
3332 * it's still necessary to validate that the combination of properties makes
3333 * sense.
3334 *
3335 * In the case where userspace is interested in OA unit metrics then further
3336 * config validation and stream initialization details will be handled by
3337 * i915_oa_stream_init(). The code here should only validate config state that
3338 * will be relevant to all stream types / backends.
3339 *
3340 * Returns: zero on success or a negative error code.
3341 */
3342static int
3343i915_perf_open_ioctl_locked(struct i915_perf *perf,
3344                            struct drm_i915_perf_open_param *param,
3345                            struct perf_open_properties *props,
3346                            struct drm_file *file)
3347{
3348        struct i915_gem_context *specific_ctx = NULL;
3349        struct i915_perf_stream *stream = NULL;
3350        unsigned long f_flags = 0;
3351        bool privileged_op = true;
3352        int stream_fd;
3353        int ret;
3354
3355        if (props->single_context) {
3356                u32 ctx_handle = props->ctx_handle;
3357                struct drm_i915_file_private *file_priv = file->driver_priv;
3358
3359                specific_ctx = i915_gem_context_lookup(file_priv, ctx_handle);
3360                if (!specific_ctx) {
3361                        DRM_DEBUG("Failed to look up context with ID %u for opening perf stream\n",
3362                                  ctx_handle);
3363                        ret = -ENOENT;
3364                        goto err;
3365                }
3366        }
3367
3368        /*
3369         * On Haswell the OA unit supports clock gating off for a specific
3370         * context and in this mode there's no visibility of metrics for the
3371         * rest of the system, which we consider acceptable for a
3372         * non-privileged client.
3373         *
3374         * For Gen8->11 the OA unit no longer supports clock gating off for a
3375         * specific context and the kernel can't securely stop the counters
3376         * from updating as system-wide / global values. Even though we can
3377         * filter reports based on the included context ID we can't block
3378         * clients from seeing the raw / global counter values via
3379         * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
3380         * enable the OA unit by default.
3381         *
3382         * For Gen12+ we gain a new OAR unit that only monitors the RCS on a
3383         * per context basis. So we can relax requirements there if the user
3384         * doesn't request global stream access (i.e. query based sampling
3385 * using MI_REPORT_PERF_COUNT).
3386         */
3387        if (IS_HASWELL(perf->i915) && specific_ctx)
3388                privileged_op = false;
3389        else if (IS_GEN(perf->i915, 12) && specific_ctx &&
3390                 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
3391                privileged_op = false;
3392
3393        if (props->hold_preemption) {
3394                if (!props->single_context) {
3395                        DRM_DEBUG("preemption disable with no context\n");
3396                        ret = -EINVAL;
3397                        goto err;
3398                }
3399                privileged_op = true;
3400        }
3401
3402        /* Similar to perf's kernel.perf_event_paranoid sysctl option,
3403         * we check a dev.i915.perf_stream_paranoid sysctl option
3404         * to determine if it's ok to access system wide OA counters
3405         * without CAP_SYS_ADMIN privileges.
3406         */
3407        if (privileged_op &&
3408            i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
3409                DRM_DEBUG("Insufficient privileges to open i915 perf stream\n");
3410                ret = -EACCES;
3411                goto err_ctx;
3412        }
3413
3414        stream = kzalloc(sizeof(*stream), GFP_KERNEL);
3415        if (!stream) {
3416                ret = -ENOMEM;
3417                goto err_ctx;
3418        }
3419
3420        stream->perf = perf;
3421        stream->ctx = specific_ctx;
3422
3423        ret = i915_oa_stream_init(stream, param, props);
3424        if (ret)
3425                goto err_alloc;
3426
3427        /* We avoid simply assigning stream->sample_flags = props->sample_flags
3428         * so that _stream_init can check the combination of sample flags more
3429         * thoroughly, but this is still the expected result at this point.
3430         */
3431        if (WARN_ON(stream->sample_flags != props->sample_flags)) {
3432                ret = -ENODEV;
3433                goto err_flags;
3434        }
3435
3436        if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
3437                f_flags |= O_CLOEXEC;
3438        if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
3439                f_flags |= O_NONBLOCK;
3440
3441        stream_fd = anon_inode_getfd("[i915_perf]", &fops, stream, f_flags);
3442        if (stream_fd < 0) {
3443                ret = stream_fd;
3444                goto err_flags;
3445        }
3446
3447        if (!(param->flags & I915_PERF_FLAG_DISABLED))
3448                i915_perf_enable_locked(stream);
3449
3450        /* Take a reference on the driver that will be kept with stream_fd
3451         * until its release.
3452         */
3453        drm_dev_get(&perf->i915->drm);
3454
3455        return stream_fd;
3456
3457err_flags:
3458        if (stream->ops->destroy)
3459                stream->ops->destroy(stream);
3460err_alloc:
3461        kfree(stream);
3462err_ctx:
3463        if (specific_ctx)
3464                i915_gem_context_put(specific_ctx);
3465err:
3466        return ret;
3467}
3468
3469static u64 oa_exponent_to_ns(struct i915_perf *perf, int exponent)
3470{
3471        return div64_u64(1000000000ULL * (2ULL << exponent),
3472                         1000ULL * RUNTIME_INFO(perf->i915)->cs_timestamp_frequency_khz);
3473}
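
/*
 * A worked example of the conversion above: the OA sampling period is
 * (2 << exponent) timestamp ticks. Assuming, purely for illustration, a
 * 12MHz (12000kHz) command streamer timestamp frequency:
 *
 *   exponent 0  ->      2 / 12e6 s ~= 167ns
 *   exponent 5  ->     64 / 12e6 s ~= 5.3us
 *   exponent 16 -> 131072 / 12e6 s ~= 10.9ms
 *
 * which is why small exponents are gated behind the
 * dev.i915.oa_max_sample_rate check in read_properties_unlocked() below.
 */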
3474
3475/**
3476 * read_properties_unlocked - validate + copy userspace stream open properties
3477 * @perf: i915 perf instance
3478 * @uprops: The array of u64 key value pairs given by userspace
3479 * @n_props: The number of key value pairs expected in @uprops
3480 * @props: The stream configuration built up while validating properties
3481 *
3482 * Note this function only validates properties in isolation it doesn't
3483 * validate that the combination of properties makes sense or that all
3484 * properties necessary for a particular kind of stream have been set.
3485 *
3486 * Note that there currently aren't any ordering requirements for properties so
3487 * we shouldn't validate or assume anything about ordering here. This doesn't
3488 * rule out defining new properties with ordering requirements in the future.
3489 */
3490static int read_properties_unlocked(struct i915_perf *perf,
3491                                    u64 __user *uprops,
3492                                    u32 n_props,
3493                                    struct perf_open_properties *props)
3494{
3495        u64 __user *uprop = uprops;
3496        u32 i;
3497
3498        memset(props, 0, sizeof(struct perf_open_properties));
3499
3500        if (!n_props) {
3501                DRM_DEBUG("No i915 perf properties given\n");
3502                return -EINVAL;
3503        }
3504
3505        /* At the moment we only support using i915-perf on the RCS. */
3506        props->engine = intel_engine_lookup_user(perf->i915,
3507                                                 I915_ENGINE_CLASS_RENDER,
3508                                                 0);
3509        if (!props->engine) {
3510                DRM_DEBUG("No RENDER-capable engines\n");
3511                return -EINVAL;
3512        }
3513
3514        /* Considering that ID = 0 is reserved and assuming that we don't
3515         * (currently) expect any configurations to ever specify duplicate
3516         * values for a particular property ID then the last _PROP_MAX value is
3517         * one greater than the maximum number of properties we expect to get
3518         * from userspace.
3519         */
3520        if (n_props >= DRM_I915_PERF_PROP_MAX) {
3521                DRM_DEBUG("More i915 perf properties specified than exist\n");
3522                return -EINVAL;
3523        }
3524
3525        for (i = 0; i < n_props; i++) {
3526                u64 oa_period, oa_freq_hz;
3527                u64 id, value;
3528                int ret;
3529
3530                ret = get_user(id, uprop);
3531                if (ret)
3532                        return ret;
3533
3534                ret = get_user(value, uprop + 1);
3535                if (ret)
3536                        return ret;
3537
3538                if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
3539                        DRM_DEBUG("Unknown i915 perf property ID\n");
3540                        return -EINVAL;
3541                }
3542
3543                switch ((enum drm_i915_perf_property_id)id) {
3544                case DRM_I915_PERF_PROP_CTX_HANDLE:
3545                        props->single_context = 1;
3546                        props->ctx_handle = value;
3547                        break;
3548                case DRM_I915_PERF_PROP_SAMPLE_OA:
3549                        if (value)
3550                                props->sample_flags |= SAMPLE_OA_REPORT;
3551                        break;
3552                case DRM_I915_PERF_PROP_OA_METRICS_SET:
3553                        if (value == 0) {
3554                                DRM_DEBUG("Unknown OA metric set ID\n");
3555                                return -EINVAL;
3556                        }
3557                        props->metrics_set = value;
3558                        break;
3559                case DRM_I915_PERF_PROP_OA_FORMAT:
3560                        if (value == 0 || value >= I915_OA_FORMAT_MAX) {
3561                                DRM_DEBUG("Out-of-range OA report format %llu\n",
3562                                          value);
3563                                return -EINVAL;
3564                        }
3565                        if (!perf->oa_formats[value].size) {
3566                                DRM_DEBUG("Unsupported OA report format %llu\n",
3567                                          value);
3568                                return -EINVAL;
3569                        }
3570                        props->oa_format = value;
3571                        break;
3572                case DRM_I915_PERF_PROP_OA_EXPONENT:
3573                        if (value > OA_EXPONENT_MAX) {
3574                                DRM_DEBUG("OA timer exponent too high (> %u)\n",
3575                                         OA_EXPONENT_MAX);
3576                                return -EINVAL;
3577                        }
3578
3579                        /* Theoretically we can program the OA unit to sample
3580                         * e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns
3581                         * for BXT. We don't allow such high sampling
3582                         * frequencies by default unless root.
3583                         */
3584
3585                        BUILD_BUG_ON(sizeof(oa_period) != 8);
3586                        oa_period = oa_exponent_to_ns(perf, value);
3587
3588                        /* This check is primarily to ensure that oa_period <=
3589                         * UINT32_MAX (before passing to do_div which only
3590                         * accepts a u32 denominator), but we can also skip
3591                         * checking anything < 1Hz which implicitly can't be
3592                         * limited via an integer oa_max_sample_rate.
3593                         */
3594                        if (oa_period <= NSEC_PER_SEC) {
3595                                u64 tmp = NSEC_PER_SEC;
3596                                do_div(tmp, oa_period);
3597                                oa_freq_hz = tmp;
3598                        } else
3599                                oa_freq_hz = 0;
3600
3601                        if (oa_freq_hz > i915_oa_max_sample_rate &&
3602                            !capable(CAP_SYS_ADMIN)) {
3603                                DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
3604                                          i915_oa_max_sample_rate);
3605                                return -EACCES;
3606                        }
3607
3608                        props->oa_periodic = true;
3609                        props->oa_period_exponent = value;
3610                        break;
3611                case DRM_I915_PERF_PROP_HOLD_PREEMPTION:
3612                        props->hold_preemption = !!value;
3613                        break;
3614                case DRM_I915_PERF_PROP_MAX:
3615                        MISSING_CASE(id);
3616                        return -EINVAL;
3617                }
3618
3619                uprop += 2;
3620        }
3621
3622        return 0;
3623}
3624
3625/**
3626 * i915_perf_open_ioctl - DRM ioctl() for userspace to open a stream FD
3627 * @dev: drm device
3628 * @data: ioctl data copied from userspace (unvalidated)
3629 * @file: drm file
3630 *
3631 * Validates the stream open parameters given by userspace including flags
3632 * and an array of u64 key, value pair properties.
3633 *
3634 * Very little is assumed up front about the nature of the stream being
3635 * opened (for instance we don't assume it's for periodic OA unit metrics). An
3636 * i915-perf stream is expected to be a suitable interface for other forms of
3637 * buffered data written by the GPU besides periodic OA metrics.
3638 *
3639 * Note we copy the properties from userspace outside of the i915 perf
3640 * mutex to avoid an awkward lockdep with mmap_sem.
3641 *
3642 * Most of the implementation details are handled by
3643 * i915_perf_open_ioctl_locked() after taking the &perf->lock
3644 * mutex for serializing with any non-file-operation driver hooks.
3645 *
3646 * Return: A newly opened i915 Perf stream file descriptor or negative
3647 * error code on failure.
3648 */
3649int i915_perf_open_ioctl(struct drm_device *dev, void *data,
3650                         struct drm_file *file)
3651{
3652        struct i915_perf *perf = &to_i915(dev)->perf;
3653        struct drm_i915_perf_open_param *param = data;
3654        struct perf_open_properties props;
3655        u32 known_open_flags;
3656        int ret;
3657
3658        if (!perf->i915) {
3659                DRM_DEBUG("i915 perf interface not available for this system\n");
3660                return -ENOTSUPP;
3661        }
3662
3663        known_open_flags = I915_PERF_FLAG_FD_CLOEXEC |
3664                           I915_PERF_FLAG_FD_NONBLOCK |
3665                           I915_PERF_FLAG_DISABLED;
3666        if (param->flags & ~known_open_flags) {
3667                DRM_DEBUG("Unknown drm_i915_perf_open_param flag\n");
3668                return -EINVAL;
3669        }
3670
3671        ret = read_properties_unlocked(perf,
3672                                       u64_to_user_ptr(param->properties_ptr),
3673                                       param->num_properties,
3674                                       &props);
3675        if (ret)
3676                return ret;
3677
3678        mutex_lock(&perf->lock);
3679        ret = i915_perf_open_ioctl_locked(perf, param, &props, file);
3680        mutex_unlock(&perf->lock);
3681
3682        return ret;
3683}
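
/*
 * A minimal userspace sketch of opening a periodic OA stream. The metric
 * set id would come from the metrics/<uuid>/id file advertised in sysfs;
 * the fd names, OA format and exponent below are illustrative only:
 *
 *   uint64_t properties[] = {
 *           DRM_I915_PERF_PROP_SAMPLE_OA, 1,
 *           DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
 *           DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 *           DRM_I915_PERF_PROP_OA_EXPONENT, 16,
 *   };
 *   struct drm_i915_perf_open_param param = {
 *           .flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_DISABLED,
 *           .num_properties = 4,
 *           .properties_ptr = (uintptr_t)properties,
 *   };
 *   int stream_fd = ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
 *
 * Note that num_properties counts (key, value) pairs, not u64 elements.
 * Opened with I915_PERF_FLAG_DISABLED, the stream only starts capturing
 * after a later I915_PERF_IOCTL_ENABLE.
 */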
3684
3685/**
3686 * i915_perf_register - exposes i915-perf to userspace
3687 * @i915: i915 device instance
3688 *
3689 * In particular OA metric sets are advertised under a sysfs metrics/
3690 * directory allowing userspace to enumerate valid IDs that can be
3691 * used to open an i915-perf stream.
3692 */
3693void i915_perf_register(struct drm_i915_private *i915)
3694{
3695        struct i915_perf *perf = &i915->perf;
3696        int ret;
3697
3698        if (!perf->i915)
3699                return;
3700
3701        /* Take the lock to be sure we're synchronized with an attempted
3702         * i915_perf_open_ioctl(), considering that we register after the
3703         * driver has already been exposed to userspace.
3704         */
3705        mutex_lock(&perf->lock);
3706
3707        perf->metrics_kobj =
3708                kobject_create_and_add("metrics",
3709                                       &i915->drm.primary->kdev->kobj);
3710        if (!perf->metrics_kobj)
3711                goto exit;
3712
3713        sysfs_attr_init(&perf->test_config.sysfs_metric_id.attr);
3714
3715        if (IS_TIGERLAKE(i915)) {
3716                i915_perf_load_test_config_tgl(i915);
3717        } else if (INTEL_GEN(i915) >= 11) {
3718                i915_perf_load_test_config_icl(i915);
3719        } else if (IS_CANNONLAKE(i915)) {
3720                i915_perf_load_test_config_cnl(i915);
3721        } else if (IS_COFFEELAKE(i915)) {
3722                if (IS_CFL_GT2(i915))
3723                        i915_perf_load_test_config_cflgt2(i915);
3724                if (IS_CFL_GT3(i915))
3725                        i915_perf_load_test_config_cflgt3(i915);
3726        } else if (IS_GEMINILAKE(i915)) {
3727                i915_perf_load_test_config_glk(i915);
3728        } else if (IS_KABYLAKE(i915)) {
3729                if (IS_KBL_GT2(i915))
3730                        i915_perf_load_test_config_kblgt2(i915);
3731                else if (IS_KBL_GT3(i915))
3732                        i915_perf_load_test_config_kblgt3(i915);
3733        } else if (IS_BROXTON(i915)) {
3734                i915_perf_load_test_config_bxt(i915);
3735        } else if (IS_SKYLAKE(i915)) {
3736                if (IS_SKL_GT2(i915))
3737                        i915_perf_load_test_config_sklgt2(i915);
3738                else if (IS_SKL_GT3(i915))
3739                        i915_perf_load_test_config_sklgt3(i915);
3740                else if (IS_SKL_GT4(i915))
3741                        i915_perf_load_test_config_sklgt4(i915);
3742        } else if (IS_CHERRYVIEW(i915)) {
3743                i915_perf_load_test_config_chv(i915);
3744        } else if (IS_BROADWELL(i915)) {
3745                i915_perf_load_test_config_bdw(i915);
3746        } else if (IS_HASWELL(i915)) {
3747                i915_perf_load_test_config_hsw(i915);
3748        }
3749
3750        if (perf->test_config.id == 0)
3751                goto sysfs_error;
3752
3753        ret = sysfs_create_group(perf->metrics_kobj,
3754                                 &perf->test_config.sysfs_metric);
3755        if (ret)
3756                goto sysfs_error;
3757
3758        perf->test_config.perf = perf;
3759        kref_init(&perf->test_config.ref);
3760
3761        goto exit;
3762
3763sysfs_error:
3764        kobject_put(perf->metrics_kobj);
3765        perf->metrics_kobj = NULL;
3766
3767exit:
3768        mutex_unlock(&perf->lock);
3769}
3770
3771/**
3772 * i915_perf_unregister - hide i915-perf from userspace
3773 * @i915: i915 device instance
3774 *
3775 * i915-perf state cleanup is split up into an 'unregister' and
3776 * 'deinit' phase where the interface is first hidden from
3777 * userspace by i915_perf_unregister() before cleaning up
3778 * remaining state in i915_perf_fini().
3779 */
3780void i915_perf_unregister(struct drm_i915_private *i915)
3781{
3782        struct i915_perf *perf = &i915->perf;
3783
3784        if (!perf->metrics_kobj)
3785                return;
3786
3787        sysfs_remove_group(perf->metrics_kobj,
3788                           &perf->test_config.sysfs_metric);
3789
3790        kobject_put(perf->metrics_kobj);
3791        perf->metrics_kobj = NULL;
3792}
3793
3794static bool gen8_is_valid_flex_addr(struct i915_perf *perf, u32 addr)
3795{
3796        static const i915_reg_t flex_eu_regs[] = {
3797                EU_PERF_CNTL0,
3798                EU_PERF_CNTL1,
3799                EU_PERF_CNTL2,
3800                EU_PERF_CNTL3,
3801                EU_PERF_CNTL4,
3802                EU_PERF_CNTL5,
3803                EU_PERF_CNTL6,
3804        };
3805        int i;
3806
3807        for (i = 0; i < ARRAY_SIZE(flex_eu_regs); i++) {
3808                if (i915_mmio_reg_offset(flex_eu_regs[i]) == addr)
3809                        return true;
3810        }
3811        return false;
3812}
3813
3814#define ADDR_IN_RANGE(addr, start, end) \
3815        ((addr) >= (start) && \
3816         (addr) <= (end))
3817
3818#define REG_IN_RANGE(addr, start, end) \
3819        ((addr) >= i915_mmio_reg_offset(start) && \
3820         (addr) <= i915_mmio_reg_offset(end))
3821
3822#define REG_EQUAL(addr, mmio) \
3823        ((addr) == i915_mmio_reg_offset(mmio))
3824
3825static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
3826{
3827        return REG_IN_RANGE(addr, OASTARTTRIG1, OASTARTTRIG8) ||
3828               REG_IN_RANGE(addr, OAREPORTTRIG1, OAREPORTTRIG8) ||
3829               REG_IN_RANGE(addr, OACEC0_0, OACEC7_1);
3830}
3831
3832static bool gen7_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
3833{
3834        return REG_EQUAL(addr, HALF_SLICE_CHICKEN2) ||
3835               REG_IN_RANGE(addr, MICRO_BP0_0, NOA_WRITE) ||
3836               REG_IN_RANGE(addr, OA_PERFCNT1_LO, OA_PERFCNT2_HI) ||
3837               REG_IN_RANGE(addr, OA_PERFMATRIX_LO, OA_PERFMATRIX_HI);
3838}
3839
3840static bool gen8_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
3841{
3842        return gen7_is_valid_mux_addr(perf, addr) ||
3843               REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
3844               REG_IN_RANGE(addr, RPM_CONFIG0, NOA_CONFIG(8));
3845}
3846
3847static bool gen10_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
3848{
3849        return gen8_is_valid_mux_addr(perf, addr) ||
3850               REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
3851               REG_IN_RANGE(addr, OA_PERFCNT3_LO, OA_PERFCNT4_HI);
3852}
3853
3854static bool hsw_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
3855{
3856        return gen7_is_valid_mux_addr(perf, addr) ||
3857               ADDR_IN_RANGE(addr, 0x25100, 0x2FF90) ||
3858               REG_IN_RANGE(addr, HSW_MBVID2_NOA0, HSW_MBVID2_NOA9) ||
3859               REG_EQUAL(addr, HSW_MBVID2_MISR0);
3860}
3861
3862static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
3863{
3864        return gen7_is_valid_mux_addr(perf, addr) ||
3865               ADDR_IN_RANGE(addr, 0x182300, 0x1823A4);
3866}
3867
3868static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
3869{
3870        return REG_IN_RANGE(addr, GEN12_OAG_OASTARTTRIG1, GEN12_OAG_OASTARTTRIG8) ||
3871               REG_IN_RANGE(addr, GEN12_OAG_OAREPORTTRIG1, GEN12_OAG_OAREPORTTRIG8) ||
3872               REG_IN_RANGE(addr, GEN12_OAG_CEC0_0, GEN12_OAG_CEC7_1) ||
3873               REG_IN_RANGE(addr, GEN12_OAG_SCEC0_0, GEN12_OAG_SCEC7_1) ||
3874               REG_EQUAL(addr, GEN12_OAA_DBG_REG) ||
3875               REG_EQUAL(addr, GEN12_OAG_OA_PESS) ||
3876               REG_EQUAL(addr, GEN12_OAG_SPCTR_CNF);
3877}
3878
3879static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
3880{
3881        return REG_EQUAL(addr, NOA_WRITE) ||
3882               REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
3883               REG_EQUAL(addr, GDT_CHICKEN_BITS) ||
3884               REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
3885               REG_EQUAL(addr, RPM_CONFIG0) ||
3886               REG_EQUAL(addr, RPM_CONFIG1) ||
3887               REG_IN_RANGE(addr, NOA_CONFIG(0), NOA_CONFIG(8));
3888}
3889
3890static u32 mask_reg_value(u32 reg, u32 val)
3891{
3892        /* HALF_SLICE_CHICKEN2 is programmed with the
3893         * WaDisableSTUnitPowerOptimization workaround. Make sure the value
3894         * programmed by userspace doesn't change this.
3895         */
3896        if (REG_EQUAL(reg, HALF_SLICE_CHICKEN2))
3897                val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
3898
3899        /* WAIT_FOR_RC6_EXIT has only one bit fulfilling the function
3900         * indicated by its name and a bunch of selection fields used by OA
3901         * configs.
3902         */
3903        if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT))
3904                val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);
3905
3906        return val;
3907}
3908
3909static struct i915_oa_reg *alloc_oa_regs(struct i915_perf *perf,
3910                                         bool (*is_valid)(struct i915_perf *perf, u32 addr),
3911                                         u32 __user *regs,
3912                                         u32 n_regs)
3913{
3914        struct i915_oa_reg *oa_regs;
3915        int err;
3916        u32 i;
3917
3918        if (!n_regs)
3919                return NULL;
3920
3921        if (!access_ok(regs, n_regs * sizeof(u32) * 2))
3922                return ERR_PTR(-EFAULT);
3923
3924        /* No is_valid function means we're not allowing any register to be programmed. */
3925        GEM_BUG_ON(!is_valid);
3926        if (!is_valid)
3927                return ERR_PTR(-EINVAL);
3928
3929        oa_regs = kmalloc_array(n_regs, sizeof(*oa_regs), GFP_KERNEL);
3930        if (!oa_regs)
3931                return ERR_PTR(-ENOMEM);
3932
3933        for (i = 0; i < n_regs; i++) {
3934                u32 addr, value;
3935
3936                err = get_user(addr, regs);
3937                if (err)
3938                        goto addr_err;
3939
3940                if (!is_valid(perf, addr)) {
3941                        DRM_DEBUG("Invalid oa_reg address: %X\n", addr);
3942                        err = -EINVAL;
3943                        goto addr_err;
3944                }
3945
3946                err = get_user(value, regs + 1);
3947                if (err)
3948                        goto addr_err;
3949
3950                oa_regs[i].addr = _MMIO(addr);
3951                oa_regs[i].value = mask_reg_value(addr, value);
3952
3953                regs += 2;
3954        }
3955
3956        return oa_regs;
3957
3958addr_err:
3959        kfree(oa_regs);
3960        return ERR_PTR(err);
3961}
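
/*
 * The userspace layout consumed above is a flat array of u32 pairs, e.g.
 * (addresses and values purely illustrative):
 *
 *   uint32_t mux_regs[] = {
 *           0x9888, 0x14150001,
 *           0x9888, 0x00160020,
 *   };
 *
 * with the corresponding n_regs counting pairs (2 here), not u32 elements.
 */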
3962
3963static ssize_t show_dynamic_id(struct device *dev,
3964                               struct device_attribute *attr,
3965                               char *buf)
3966{
3967        struct i915_oa_config *oa_config =
3968                container_of(attr, typeof(*oa_config), sysfs_metric_id);
3969
3970        return sprintf(buf, "%d\n", oa_config->id);
3971}
3972
3973static int create_dynamic_oa_sysfs_entry(struct i915_perf *perf,
3974                                         struct i915_oa_config *oa_config)
3975{
3976        sysfs_attr_init(&oa_config->sysfs_metric_id.attr);
3977        oa_config->sysfs_metric_id.attr.name = "id";
3978        oa_config->sysfs_metric_id.attr.mode = S_IRUGO;
3979        oa_config->sysfs_metric_id.show = show_dynamic_id;
3980        oa_config->sysfs_metric_id.store = NULL;
3981
3982        oa_config->attrs[0] = &oa_config->sysfs_metric_id.attr;
3983        oa_config->attrs[1] = NULL;
3984
3985        oa_config->sysfs_metric.name = oa_config->uuid;
3986        oa_config->sysfs_metric.attrs = oa_config->attrs;
3987
3988        return sysfs_create_group(perf->metrics_kobj,
3989                                  &oa_config->sysfs_metric);
3990}
3991
3992/**
3993 * i915_perf_add_config_ioctl - DRM ioctl() for userspace to add a new OA config
3994 * @dev: drm device
3995 * @data: ioctl data (pointer to struct drm_i915_perf_oa_config) copied from
3996 *        userspace (unvalidated)
3997 * @file: drm file
3998 *
3999 * Validates the submitted OA register to be saved into a new OA config that
4000 * can then be used for programming the OA unit and its NOA network.
4001 *
4002 * Returns: A new allocated config number to be used with the perf open ioctl
4003 * or a negative error code on failure.
4004 */
4005int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
4006                               struct drm_file *file)
4007{
4008        struct i915_perf *perf = &to_i915(dev)->perf;
4009        struct drm_i915_perf_oa_config *args = data;
4010        struct i915_oa_config *oa_config, *tmp;
4011        struct i915_oa_reg *regs;
4012        int err, id;
4013
4014        if (!perf->i915) {
4015                DRM_DEBUG("i915 perf interface not available for this system\n");
4016                return -ENOTSUPP;
4017        }
4018
4019        if (!perf->metrics_kobj) {
4020                DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
4021                return -EINVAL;
4022        }
4023
4024        if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
4025                DRM_DEBUG("Insufficient privileges to add i915 OA config\n");
4026                return -EACCES;
4027        }
4028
4029        if ((!args->mux_regs_ptr || !args->n_mux_regs) &&
4030            (!args->boolean_regs_ptr || !args->n_boolean_regs) &&
4031            (!args->flex_regs_ptr || !args->n_flex_regs)) {
4032                DRM_DEBUG("No OA registers given\n");
4033                return -EINVAL;
4034        }
4035
4036        oa_config = kzalloc(sizeof(*oa_config), GFP_KERNEL);
4037        if (!oa_config) {
4038                DRM_DEBUG("Failed to allocate memory for the OA config\n");
4039                return -ENOMEM;
4040        }
4041
4042        oa_config->perf = perf;
4043        kref_init(&oa_config->ref);
4044
4045        if (!uuid_is_valid(args->uuid)) {
4046                DRM_DEBUG("Invalid uuid format for OA config\n");
4047                err = -EINVAL;
4048                goto reg_err;
4049        }
4050
4051        /* Last character in oa_config->uuid will be 0 because oa_config was
4052         * allocated with kzalloc().
4053         */
4054        memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid));
4055
4056        oa_config->mux_regs_len = args->n_mux_regs;
4057        regs = alloc_oa_regs(perf,
4058                             perf->ops.is_valid_mux_reg,
4059                             u64_to_user_ptr(args->mux_regs_ptr),
4060                             args->n_mux_regs);
4061
4062        if (IS_ERR(regs)) {
4063                DRM_DEBUG("Failed to create OA config for mux_regs\n");
4064                err = PTR_ERR(regs);
4065                goto reg_err;
4066        }
4067        oa_config->mux_regs = regs;
4068
4069        oa_config->b_counter_regs_len = args->n_boolean_regs;
4070        regs = alloc_oa_regs(perf,
4071                             perf->ops.is_valid_b_counter_reg,
4072                             u64_to_user_ptr(args->boolean_regs_ptr),
4073                             args->n_boolean_regs);
4074
4075        if (IS_ERR(regs)) {
4076                DRM_DEBUG("Failed to create OA config for b_counter_regs\n");
4077                err = PTR_ERR(regs);
4078                goto reg_err;
4079        }
4080        oa_config->b_counter_regs = regs;
4081
4082        if (INTEL_GEN(perf->i915) < 8) {
4083                if (args->n_flex_regs != 0) {
4084                        err = -EINVAL;
4085                        goto reg_err;
4086                }
4087        } else {
4088                oa_config->flex_regs_len = args->n_flex_regs;
4089                regs = alloc_oa_regs(perf,
4090                                     perf->ops.is_valid_flex_reg,
4091                                     u64_to_user_ptr(args->flex_regs_ptr),
4092                                     args->n_flex_regs);
4093
4094                if (IS_ERR(regs)) {
4095                        DRM_DEBUG("Failed to create OA config for flex_regs\n");
4096                        err = PTR_ERR(regs);
4097                        goto reg_err;
4098                }
4099                oa_config->flex_regs = regs;
4100        }
4101
4102        err = mutex_lock_interruptible(&perf->metrics_lock);
4103        if (err)
4104                goto reg_err;
4105
4106        /* We shouldn't have too many configs, so this iteration shouldn't be
4107         * too costly.
4108         */
4109        idr_for_each_entry(&perf->metrics_idr, tmp, id) {
4110                if (!strcmp(tmp->uuid, oa_config->uuid)) {
4111                        DRM_DEBUG("OA config already exists with this uuid\n");
4112                        err = -EADDRINUSE;
4113                        goto sysfs_err;
4114                }
4115        }
4116
4117        err = create_dynamic_oa_sysfs_entry(perf, oa_config);
4118        if (err) {
4119                DRM_DEBUG("Failed to create sysfs entry for OA config\n");
4120                goto sysfs_err;
4121        }
4122
4123        /* Config id 0 is invalid; id 1 is reserved for the kernel's stored test config. */
4124        oa_config->id = idr_alloc(&perf->metrics_idr,
4125                                  oa_config, 2,
4126                                  0, GFP_KERNEL);
4127        if (oa_config->id < 0) {
4128                DRM_DEBUG("Failed to allocate an ID for the OA config\n");
4129                err = oa_config->id;
4130                goto sysfs_err;
4131        }
4132
4133        mutex_unlock(&perf->metrics_lock);
4134
4135        DRM_DEBUG("Added config %s id=%i\n", oa_config->uuid, oa_config->id);
4136
4137        return oa_config->id;
4138
4139sysfs_err:
4140        mutex_unlock(&perf->metrics_lock);
4141reg_err:
4142        i915_oa_config_put(oa_config);
4143        DRM_DEBUG("Failed to add new OA config\n");
4144        return err;
4145}
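
/*
 * A minimal userspace sketch of exercising the ioctl above via
 * DRM_IOCTL_I915_PERF_ADD_CONFIG. The UUID and the single MUX (address,
 * value) pair below are placeholders rather than a usable OA configuration;
 * real register programming comes from the OA metric sets for the target GPU.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int add_example_oa_config(int drm_fd)
{
	/* Register writes are passed as (address, value) pairs of u32. */
	uint32_t mux_regs[] = { 0x9888, 0x15010000 };
	struct drm_i915_perf_oa_config config;

	memset(&config, 0, sizeof(config));
	/* 36 character UUID, no terminating NUL in the uapi struct. */
	memcpy(config.uuid, "01234567-0123-0123-0123-0123456789ab",
	       sizeof(config.uuid));
	config.n_mux_regs = 1;
	config.mux_regs_ptr = (uintptr_t)mux_regs;

	/* On success the ioctl returns the new config id (>= 2). */
	return ioctl(drm_fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
}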
4146
4147/**
4148 * i915_perf_remove_config_ioctl - DRM ioctl() for userspace to remove an OA config
4149 * @dev: drm device
4150 * @data: ioctl data (pointer to u64 integer) copied from userspace
4151 * @file: drm file
4152 *
4153 * Configs can be removed while being used; they will stop appearing in sysfs
4154 * and their content will be freed when the stream using the config is closed.
4155 *
4156 * Returns: 0 on success or a negative error code on failure.
4157 */
4158int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
4159                                  struct drm_file *file)
4160{
4161        struct i915_perf *perf = &to_i915(dev)->perf;
4162        u64 *arg = data;
4163        struct i915_oa_config *oa_config;
4164        int ret;
4165
4166        if (!perf->i915) {
4167                DRM_DEBUG("i915 perf interface not available for this system\n");
4168                return -ENOTSUPP;
4169        }
4170
4171        if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
4172                DRM_DEBUG("Insufficient privileges to remove i915 OA config\n");
4173                return -EACCES;
4174        }
4175
4176        ret = mutex_lock_interruptible(&perf->metrics_lock);
4177        if (ret)
4178                return ret;
4179
4180        oa_config = idr_find(&perf->metrics_idr, *arg);
4181        if (!oa_config) {
4182                DRM_DEBUG("Failed to remove unknown OA config\n");
4183                ret = -ENOENT;
4184                goto err_unlock;
4185        }
4186
4187        GEM_BUG_ON(*arg != oa_config->id);
4188
4189        sysfs_remove_group(perf->metrics_kobj, &oa_config->sysfs_metric);
4190
4191        idr_remove(&perf->metrics_idr, *arg);
4192
4193        mutex_unlock(&perf->metrics_lock);
4194
4195        DRM_DEBUG("Removed config %s id=%i\n", oa_config->uuid, oa_config->id);
4196
4197        i915_oa_config_put(oa_config);
4198
4199        return 0;
4200
4201err_unlock:
4202        mutex_unlock(&perf->metrics_lock);
4203        return ret;
4204}
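
/*
 * A matching userspace sketch for the removal path: the ioctl payload is just
 * the 64bit config id previously returned by DRM_IOCTL_I915_PERF_ADD_CONFIG.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int remove_example_oa_config(int drm_fd, uint64_t config_id)
{
	return ioctl(drm_fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config_id);
}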
4205
4206static struct ctl_table oa_table[] = {
4207        {
4208         .procname = "perf_stream_paranoid",
4209         .data = &i915_perf_stream_paranoid,
4210         .maxlen = sizeof(i915_perf_stream_paranoid),
4211         .mode = 0644,
4212         .proc_handler = proc_dointvec_minmax,
4213         .extra1 = SYSCTL_ZERO,
4214         .extra2 = SYSCTL_ONE,
4215         },
4216        {
4217         .procname = "oa_max_sample_rate",
4218         .data = &i915_oa_max_sample_rate,
4219         .maxlen = sizeof(i915_oa_max_sample_rate),
4220         .mode = 0644,
4221         .proc_handler = proc_dointvec_minmax,
4222         .extra1 = SYSCTL_ZERO,
4223         .extra2 = &oa_sample_rate_hard_limit,
4224         },
4225        {}
4226};
4227
4228static struct ctl_table i915_root[] = {
4229        {
4230         .procname = "i915",
4231         .maxlen = 0,
4232         .mode = 0555,
4233         .child = oa_table,
4234         },
4235        {}
4236};
4237
4238static struct ctl_table dev_root[] = {
4239        {
4240         .procname = "dev",
4241         .maxlen = 0,
4242         .mode = 0555,
4243         .child = i915_root,
4244         },
4245        {}
4246};
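
/*
 * The tables above end up under /proc/sys/dev/i915/. For example, a sysadmin
 * can open up system wide metrics to unprivileged users with:
 *
 *   sysctl dev.i915.perf_stream_paranoid=0
 *
 * proc_dointvec_minmax restricts perf_stream_paranoid to the range [0, 1] and
 * oa_max_sample_rate to [0, oa_sample_rate_hard_limit].
 */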
4247
4248/**
4249 * i915_perf_init - initialize i915-perf state on module bind
4250 * @i915: i915 device instance
4251 *
4252 * Initializes i915-perf state without exposing anything to userspace.
4253 *
4254 * Note: i915-perf initialization is split into an 'init' and 'register'
4255 * phase, with i915_perf_register() exposing state to userspace.
4256 */
4257void i915_perf_init(struct drm_i915_private *i915)
4258{
4259        struct i915_perf *perf = &i915->perf;
4260
4261        /* XXX const struct i915_perf_ops! */
4262
4263        if (IS_HASWELL(i915)) {
4264                perf->ops.is_valid_b_counter_reg = gen7_is_valid_b_counter_addr;
4265                perf->ops.is_valid_mux_reg = hsw_is_valid_mux_addr;
4266                perf->ops.is_valid_flex_reg = NULL;
4267                perf->ops.enable_metric_set = hsw_enable_metric_set;
4268                perf->ops.disable_metric_set = hsw_disable_metric_set;
4269                perf->ops.oa_enable = gen7_oa_enable;
4270                perf->ops.oa_disable = gen7_oa_disable;
4271                perf->ops.read = gen7_oa_read;
4272                perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
4273
4274                perf->oa_formats = hsw_oa_formats;
4275        } else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
4276                /* Note: although we could theoretically also support the
4277                 * legacy ringbuffer mode on BDW (and earlier iterations of
4278                 * this driver, before upstreaming, did so), it didn't seem
4279                 * worth the complexity to maintain now that BDW+ enables
4280                 * execlist mode by default.
4281                 */
4282                perf->ops.read = gen8_oa_read;
4283
4284                if (IS_GEN_RANGE(i915, 8, 9)) {
4285                        perf->oa_formats = gen8_plus_oa_formats;
4286
4287                        perf->ops.is_valid_b_counter_reg =
4288                                gen7_is_valid_b_counter_addr;
4289                        perf->ops.is_valid_mux_reg =
4290                                gen8_is_valid_mux_addr;
4291                        perf->ops.is_valid_flex_reg =
4292                                gen8_is_valid_flex_addr;
4293
4294                        if (IS_CHERRYVIEW(i915)) {
4295                                perf->ops.is_valid_mux_reg =
4296                                        chv_is_valid_mux_addr;
4297                        }
4298
4299                        perf->ops.oa_enable = gen8_oa_enable;
4300                        perf->ops.oa_disable = gen8_oa_disable;
4301                        perf->ops.enable_metric_set = gen8_enable_metric_set;
4302                        perf->ops.disable_metric_set = gen8_disable_metric_set;
4303                        perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
4304
4305                        if (IS_GEN(i915, 8)) {
4306                                perf->ctx_oactxctrl_offset = 0x120;
4307                                perf->ctx_flexeu0_offset = 0x2ce;
4308
4309                                perf->gen8_valid_ctx_bit = BIT(25);
4310                        } else {
4311                                perf->ctx_oactxctrl_offset = 0x128;
4312                                perf->ctx_flexeu0_offset = 0x3de;
4313
4314                                perf->gen8_valid_ctx_bit = BIT(16);
4315                        }
4316                } else if (IS_GEN_RANGE(i915, 10, 11)) {
4317                        perf->oa_formats = gen8_plus_oa_formats;
4318
4319                        perf->ops.is_valid_b_counter_reg =
4320                                gen7_is_valid_b_counter_addr;
4321                        perf->ops.is_valid_mux_reg =
4322                                gen10_is_valid_mux_addr;
4323                        perf->ops.is_valid_flex_reg =
4324                                gen8_is_valid_flex_addr;
4325
4326                        perf->ops.oa_enable = gen8_oa_enable;
4327                        perf->ops.oa_disable = gen8_oa_disable;
4328                        perf->ops.enable_metric_set = gen8_enable_metric_set;
4329                        perf->ops.disable_metric_set = gen10_disable_metric_set;
4330                        perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
4331
4332                        if (IS_GEN(i915, 10)) {
4333                                perf->ctx_oactxctrl_offset = 0x128;
4334                                perf->ctx_flexeu0_offset = 0x3de;
4335                        } else {
4336                                perf->ctx_oactxctrl_offset = 0x124;
4337                                perf->ctx_flexeu0_offset = 0x78e;
4338                        }
4339                        perf->gen8_valid_ctx_bit = BIT(16);
4340                } else if (IS_GEN(i915, 12)) {
4341                        perf->oa_formats = gen12_oa_formats;
4342
4343                        perf->ops.is_valid_b_counter_reg =
4344                                gen12_is_valid_b_counter_addr;
4345                        perf->ops.is_valid_mux_reg =
4346                                gen12_is_valid_mux_addr;
4347                        perf->ops.is_valid_flex_reg =
4348                                gen8_is_valid_flex_addr;
4349
4350                        perf->ops.oa_enable = gen12_oa_enable;
4351                        perf->ops.oa_disable = gen12_oa_disable;
4352                        perf->ops.enable_metric_set = gen12_enable_metric_set;
4353                        perf->ops.disable_metric_set = gen12_disable_metric_set;
4354                        perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
4355
4356                        perf->ctx_flexeu0_offset = 0;
4357                        perf->ctx_oactxctrl_offset = 0x144;
4358                }
4359        }
4360
4361        if (perf->ops.enable_metric_set) {
4362                mutex_init(&perf->lock);
4363
4364                oa_sample_rate_hard_limit = 1000 *
4365                        (RUNTIME_INFO(i915)->cs_timestamp_frequency_khz / 2);
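                /* For example, on a part with a 12 MHz command streamer
                 * timestamp (cs_timestamp_frequency_khz == 12000) this works
                 * out to 1000 * (12000 / 2) = 6,000,000 samples/s.
                 */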
4366
4367                mutex_init(&perf->metrics_lock);
4368                idr_init(&perf->metrics_idr);
4369
4370                /* We set up some ratelimit state to potentially throttle any
4371                 * _NOTES about spurious, invalid OA reports which we don't
4372                 * forward to userspace.
4373                 *
4374                 * We print a _NOTE about any throttling when closing the
4375                 * stream instead of waiting until driver _fini which no one
4376                 * would ever see.
4377                 *
4378                 * Using the same limiting factors as printk_ratelimit()
4379                 */
4380                ratelimit_state_init(&perf->spurious_report_rs, 5 * HZ, 10);
4381                /* Since we use a DRM_NOTE for spurious reports it would be
4382                 * inconsistent to let __ratelimit() automatically print a
4383                 * warning for throttling.
4384                 */
4385                ratelimit_set_flags(&perf->spurious_report_rs,
4386                                    RATELIMIT_MSG_ON_RELEASE);
4387
4388                atomic64_set(&perf->noa_programming_delay,
4389                             500 * 1000 /* 500us */);
4390
4391                perf->i915 = i915;
4392        }
4393}
4394
4395static int destroy_config(int id, void *p, void *data)
4396{
4397        i915_oa_config_put(p);
4398        return 0;
4399}
4400
4401void i915_perf_sysctl_register(void)
4402{
4403        sysctl_header = register_sysctl_table(dev_root);
4404}
4405
4406void i915_perf_sysctl_unregister(void)
4407{
4408        unregister_sysctl_table(sysctl_header);
4409}
4410
4411/**
4412 * i915_perf_fini - Counterpart to i915_perf_init()
4413 * @i915: i915 device instance
4414 */
4415void i915_perf_fini(struct drm_i915_private *i915)
4416{
4417        struct i915_perf *perf = &i915->perf;
4418
4419        if (!perf->i915)
4420                return;
4421
4422        idr_for_each(&perf->metrics_idr, destroy_config, perf);
4423        idr_destroy(&perf->metrics_idr);
4424
4425        memset(&perf->ops, 0, sizeof(perf->ops));
4426        perf->i915 = NULL;
4427}
4428
4429/**
4430 * i915_perf_ioctl_version - Version of the i915-perf subsystem
4431 *
4432 * This version number is used by userspace to detect available features.
4433 */
4434int i915_perf_ioctl_version(void)
4435{
4436        /*
4437         * 1: Initial version
4438         *   I915_PERF_IOCTL_ENABLE
4439         *   I915_PERF_IOCTL_DISABLE
4440         *
4441         * 2: Added runtime modification of OA config.
4442         *   I915_PERF_IOCTL_CONFIG
4443         *
4444 * 3: Added DRM_I915_PERF_PROP_HOLD_PREEMPTION parameter to hold
4445         *    preemption on a particular context so that performance data is
4446         *    accessible from a delta of MI_RPC reports without looking at the
4447         *    OA buffer.
4448         */
4449        return 3;
4450}
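
/*
 * A minimal userspace sketch of reading this version number, assuming a
 * kernel new enough to report it through the I915_PARAM_PERF_REVISION
 * getparam.
 */
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int query_perf_revision(int drm_fd)
{
	int revision = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_PERF_REVISION,
		.value = &revision,
	};

	if (ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp))
		return -1;

	return revision;
}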
4451
4452#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4453#include "selftests/i915_perf.c"
4454#endif
4455