linux/arch/x86/kernel/cpu/intel_rdt_monitor.c
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "intel_rdt.h"

#define MSR_IA32_QM_CTR          0x0c8e
#define MSR_IA32_QM_EVTSEL       0x0c8d

struct rmid_entry {
        u32                             rmid;
        int                             busy;
        struct list_head                list;
};

/**
 * @rmid_free_lru    A least recently used list of free RMIDs
 *     These RMIDs are guaranteed to have an occupancy less than the
 *     threshold occupancy
 */
static LIST_HEAD(rmid_free_lru);

/**
 * @rmid_limbo_count     count of currently unused but (potentially)
 *     dirty RMIDs.
 *     This counts RMIDs that no one is currently using but that
 *     may have an occupancy value > intel_cqm_threshold. User can change
 *     the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/**
 * @rmid_entry - The entry in the limbo and free lists.
 */
static struct rmid_entry        *rmid_ptrs;

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int intel_cqm_threshold;

static inline struct rmid_entry *__rmid_entry(u32 rmid)
{
        struct rmid_entry *entry;

        entry = &rmid_ptrs[rmid];
        WARN_ON(entry->rmid != rmid);

        return entry;
}

static u64 __rmid_read(u32 rmid, u32 eventid)
{
        u64 val;

        /*
         * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
         * with a valid event code for supported resource type and the bits
         * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
        rdmsrl(MSR_IA32_QM_CTR, val);

        return val;
}
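
/*
 * Usage note (illustrative only, not a new code path): a caller reading
 * LLC occupancy for, say, RMID 5 would do
 *
 *      val = __rmid_read(5, QOS_L3_OCCUP_EVENT_ID);
 *
 * and must treat the result as invalid if either RMID_VAL_ERROR or
 * RMID_VAL_UNAVAIL is set, as the callers below do before using the
 * counter data.
 */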

static bool rmid_dirty(struct rmid_entry *entry)
{
        u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);

        return val >= intel_cqm_threshold;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_domain *d, bool force_free)
{
        struct rmid_entry *entry;
        struct rdt_resource *r;
        u32 crmid = 1, nrmid;

        r = &rdt_resources_all[RDT_RESOURCE_L3];

        /*
         * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
         * are marked as busy for occupancy < threshold. If the occupancy
         * is less than the threshold decrement the busy counter of the
         * RMID and move it to the free list when the counter reaches 0.
         */
        for (;;) {
                nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
                if (nrmid >= r->num_rmid)
                        break;

                entry = __rmid_entry(nrmid);
                if (force_free || !rmid_dirty(entry)) {
                        clear_bit(entry->rmid, d->rmid_busy_llc);
                        if (!--entry->busy) {
                                rmid_limbo_count--;
                                list_add_tail(&entry->list, &rmid_free_lru);
                        }
                }
                crmid = nrmid + 1;
        }
}

bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
{
        return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
}

/*
 * As of now the RMID allocation is global.
 * However, we keep track of which packages the RMIDs
 * are used on, to optimize the limbo list management.
 */
int alloc_rmid(void)
{
        struct rmid_entry *entry;

        lockdep_assert_held(&rdtgroup_mutex);

        if (list_empty(&rmid_free_lru))
                return rmid_limbo_count ? -EBUSY : -ENOSPC;

        entry = list_first_entry(&rmid_free_lru,
                                 struct rmid_entry, list);
        list_del(&entry->list);

        return entry->rmid;
}
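
/*
 * Note on alloc_rmid() error returns: -EBUSY indicates that unused RMIDs
 * exist only on the limbo list (still above the occupancy threshold), so
 * the caller may succeed on a later retry; -ENOSPC indicates that every
 * RMID is genuinely allocated.
 */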

static void add_rmid_to_limbo(struct rmid_entry *entry)
{
        struct rdt_resource *r;
        struct rdt_domain *d;
        int cpu;
        u64 val;

        r = &rdt_resources_all[RDT_RESOURCE_L3];

        entry->busy = 0;
        cpu = get_cpu();
        list_for_each_entry(d, &r->domains, list) {
                if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
                        val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
                        if (val <= intel_cqm_threshold)
                                continue;
                }

                /*
                 * For the first limbo RMID in the domain,
                 * set up the limbo worker.
                 */
                if (!has_busy_rmid(r, d))
                        cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
                set_bit(entry->rmid, d->rmid_busy_llc);
                entry->busy++;
        }
        put_cpu();

        if (entry->busy)
                rmid_limbo_count++;
        else
                list_add_tail(&entry->list, &rmid_free_lru);
}

void free_rmid(u32 rmid)
{
        struct rmid_entry *entry;

        if (!rmid)
                return;

        lockdep_assert_held(&rdtgroup_mutex);

        entry = __rmid_entry(rmid);

        if (is_llc_occupancy_enabled())
                add_rmid_to_limbo(entry);
        else
                list_add_tail(&entry->list, &rmid_free_lru);
}
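
/*
 * Summary of the RMID life cycle implemented above: alloc_rmid() hands
 * out entries from rmid_free_lru. free_rmid() either returns an entry
 * straight to the free list or, when LLC occupancy monitoring is enabled,
 * marks it busy in the per-domain rmid_busy_llc bitmaps via
 * add_rmid_to_limbo(). The limbo worker (cqm_handle_limbo() below) then
 * runs __check_limbo() until the occupancy reported for that RMID drops
 * below intel_cqm_threshold on every domain, at which point the entry
 * returns to the free list.
 */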

static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
        u64 chunks, shift, tval;
        struct mbm_state *m;

        tval = __rmid_read(rmid, rr->evtid);
        if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
                rr->val = tval;
                return -EINVAL;
        }
        switch (rr->evtid) {
        case QOS_L3_OCCUP_EVENT_ID:
                rr->val += tval;
                return 0;
        case QOS_L3_MBM_TOTAL_EVENT_ID:
                m = &rr->d->mbm_total[rmid];
                break;
        case QOS_L3_MBM_LOCAL_EVENT_ID:
                m = &rr->d->mbm_local[rmid];
                break;
        default:
                /*
                 * Code would never reach here because
                 * an invalid event id would fail the __rmid_read.
                 */
                return -EINVAL;
        }

        if (rr->first) {
                m->prev_msr = tval;
                m->chunks = 0;
                return 0;
        }

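        /*
         * Compute the delta from the previous read while tolerating a wrap
         * of the MBM_CNTR_WIDTH-bit hardware counter: shifting both values
         * up by (64 - MBM_CNTR_WIDTH) bits makes the 64-bit subtraction
         * wrap exactly as the hardware counter does.
         *
         * Illustrative example (assuming MBM_CNTR_WIDTH == 24, i.e.
         * shift == 40): if prev_msr == 0xfffffe and tval == 0x000004 after
         * a wrap, then ((tval << 40) - (prev_msr << 40)) >> 40 == 6, the
         * true number of chunks counted since the last read.
         */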
        shift = 64 - MBM_CNTR_WIDTH;
        chunks = (tval << shift) - (m->prev_msr << shift);
        chunks >>= shift;
        m->chunks += chunks;
        m->prev_msr = tval;

        rr->val += m->chunks;
        return 0;
}

/*
 * This is called via IPI to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
        struct rdtgroup *rdtgrp, *entry;
        struct rmid_read *rr = info;
        struct list_head *head;

        rdtgrp = rr->rgrp;

        if (__mon_event_count(rdtgrp->mon.rmid, rr))
                return;

        /*
         * For Ctrl groups, read data from child monitor groups.
         */
        head = &rdtgrp->mon.crdtgrp_list;

        if (rdtgrp->type == RDTCTRL_GROUP) {
                list_for_each_entry(entry, head, mon.crdtgrp_list) {
                        if (__mon_event_count(entry->mon.rmid, rr))
                                return;
                }
        }
}

static void mbm_update(struct rdt_domain *d, int rmid)
{
        struct rmid_read rr;

        rr.first = false;
        rr.d = d;

        /*
         * This is protected from concurrent reads from user space
         * as both the user space reader and this worker hold the
         * global mutex.
         */
        if (is_mbm_total_enabled()) {
                rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
                __mon_event_count(rmid, &rr);
        }
        if (is_mbm_local_enabled()) {
                rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
                __mon_event_count(rmid, &rr);
        }
}

/*
 * Handler to scan the limbo list and move to the free list those
 * RMIDs whose occupancy < threshold_occupancy.
 */
void cqm_handle_limbo(struct work_struct *work)
{
        unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
        int cpu = smp_processor_id();
        struct rdt_resource *r;
        struct rdt_domain *d;

        mutex_lock(&rdtgroup_mutex);

        r = &rdt_resources_all[RDT_RESOURCE_L3];
        d = get_domain_from_cpu(cpu, r);

        if (!d) {
                pr_warn_once("Failure to get domain for limbo worker\n");
                goto out_unlock;
        }

        __check_limbo(d, false);

        if (has_busy_rmid(r, d))
                schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);

out_unlock:
        mutex_unlock(&rdtgroup_mutex);
}

void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
{
        unsigned long delay = msecs_to_jiffies(delay_ms);
        struct rdt_resource *r;
        int cpu;

        r = &rdt_resources_all[RDT_RESOURCE_L3];

        cpu = cpumask_any(&dom->cpu_mask);
        dom->cqm_work_cpu = cpu;

        schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

void mbm_handle_overflow(struct work_struct *work)
{
        unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
        struct rdtgroup *prgrp, *crgrp;
        int cpu = smp_processor_id();
        struct list_head *head;
        struct rdt_domain *d;

        mutex_lock(&rdtgroup_mutex);

        if (!static_branch_likely(&rdt_enable_key))
                goto out_unlock;

        d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
        if (!d)
                goto out_unlock;

        list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
                mbm_update(d, prgrp->mon.rmid);

                head = &prgrp->mon.crdtgrp_list;
                list_for_each_entry(crgrp, head, mon.crdtgrp_list)
                        mbm_update(d, crgrp->mon.rmid);
        }

        schedule_delayed_work_on(cpu, &d->mbm_over, delay);

out_unlock:
        mutex_unlock(&rdtgroup_mutex);
}

void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
{
        unsigned long delay = msecs_to_jiffies(delay_ms);
        int cpu;

        if (!static_branch_likely(&rdt_enable_key))
                return;
        cpu = cpumask_any(&dom->cpu_mask);
        dom->mbm_work_cpu = cpu;
        schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

static int dom_data_init(struct rdt_resource *r)
{
        struct rmid_entry *entry = NULL;
        int i, nr_rmids;

        nr_rmids = r->num_rmid;
        rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
        if (!rmid_ptrs)
                return -ENOMEM;

        for (i = 0; i < nr_rmids; i++) {
                entry = &rmid_ptrs[i];
                INIT_LIST_HEAD(&entry->list);

                entry->rmid = i;
                list_add_tail(&entry->list, &rmid_free_lru);
        }

        /*
         * RMID 0 is special and is always allocated. It's used for all
         * tasks that are not monitored.
         */
        entry = __rmid_entry(0);
        list_del(&entry->list);

        return 0;
}

static struct mon_evt llc_occupancy_event = {
        .name           = "llc_occupancy",
        .evtid          = QOS_L3_OCCUP_EVENT_ID,
};

static struct mon_evt mbm_total_event = {
        .name           = "mbm_total_bytes",
        .evtid          = QOS_L3_MBM_TOTAL_EVENT_ID,
};

static struct mon_evt mbm_local_event = {
        .name           = "mbm_local_bytes",
        .evtid          = QOS_L3_MBM_LOCAL_EVENT_ID,
};

/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of RDT_RESOURCE_L3 resource
 * because as per the SDM the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
        INIT_LIST_HEAD(&r->evt_list);

        if (is_llc_occupancy_enabled())
                list_add_tail(&llc_occupancy_event.list, &r->evt_list);
        if (is_mbm_total_enabled())
                list_add_tail(&mbm_total_event.list, &r->evt_list);
        if (is_mbm_local_enabled())
                list_add_tail(&mbm_local_event.list, &r->evt_list);
}

int rdt_get_mon_l3_config(struct rdt_resource *r)
{
        int ret;

        r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
        r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;

        /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
        intel_cqm_threshold /= r->mon_scale;
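
        /*
         * Worked example for the 35MB/56-RMID case above, assuming a
         * hypothetical occupancy scale of 64 bytes per chunk:
         * 35 * 1024 KB * 1024 / 56 = 655360 bytes (640 KB) per RMID, which
         * is 640 KB / 35 MB ~= 1.8% of the LLC, giving
         * 655360 / 64 = 10240 as the value of intel_cqm_threshold.
         */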

        ret = dom_data_init(r);
        if (ret)
                return ret;

        l3_mon_evt_init(r);

        r->mon_capable = true;
        r->mon_enabled = true;

        return 0;
}