linux/arch/x86/kernel/cpu/intel_rdt_monitor.c
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "intel_rdt.h"

#define MSR_IA32_QM_CTR                 0x0c8e
#define MSR_IA32_QM_EVTSEL              0x0c8d

struct rmid_entry {
        u32                             rmid;
        int                             busy;
        struct list_head                list;
};

/**
 * @rmid_free_lru    A least recently used list of free RMIDs
 *     These RMIDs are guaranteed to have an occupancy less than the
 *     threshold occupancy
 */
static LIST_HEAD(rmid_free_lru);

/**
 * @rmid_limbo_count     count of currently unused but (potentially)
 *     dirty RMIDs.
 *     This counts RMIDs that no one is currently using but that
 *     may have an occupancy value > intel_cqm_threshold. User can change
 *     the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/**
 * @rmid_ptrs    Array of rmid_entry structures, indexed by RMID. Entries
 *     are linked into either the free or the limbo list.
 */
static struct rmid_entry        *rmid_ptrs;

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int intel_cqm_threshold;

static inline struct rmid_entry *__rmid_entry(u32 rmid)
{
        struct rmid_entry *entry;

        entry = &rmid_ptrs[rmid];
        WARN_ON(entry->rmid != rmid);

        return entry;
}

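/*
 * Read the monitored data for @eventid and @rmid: wrmsr() places @eventid
 * in the low half and @rmid in the high half of IA32_QM_EVTSEL, and the
 * result is then read back from IA32_QM_CTR. The counters are scoped to
 * the L3 domain of the CPU doing the read, so callers must run on a CPU
 * belonging to the domain they want to query (see mon_event_count(), which
 * is called via IPI, and the cpumask check in add_rmid_to_limbo()).
 */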
static u64 __rmid_read(u32 rmid, u32 eventid)
{
        u64 val;

        /*
         * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
         * with a valid event code for a supported resource type and the bits
         * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with a valid RMID,
         * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
         * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
         * are error bits.
         */
        wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
        rdmsrl(MSR_IA32_QM_CTR, val);

        return val;
}

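/*
 * An RMID counts as "dirty" while its reported LLC occupancy is still at
 * or above intel_cqm_threshold, i.e. it cannot be handed out again yet.
 */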
static bool rmid_dirty(struct rmid_entry *entry)
{
        u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);

        return val >= intel_cqm_threshold;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold, clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_domain *d, bool force_free)
{
        struct rmid_entry *entry;
        struct rdt_resource *r;
        u32 crmid = 1, nrmid;

        r = &rdt_resources_all[RDT_RESOURCE_L3];

        /*
         * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
         * are marked as busy for occupancy < threshold. If the occupancy
         * is less than the threshold, decrement the busy counter of the
         * RMID and move it to the free list when the counter reaches 0.
         */
        for (;;) {
                nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
                if (nrmid >= r->num_rmid)
                        break;

                entry = __rmid_entry(nrmid);
                if (force_free || !rmid_dirty(entry)) {
                        clear_bit(entry->rmid, d->rmid_busy_llc);
                        if (!--entry->busy) {
                                rmid_limbo_count--;
                                list_add_tail(&entry->list, &rmid_free_lru);
                        }
                }
                crmid = nrmid + 1;
        }
}

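/*
 * Return true if domain @d still has any RMID marked busy in its
 * rmid_busy_llc bitmap, i.e. RMIDs still waiting for their occupancy
 * to drop below the threshold.
 */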
bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
{
        return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
}

/*
 * As of now the RMID allocation is global.
 * However we keep track of which packages the RMIDs
 * are used on to optimize the limbo list management.
 */
int alloc_rmid(void)
{
        struct rmid_entry *entry;

        lockdep_assert_held(&rdtgroup_mutex);

        if (list_empty(&rmid_free_lru))
                return rmid_limbo_count ? -EBUSY : -ENOSPC;

        entry = list_first_entry(&rmid_free_lru,
                                 struct rmid_entry, list);
        list_del(&entry->list);

        return entry->rmid;
}

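/*
 * A freed RMID may still have cache lines tagged to it in the LLC. For
 * each domain where its occupancy is still above intel_cqm_threshold,
 * mark the RMID busy so the limbo worker can recheck it later; otherwise
 * put it straight back on the free list. The occupancy can only be read
 * on a CPU in the domain, so remote domains are conservatively treated
 * as busy.
 */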
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
        struct rdt_resource *r;
        struct rdt_domain *d;
        int cpu;
        u64 val;

        r = &rdt_resources_all[RDT_RESOURCE_L3];

        entry->busy = 0;
        cpu = get_cpu();
        list_for_each_entry(d, &r->domains, list) {
                if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
                        val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
                        if (val <= intel_cqm_threshold)
                                continue;
                }

                /*
                 * For the first limbo RMID in the domain,
                 * set up the limbo worker.
                 */
                if (!has_busy_rmid(r, d))
                        cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
                set_bit(entry->rmid, d->rmid_busy_llc);
                entry->busy++;
        }
        put_cpu();

        if (entry->busy)
                rmid_limbo_count++;
        else
                list_add_tail(&entry->list, &rmid_free_lru);
}

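/*
 * Free an RMID. RMID 0 is reserved (it is always allocated, see
 * dom_data_init()) and is never freed. When LLC occupancy monitoring is
 * enabled the RMID goes through the limbo list first; otherwise it is
 * immediately reusable.
 */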
void free_rmid(u32 rmid)
{
        struct rmid_entry *entry;

        if (!rmid)
                return;

        lockdep_assert_held(&rdtgroup_mutex);

        entry = __rmid_entry(rmid);

        if (is_llc_occupancy_enabled())
                add_rmid_to_limbo(entry);
        else
                list_add_tail(&entry->list, &rmid_free_lru);
}

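/*
 * The hardware MBM counters are only MBM_CNTR_WIDTH bits wide (24 bits at
 * the time of writing), so they can wrap between two reads. Shifting both
 * the previous and the current value up into the top MBM_CNTR_WIDTH bits
 * of a u64 makes the unsigned subtraction wrap the same way the hardware
 * counter does; shifting the difference back down yields the number of
 * chunks counted since the previous read, even across a counter wrap.
 */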
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
{
        u64 shift = 64 - MBM_CNTR_WIDTH, chunks;

        chunks = (cur_msr << shift) - (prev_msr << shift);
        return chunks >> shift;
}

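/*
 * Read one event for one RMID and accumulate it into rr->val. Occupancy
 * reads are used as-is; MBM reads are converted into a monotonically
 * growing chunk count via the per-domain mbm_state, compensating for
 * counter wraps. Returns -EINVAL if the hardware reported the Error or
 * Unavailable bit, with the raw value left in rr->val for the caller.
 */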
static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
        struct mbm_state *m;
        u64 chunks, tval;

        tval = __rmid_read(rmid, rr->evtid);
        if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
                rr->val = tval;
                return -EINVAL;
        }
        switch (rr->evtid) {
        case QOS_L3_OCCUP_EVENT_ID:
                rr->val += tval;
                return 0;
        case QOS_L3_MBM_TOTAL_EVENT_ID:
                m = &rr->d->mbm_total[rmid];
                break;
        case QOS_L3_MBM_LOCAL_EVENT_ID:
                m = &rr->d->mbm_local[rmid];
                break;
        default:
                /*
                 * Code would never reach here because an invalid
                 * event id would fail the __rmid_read above.
                 */
                return -EINVAL;
        }

        if (rr->first) {
                memset(m, 0, sizeof(struct mbm_state));
                m->prev_bw_msr = m->prev_msr = tval;
                return 0;
        }

        chunks = mbm_overflow_count(m->prev_msr, tval);
        m->chunks += chunks;
        m->prev_msr = tval;

        rr->val += m->chunks;
        return 0;
}

/*
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps. The MBM overflow timer runs once per
 * second, so the chunks counted since the last run, scaled to bytes and
 * shifted down by 20, directly give the bandwidth in MBps.
 */
static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
{
        struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
        struct mbm_state *m = &rr->d->mbm_local[rmid];
        u64 tval, cur_bw, chunks;

        tval = __rmid_read(rmid, rr->evtid);
        if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                return;

        chunks = mbm_overflow_count(m->prev_bw_msr, tval);
        m->chunks_bw += chunks;
        m->chunks = m->chunks_bw;
        cur_bw = (chunks * r->mon_scale) >> 20;

        if (m->delta_comp)
                m->delta_bw = abs(cur_bw - m->prev_bw);
        m->delta_comp = false;
        m->prev_bw = cur_bw;
        m->prev_bw_msr = tval;
}

/*
 * This is called via IPI to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
        struct rdtgroup *rdtgrp, *entry;
        struct rmid_read *rr = info;
        struct list_head *head;

        rdtgrp = rr->rgrp;

        if (__mon_event_count(rdtgrp->mon.rmid, rr))
                return;

        /*
         * For Ctrl groups, read data from child monitor groups.
         */
        head = &rdtgrp->mon.crdtgrp_list;

        if (rdtgrp->type == RDTCTRL_GROUP) {
                list_for_each_entry(entry, head, mon.crdtgrp_list) {
                        if (__mon_event_count(entry->mon.rmid, rr))
                                return;
                }
        }
}

/*
 * Feedback loop for MBA software controller (mba_sc)
 *
 * mba_sc is a feedback loop where we periodically read MBM counters and
 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
 * that:
 *
 *   current bandwidth (cur_bw) < user specified bandwidth (user_bw)
 *
 * This uses the MBM counters to measure the bandwidth and MBA throttle
 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
 * fact that resctrl rdtgroups have both monitoring and control.
 *
 * The frequency of the checks is 1s and we just tag along the MBM overflow
 * timer. Having a 1s interval makes the calculation of bandwidth simpler.
 *
 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
 * be a need to increase the bandwidth to avoid unnecessarily restricting
 * the L2 <-> L3 traffic.
 *
 * Since MBA controls the L2 external bandwidth whereas MBM measures the
 * L3 external bandwidth, the following sequence could lead to such a
 * situation.
 *
 * Consider an rdtgroup which had high L3 <-> memory traffic in its initial
 * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
 * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
 *
 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
 * throttle MSRs already have low percentage values.  To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
        u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
        struct mbm_state *pmbm_data, *cmbm_data;
        u32 cur_bw, delta_bw, user_bw;
        struct rdt_resource *r_mba;
        struct rdt_domain *dom_mba;
        struct list_head *head;
        struct rdtgroup *entry;

        r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
        closid = rgrp->closid;
        rmid = rgrp->mon.rmid;
        pmbm_data = &dom_mbm->mbm_local[rmid];

        dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
        if (!dom_mba) {
                pr_warn_once("Failure to get domain for MBA update\n");
                return;
        }

        cur_bw = pmbm_data->prev_bw;
        user_bw = dom_mba->mbps_val[closid];
        delta_bw = pmbm_data->delta_bw;
        cur_msr_val = dom_mba->ctrl_val[closid];

        /*
         * For Ctrl groups, add in the bandwidth of the child monitor groups.
         */
        head = &rgrp->mon.crdtgrp_list;
        list_for_each_entry(entry, head, mon.crdtgrp_list) {
                cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
                cur_bw += cmbm_data->prev_bw;
                delta_bw += cmbm_data->delta_bw;
        }

        /*
         * Scale up/down the bandwidth linearly for the ctrl group.  The
         * bandwidth step is the bandwidth granularity specified by the
         * hardware.
         *
         * The delta_bw is used when increasing the bandwidth so that we
         * don't alternately increase and decrease the control values
         * continuously.
         *
         * For example: consider cur_bw = 90MBps, user_bw = 100MBps and a
         * bandwidth step of 20MBps (> user_bw - cur_bw); we would keep
         * switching between 90 and 110 continuously if we only check
         * cur_bw < user_bw.
         */
        if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
                new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
        } else if (cur_msr_val < MAX_MBA_BW &&
                   (user_bw > (cur_bw + delta_bw))) {
                new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
        } else {
                return;
        }

        cur_msr = r_mba->msr_base + closid;
        wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
        dom_mba->ctrl_val[closid] = new_msr_val;

        /*
         * Delta values are updated dynamically, package-wise, for each
         * rdtgrp every time the throttle MSR changes value.
         *
         * This is because (1) the increase in bandwidth is not perfectly
         * linear and only "approximately" linear even when the hardware
         * says it is linear. (2) Also, since MBA is a core-specific
         * mechanism, the delta values vary based on the number of cores
         * used by the rdtgrp.
         */
        pmbm_data->delta_comp = true;
        list_for_each_entry(entry, head, mon.crdtgrp_list) {
                cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
                cmbm_data->delta_comp = true;
        }
}

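/*
 * Read all enabled MBM events for @rmid on domain @d. When the MBA
 * software controller is enabled, the local bandwidth event is used to
 * compute bandwidth in MBps instead of a raw byte count.
 */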
static void mbm_update(struct rdt_domain *d, int rmid)
{
        struct rmid_read rr;

        rr.first = false;
        rr.d = d;

        /*
         * This is protected from concurrent reads from user space
         * as both the user space reads and this worker hold the
         * global rdtgroup_mutex.
         */
        if (is_mbm_total_enabled()) {
                rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
                __mon_event_count(rmid, &rr);
        }
        if (is_mbm_local_enabled()) {
                rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;

                /*
                 * Call the MBA software controller only for the
                 * control groups and when the user has enabled
                 * the software controller explicitly.
                 */
                if (!is_mba_sc(NULL))
                        __mon_event_count(rmid, &rr);
                else
                        mbm_bw_count(rmid, &rr);
        }
}

/*
 * Handler to scan the limbo list and move RMIDs whose occupancy has
 * dropped below intel_cqm_threshold back to the free list.
 */
void cqm_handle_limbo(struct work_struct *work)
{
        unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
        int cpu = smp_processor_id();
        struct rdt_resource *r;
        struct rdt_domain *d;

        mutex_lock(&rdtgroup_mutex);

        r = &rdt_resources_all[RDT_RESOURCE_L3];
        d = get_domain_from_cpu(cpu, r);

        if (!d) {
                pr_warn_once("Failure to get domain for limbo worker\n");
                goto out_unlock;
        }

        __check_limbo(d, false);

        if (has_busy_rmid(r, d))
                schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);

out_unlock:
        mutex_unlock(&rdtgroup_mutex);
}

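/*
 * Schedule the limbo worker to run on one CPU of @dom after @delay_ms
 * milliseconds, and record the chosen CPU in cqm_work_cpu.
 */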
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
{
        unsigned long delay = msecs_to_jiffies(delay_ms);
        struct rdt_resource *r;
        int cpu;

        r = &rdt_resources_all[RDT_RESOURCE_L3];

        cpu = cpumask_any(&dom->cpu_mask);
        dom->cqm_work_cpu = cpu;

        schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

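/*
 * Periodic worker (run every MBM_OVERFLOW_INTERVAL) that reads the MBM
 * counters of every group on this domain so that counter wraps are
 * observed and accounted for, and that drives the MBA software controller
 * when it is enabled.
 */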
void mbm_handle_overflow(struct work_struct *work)
{
        unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
        struct rdtgroup *prgrp, *crgrp;
        int cpu = smp_processor_id();
        struct list_head *head;
        struct rdt_domain *d;

        mutex_lock(&rdtgroup_mutex);

        if (!static_branch_likely(&rdt_enable_key))
                goto out_unlock;

        d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
        if (!d)
                goto out_unlock;

        list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
                mbm_update(d, prgrp->mon.rmid);

                head = &prgrp->mon.crdtgrp_list;
                list_for_each_entry(crgrp, head, mon.crdtgrp_list)
                        mbm_update(d, crgrp->mon.rmid);

                if (is_mba_sc(NULL))
                        update_mba_bw(prgrp, d);
        }

        schedule_delayed_work_on(cpu, &d->mbm_over, delay);

out_unlock:
        mutex_unlock(&rdtgroup_mutex);
}

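/*
 * Schedule the MBM overflow worker on one CPU of @dom after @delay_ms
 * milliseconds, and record the chosen CPU in mbm_work_cpu. This is a
 * no-op unless rdt_enable_key is set.
 */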
void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
{
        unsigned long delay = msecs_to_jiffies(delay_ms);
        int cpu;

        if (!static_branch_likely(&rdt_enable_key))
                return;
        cpu = cpumask_any(&dom->cpu_mask);
        dom->mbm_work_cpu = cpu;
        schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

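/*
 * Allocate the rmid_ptrs[] array, one entry per RMID supported by @r, and
 * seed the free list with every RMID except RMID 0, which is reserved.
 */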
static int dom_data_init(struct rdt_resource *r)
{
        struct rmid_entry *entry = NULL;
        int i, nr_rmids;

        nr_rmids = r->num_rmid;
        rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
        if (!rmid_ptrs)
                return -ENOMEM;

        for (i = 0; i < nr_rmids; i++) {
                entry = &rmid_ptrs[i];
                INIT_LIST_HEAD(&entry->list);

                entry->rmid = i;
                list_add_tail(&entry->list, &rmid_free_lru);
        }

        /*
         * RMID 0 is special and is always allocated. It's used for all
         * tasks that are not monitored.
         */
        entry = __rmid_entry(0);
        list_del(&entry->list);

        return 0;
}

static struct mon_evt llc_occupancy_event = {
        .name           = "llc_occupancy",
        .evtid          = QOS_L3_OCCUP_EVENT_ID,
};

static struct mon_evt mbm_total_event = {
        .name           = "mbm_total_bytes",
        .evtid          = QOS_L3_MBM_TOTAL_EVENT_ID,
};

static struct mon_evt mbm_local_event = {
        .name           = "mbm_local_bytes",
        .evtid          = QOS_L3_MBM_LOCAL_EVENT_ID,
};

/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of the RDT_RESOURCE_L3 resource
 * because, as per the SDM, the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
        INIT_LIST_HEAD(&r->evt_list);

        if (is_llc_occupancy_enabled())
                list_add_tail(&llc_occupancy_event.list, &r->evt_list);
        if (is_mbm_total_enabled())
                list_add_tail(&mbm_total_event.list, &r->evt_list);
        if (is_mbm_local_enabled())
                list_add_tail(&mbm_local_event.list, &r->evt_list);
}

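/*
 * Called during RDT initialization to set up L3 monitoring for @r:
 * compute the number of RMIDs and the counter scale from boot_cpu_data,
 * derive the limbo threshold, allocate the RMID state and build the
 * event list.
 */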
int rdt_get_mon_l3_config(struct rdt_resource *r)
{
        int ret;

        r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
        r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;

        /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
        intel_cqm_threshold /= r->mon_scale;

        ret = dom_data_init(r);
        if (ret)
                return ret;

        l3_mon_evt_init(r);

        r->mon_capable = true;
        r->mon_enabled = true;

        return 0;
}