linux/arch/x86/mm/pat/memtype.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
   4 *
   5 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
   6 *          Suresh B Siddha <suresh.b.siddha@intel.com>
   7 *
   8 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
   9 *
  10 * Basic principles:
  11 *
  12 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
  13 * the kernel to set one of a handful of 'caching type' attributes for physical
  14 * memory ranges: uncached, write-combining, write-through, write-protected,
  15 * and the most commonly used and default attribute: write-back caching.
  16 *
  17 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
  18 * a hardware interface to enumerate a limited number of physical memory ranges
  19 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
  20 * Even modern CPUs have MTRRs enabled - but these are typically not touched
  21 * by the kernel or by user-space (such as the X server); we rely on PAT for any
  22 * additional cache attribute logic.
  23 *
  24 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
  25 * cache attribute information to the mapped memory range: three bits are used
  26 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
  27 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
  28 *
  29 * ( There's a metric ton of finer details, such as compatibility with CPU quirks
  30 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
  31 *   below for details. )
  32 */
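/*
 * Illustrative sketch of the encoding described above (not used by the code
 * in this file): how the three PTE bits select a PAT slot and how the MSR
 * supplies one memory type per slot:
 *
 *        slot = (!!(pte & _PAGE_PAT) << 2) | (!!(pte & _PAGE_PCD) << 1) |
 *                !!(pte & _PAGE_PWT);
 *        type = (MSR_IA32_CR_PAT value >> (slot * 8)) & 0x7;
 */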
  33
  34#include <linux/seq_file.h>
  35#include <linux/memblock.h>
  36#include <linux/debugfs.h>
  37#include <linux/ioport.h>
  38#include <linux/kernel.h>
  39#include <linux/pfn_t.h>
  40#include <linux/slab.h>
  41#include <linux/mm.h>
  42#include <linux/fs.h>
  43#include <linux/rbtree.h>
  44
  45#include <asm/cacheflush.h>
  46#include <asm/processor.h>
  47#include <asm/tlbflush.h>
  48#include <asm/x86_init.h>
  49#include <asm/fcntl.h>
  50#include <asm/e820/api.h>
  51#include <asm/mtrr.h>
  52#include <asm/page.h>
  53#include <asm/msr.h>
  54#include <asm/memtype.h>
  55#include <asm/io.h>
  56
  57#include "memtype.h"
  58#include "../mm_internal.h"
  59
  60#undef pr_fmt
  61#define pr_fmt(fmt) "" fmt
  62
  63static bool __read_mostly pat_bp_initialized;
  64static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
  65static bool __read_mostly pat_bp_enabled;
  66static bool __read_mostly pat_cm_initialized;
  67
  68/*
  69 * PAT support is enabled by default, but can be disabled for
  70 * various user-requested or hardware-forced reasons:
  71 */
  72void pat_disable(const char *msg_reason)
  73{
  74        if (pat_disabled)
  75                return;
  76
  77        if (pat_bp_initialized) {
  78                WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
  79                return;
  80        }
  81
  82        pat_disabled = true;
  83        pr_info("x86/PAT: %s\n", msg_reason);
  84}
  85
  86static int __init nopat(char *str)
  87{
  88        pat_disable("PAT support disabled via boot option.");
  89        return 0;
  90}
  91early_param("nopat", nopat);
  92
  93bool pat_enabled(void)
  94{
  95        return pat_bp_enabled;
  96}
  97EXPORT_SYMBOL_GPL(pat_enabled);
  98
  99int pat_debug_enable;
 100
 101static int __init pat_debug_setup(char *str)
 102{
 103        pat_debug_enable = 1;
 104        return 0;
 105}
 106__setup("debugpat", pat_debug_setup);
 107
 108#ifdef CONFIG_X86_PAT
 109/*
 110 * X86 PAT uses page flags arch_1 and uncached together to keep track of
 111 * the memory type of pages that have a backing struct page.
 112 *
 113 * X86 PAT supports 4 different memory types:
 114 *  - _PAGE_CACHE_MODE_WB
 115 *  - _PAGE_CACHE_MODE_WC
 116 *  - _PAGE_CACHE_MODE_UC_MINUS
 117 *  - _PAGE_CACHE_MODE_WT
 118 *
 119 * _PAGE_CACHE_MODE_WB is the default type.
 120 */
 121
 122#define _PGMT_WB                0
 123#define _PGMT_WC                (1UL << PG_arch_1)
 124#define _PGMT_UC_MINUS          (1UL << PG_uncached)
 125#define _PGMT_WT                (1UL << PG_uncached | 1UL << PG_arch_1)
 126#define _PGMT_MASK              (1UL << PG_uncached | 1UL << PG_arch_1)
 127#define _PGMT_CLEAR_MASK        (~_PGMT_MASK)
 128
 129static inline enum page_cache_mode get_page_memtype(struct page *pg)
 130{
 131        unsigned long pg_flags = pg->flags & _PGMT_MASK;
 132
 133        if (pg_flags == _PGMT_WB)
 134                return _PAGE_CACHE_MODE_WB;
 135        else if (pg_flags == _PGMT_WC)
 136                return _PAGE_CACHE_MODE_WC;
 137        else if (pg_flags == _PGMT_UC_MINUS)
 138                return _PAGE_CACHE_MODE_UC_MINUS;
 139        else
 140                return _PAGE_CACHE_MODE_WT;
 141}
 142
 143static inline void set_page_memtype(struct page *pg,
 144                                    enum page_cache_mode memtype)
 145{
 146        unsigned long memtype_flags;
 147        unsigned long old_flags;
 148        unsigned long new_flags;
 149
 150        switch (memtype) {
 151        case _PAGE_CACHE_MODE_WC:
 152                memtype_flags = _PGMT_WC;
 153                break;
 154        case _PAGE_CACHE_MODE_UC_MINUS:
 155                memtype_flags = _PGMT_UC_MINUS;
 156                break;
 157        case _PAGE_CACHE_MODE_WT:
 158                memtype_flags = _PGMT_WT;
 159                break;
 160        case _PAGE_CACHE_MODE_WB:
 161        default:
 162                memtype_flags = _PGMT_WB;
 163                break;
 164        }
 165
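        /*
         * Lockless update: retry the cmpxchg() until no other page-flag
         * update raced in between reading old_flags and installing new_flags.
         */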
 166        do {
 167                old_flags = pg->flags;
 168                new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
 169        } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
 170}
 171#else
 172static inline enum page_cache_mode get_page_memtype(struct page *pg)
 173{
 174        return -1;
 175}
 176static inline void set_page_memtype(struct page *pg,
 177                                    enum page_cache_mode memtype)
 178{
 179}
 180#endif
 181
 182enum {
 183        PAT_UC = 0,             /* uncached */
 184        PAT_WC = 1,             /* Write combining */
 185        PAT_WT = 4,             /* Write Through */
 186        PAT_WP = 5,             /* Write Protected */
 187        PAT_WB = 6,             /* Write Back (default) */
 188        PAT_UC_MINUS = 7,       /* UC, but can be overridden by MTRR */
 189};
 190
 191#define CM(c) (_PAGE_CACHE_MODE_ ## c)
 192
 193static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
 194{
 195        enum page_cache_mode cache;
 196        char *cache_mode;
 197
 198        switch (pat_val) {
 199        case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
 200        case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
 201        case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
 202        case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
 203        case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
 204        case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
 205        default:           cache = CM(WB);       cache_mode = "WB  "; break;
 206        }
 207
 208        memcpy(msg, cache_mode, 4);
 209
 210        return cache;
 211}
 212
 213#undef CM
 214
 215/*
 216 * Update the cache-mode-to-pgprot translation tables according to the PAT
 217 * configuration.
 218 * Using lower indices is preferred, so we start with the highest index.
 219 */
 220static void __init_cache_modes(u64 pat)
 221{
 222        enum page_cache_mode cache;
 223        char pat_msg[33];
 224        int i;
 225
 226        WARN_ON_ONCE(pat_cm_initialized);
 227
 228        pat_msg[32] = 0;
 229        for (i = 7; i >= 0; i--) {
 230                cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
 231                                           pat_msg + 4 * i);
 232                update_cache_mode_entry(i, cache);
 233        }
 234        pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
 235
 236        pat_cm_initialized = true;
 237}
 238
 239#define PAT(x, y)       ((u64)PAT_ ## y << ((x)*8))
 240
 241static void pat_bp_init(u64 pat)
 242{
 243        u64 tmp_pat;
 244
 245        if (!boot_cpu_has(X86_FEATURE_PAT)) {
 246                pat_disable("PAT not supported by the CPU.");
 247                return;
 248        }
 249
 250        rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
 251        if (!tmp_pat) {
 252                pat_disable("PAT support disabled by the firmware.");
 253                return;
 254        }
 255
 256        wrmsrl(MSR_IA32_CR_PAT, pat);
 257        pat_bp_enabled = true;
 258
 259        __init_cache_modes(pat);
 260}
 261
 262static void pat_ap_init(u64 pat)
 263{
 264        if (!boot_cpu_has(X86_FEATURE_PAT)) {
 265                /*
 266                 * If this happens we are on a secondary CPU, but switched to
 267                 * PAT on the boot CPU. We have no way to undo PAT.
 268                 */
 269                panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
 270        }
 271
 272        wrmsrl(MSR_IA32_CR_PAT, pat);
 273}
 274
 275void init_cache_modes(void)
 276{
 277        u64 pat = 0;
 278
 279        if (pat_cm_initialized)
 280                return;
 281
 282        if (boot_cpu_has(X86_FEATURE_PAT)) {
 283                /*
 284                 * CPU supports PAT. Set the PAT table to be consistent with
 285                 * the PAT MSR. This case supports the "nopat" boot option, and
 286                 * virtual machine environments which support PAT without
 287                 * MTRRs. In particular, Xen has a unique setup for the PAT MSR.
 288                 *
 289                 * If the PAT MSR reads as 0, it is considered invalid and
 290                 * emulated as no PAT.
 291                 */
 292                rdmsrl(MSR_IA32_CR_PAT, pat);
 293        }
 294
 295        if (!pat) {
 296                /*
 297                 * No PAT. Emulate the PAT table that corresponds to the two
 298                 * cache bits, PWT (Write Through) and PCD (Cache Disable).
 299                 * This setup is also the same as the BIOS default setup.
 300                 *
 301                 * PTE encoding:
 302                 *
 303                 *       PCD
 304                 *       |PWT  PAT
 305                 *       ||    slot
 306                 *       00    0    WB : _PAGE_CACHE_MODE_WB
 307                 *       01    1    WT : _PAGE_CACHE_MODE_WT
 308                 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 309                 *       11    3    UC : _PAGE_CACHE_MODE_UC
 310                 *
 311                 * NOTE: When WC or WP is used, it is redirected to UC- per
 312                 * the default setup in __cachemode2pte_tbl[].
 313                 */
 314                pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
 315                      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
 316        }
 317
 318        __init_cache_modes(pat);
 319}
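/*
 * For reference: with the PAT() encoding above, the table built by
 * init_cache_modes() when the PAT MSR is unavailable evaluates to
 * 0x0007040600070406, which is also the architectural power-up default
 * of the IA32_PAT MSR.
 */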
 320
 321/**
 322 * pat_init - Initialize the PAT MSR and PAT table on the current CPU
 323 *
 324 * This function initializes the PAT MSR and PAT table with an OS-defined value
 325 * to enable additional cache attributes, WC, WT and WP.
 326 *
 327 * This function must be called on all CPUs using the specific sequence of
 328 * operations defined in the Intel SDM. mtrr_rendezvous_handler() provides this
 329 * procedure for PAT.
 330 */
 331void pat_init(void)
 332{
 333        u64 pat;
 334        struct cpuinfo_x86 *c = &boot_cpu_data;
 335
 336#ifndef CONFIG_X86_PAT
 337        pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
 338#endif
 339
 340        if (pat_disabled)
 341                return;
 342
 343        if ((c->x86_vendor == X86_VENDOR_INTEL) &&
 344            (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
 345             ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
 346                /*
 347                 * PAT support with the lower four entries. Intel Pentium 2,
 348                 * 3, M, and 4 are affected by PAT errata, which makes the
 349                 * upper four entries unusable. To be on the safe side, we don't
 350                 * use those.
 351                 *
 352                 *  PTE encoding:
 353                 *      PAT
 354                 *      |PCD
 355                 *      ||PWT  PAT
 356                 *      |||    slot
 357                 *      000    0    WB : _PAGE_CACHE_MODE_WB
 358                 *      001    1    WC : _PAGE_CACHE_MODE_WC
 359                 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 360                 *      011    3    UC : _PAGE_CACHE_MODE_UC
 361                 * PAT bit unused
 362                 *
 363                 * NOTE: When WT or WP is used, it is redirected to UC- per
 364                 * the default setup in __cachemode2pte_tbl[].
 365                 */
 366                pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
 367                      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
 368        } else {
 369                /*
 370                 * Full PAT support.  We put WT in slot 7 to improve
 371                 * robustness in the presence of errata that might cause
 372                 * the high PAT bit to be ignored.  This way, a buggy slot 7
 373                 * access will hit slot 3, and slot 3 is UC, so at worst
 374                 * we lose performance without causing a correctness issue.
 375                 * Pentium 4 erratum N46 is an example of such an erratum,
 376                 * although we try not to use PAT at all on affected CPUs.
 377                 *
 378                 *  PTE encoding:
 379                 *      PAT
 380                 *      |PCD
 381                 *      ||PWT  PAT
 382                 *      |||    slot
 383                 *      000    0    WB : _PAGE_CACHE_MODE_WB
 384                 *      001    1    WC : _PAGE_CACHE_MODE_WC
 385                 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 386                 *      011    3    UC : _PAGE_CACHE_MODE_UC
 387                 *      100    4    WB : Reserved
 388                 *      101    5    WP : _PAGE_CACHE_MODE_WP
 389                 *      110    6    UC-: Reserved
 390                 *      111    7    WT : _PAGE_CACHE_MODE_WT
 391                 *
 392                 * The reserved slots are unused, but mapped to their
 393                 * corresponding types in the presence of PAT errata.
 394                 */
 395                pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
 396                      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
 397        }
 398
 399        if (!pat_bp_initialized) {
 400                pat_bp_init(pat);
 401                pat_bp_initialized = true;
 402        } else {
 403                pat_ap_init(pat);
 404        }
 405}
 406
 407#undef PAT
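/*
 * For reference, the two tables programmed by pat_init() evaluate to:
 *   0x0007010600070106 - errata-limited layout (lower four entries only)
 *   0x0407050600070106 - full layout (WB, WC, UC-, UC, WB, WP, UC-, WT)
 */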
 408
 409static DEFINE_SPINLOCK(memtype_lock);   /* protects memtype accesses */
 410
 411/*
 412 * Intersects the PAT memory type with the MTRR memory type and returns
 413 * the resulting memory type as PAT understands it.
 414 * (The type encodings used by PAT and MTRR do not have the same values.)
 415 * The intersection is based on the "Effective Memory Type" tables in the
 416 * IA-32 SDM, vol. 3a.
 417 */
 418static unsigned long pat_x_mtrr_type(u64 start, u64 end,
 419                                     enum page_cache_mode req_type)
 420{
 421        /*
 422         * Look for MTRR hint to get the effective type in case where PAT
 423         * request is for WB.
 424         */
 425        if (req_type == _PAGE_CACHE_MODE_WB) {
 426                u8 mtrr_type, uniform;
 427
 428                mtrr_type = mtrr_type_lookup(start, end, &uniform);
 429                if (mtrr_type != MTRR_TYPE_WRBACK)
 430                        return _PAGE_CACHE_MODE_UC_MINUS;
 431
 432                return _PAGE_CACHE_MODE_WB;
 433        }
 434
 435        return req_type;
 436}
 437
 438struct pagerange_state {
 439        unsigned long           cur_pfn;
 440        int                     ram;
 441        int                     not_ram;
 442};
 443
 444static int
 445pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
 446{
 447        struct pagerange_state *state = arg;
 448
 449        state->not_ram  |= initial_pfn > state->cur_pfn;
 450        state->ram      |= total_nr_pages > 0;
 451        state->cur_pfn   = initial_pfn + total_nr_pages;
 452
 453        return state->ram && state->not_ram;
 454}
 455
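/*
 * Returns 1 if the whole range is tracked as RAM, 0 if none of it is, and -1
 * if the range mixes RAM and non-RAM pages (callers reject such requests).
 */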
 456static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 457{
 458        int ret = 0;
 459        unsigned long start_pfn = start >> PAGE_SHIFT;
 460        unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
 461        struct pagerange_state state = {start_pfn, 0, 0};
 462
 463        /*
 464         * For legacy reasons, the physical address range in the legacy ISA
 465         * region is tracked as non-RAM. This allows users of
 466         * /dev/mem to map portions of the legacy ISA region, even when
 467         * some of those portions are listed (or not even listed) with
 468         * different e820 types (RAM/reserved/...).
 469         */
 470        if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
 471                start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
 472
 473        if (start_pfn < end_pfn) {
 474                ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
 475                                &state, pagerange_is_ram_callback);
 476        }
 477
 478        return (ret > 0) ? -1 : (state.ram ? 1 : 0);
 479}
 480
 481/*
 482 * For RAM pages, we use page flags to mark the pages with appropriate type.
 483 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 484 * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
 485 * a new memory type is only allowed for a page mapped with the default WB
 486 * type.
 487 *
 488 * Here we do two passes:
 489 * - Find the memtype of all the pages in the range, look for any conflicts.
 490 * - In case of no conflicts, set the new memtype for pages in the range.
 491 */
 492static int reserve_ram_pages_type(u64 start, u64 end,
 493                                  enum page_cache_mode req_type,
 494                                  enum page_cache_mode *new_type)
 495{
 496        struct page *page;
 497        u64 pfn;
 498
 499        if (req_type == _PAGE_CACHE_MODE_WP) {
 500                if (new_type)
 501                        *new_type = _PAGE_CACHE_MODE_UC_MINUS;
 502                return -EINVAL;
 503        }
 504
 505        if (req_type == _PAGE_CACHE_MODE_UC) {
 506                /* We do not support strong UC */
 507                WARN_ON_ONCE(1);
 508                req_type = _PAGE_CACHE_MODE_UC_MINUS;
 509        }
 510
 511        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 512                enum page_cache_mode type;
 513
 514                page = pfn_to_page(pfn);
 515                type = get_page_memtype(page);
 516                if (type != _PAGE_CACHE_MODE_WB) {
 517                        pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 518                                start, end - 1, type, req_type);
 519                        if (new_type)
 520                                *new_type = type;
 521
 522                        return -EBUSY;
 523                }
 524        }
 525
 526        if (new_type)
 527                *new_type = req_type;
 528
 529        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 530                page = pfn_to_page(pfn);
 531                set_page_memtype(page, req_type);
 532        }
 533        return 0;
 534}
 535
 536static int free_ram_pages_type(u64 start, u64 end)
 537{
 538        struct page *page;
 539        u64 pfn;
 540
 541        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 542                page = pfn_to_page(pfn);
 543                set_page_memtype(page, _PAGE_CACHE_MODE_WB);
 544        }
 545        return 0;
 546}
 547
 548static u64 sanitize_phys(u64 address)
 549{
 550        /*
 551         * When changing the memtype for pages containing poison allow
 552         * for a "decoy" virtual address (bit 63 clear) passed to
 553         * set_memory_X(). __pa() on a "decoy" address results in a
 554         * physical address with bit 63 set.
 555         *
 556         * Decoy addresses are not present for 32-bit builds, see
 557         * set_mce_nospec().
 558         */
 559        if (IS_ENABLED(CONFIG_X86_64))
 560                return address & __PHYSICAL_MASK;
 561        return address;
 562}
 563
 564/*
 565 * req_type typically has one of the following values:
 566 * - _PAGE_CACHE_MODE_WB
 567 * - _PAGE_CACHE_MODE_WC
 568 * - _PAGE_CACHE_MODE_UC_MINUS
 569 * - _PAGE_CACHE_MODE_UC
 570 * - _PAGE_CACHE_MODE_WT
 571 *
 572 * If new_type is NULL, the function will return an error if it cannot reserve the
 573 * region with req_type. If new_type is non-NULL, the function will return the
 574 * available type in *new_type in case of no error. In case of any error
 575 * it will return a negative value.
 576 */
 577int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
 578                    enum page_cache_mode *new_type)
 579{
 580        struct memtype *entry_new;
 581        enum page_cache_mode actual_type;
 582        int is_range_ram;
 583        int err = 0;
 584
 585        start = sanitize_phys(start);
 586
 587        /*
 588         * The end address passed into this function is exclusive, but
 589         * sanitize_phys() expects an inclusive address.
 590         */
 591        end = sanitize_phys(end - 1) + 1;
 592        if (start >= end) {
 593                WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
 594                                start, end - 1, cattr_name(req_type));
 595                return -EINVAL;
 596        }
 597
 598        if (!pat_enabled()) {
 599                /* This is identical to page table setting without PAT */
 600                if (new_type)
 601                        *new_type = req_type;
 602                return 0;
 603        }
 604
 605        /* Low ISA region is always mapped WB in the page table. No need to track it. */
 606        if (x86_platform.is_untracked_pat_range(start, end)) {
 607                if (new_type)
 608                        *new_type = _PAGE_CACHE_MODE_WB;
 609                return 0;
 610        }
 611
 612        /*
 613         * Call mtrr_lookup to get the type hint. This is an
 614         * optimization for /dev/mem mmap'ers into WB memory (BIOS
 615         * tools and ACPI tools). Use WB request for WB memory and use
 616         * UC_MINUS otherwise.
 617         */
 618        actual_type = pat_x_mtrr_type(start, end, req_type);
 619
 620        if (new_type)
 621                *new_type = actual_type;
 622
 623        is_range_ram = pat_pagerange_is_ram(start, end);
 624        if (is_range_ram == 1) {
 625
 626                err = reserve_ram_pages_type(start, end, req_type, new_type);
 627
 628                return err;
 629        } else if (is_range_ram < 0) {
 630                return -EINVAL;
 631        }
 632
 633        entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
 634        if (!entry_new)
 635                return -ENOMEM;
 636
 637        entry_new->start = start;
 638        entry_new->end   = end;
 639        entry_new->type  = actual_type;
 640
 641        spin_lock(&memtype_lock);
 642
 643        err = memtype_check_insert(entry_new, new_type);
 644        if (err) {
 645                pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
 646                        start, end - 1,
 647                        cattr_name(entry_new->type), cattr_name(req_type));
 648                kfree(entry_new);
 649                spin_unlock(&memtype_lock);
 650
 651                return err;
 652        }
 653
 654        spin_unlock(&memtype_lock);
 655
 656        dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
 657                start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
 658                new_type ? cattr_name(*new_type) : "-");
 659
 660        return err;
 661}
 662
 663int memtype_free(u64 start, u64 end)
 664{
 665        int is_range_ram;
 666        struct memtype *entry_old;
 667
 668        if (!pat_enabled())
 669                return 0;
 670
 671        start = sanitize_phys(start);
 672        end = sanitize_phys(end);
 673
 674        /* Low ISA region is always mapped WB. No need to track */
 675        if (x86_platform.is_untracked_pat_range(start, end))
 676                return 0;
 677
 678        is_range_ram = pat_pagerange_is_ram(start, end);
 679        if (is_range_ram == 1)
 680                return free_ram_pages_type(start, end);
 681        if (is_range_ram < 0)
 682                return -EINVAL;
 683
 684        spin_lock(&memtype_lock);
 685        entry_old = memtype_erase(start, end);
 686        spin_unlock(&memtype_lock);
 687
 688        if (IS_ERR(entry_old)) {
 689                pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
 690                        current->comm, current->pid, start, end - 1);
 691                return -EINVAL;
 692        }
 693
 694        kfree(entry_old);
 695
 696        dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);
 697
 698        return 0;
 699}
 700
 701
 702/**
 703 * lookup_memtype - Looks up the memory type for a physical address
 704 * @paddr: physical address of which memory type needs to be looked up
 705 *
 706 * Only to be called when PAT is enabled
 707 *
 708 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 709 * or _PAGE_CACHE_MODE_WT.
 710 */
 711static enum page_cache_mode lookup_memtype(u64 paddr)
 712{
 713        enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
 714        struct memtype *entry;
 715
 716        if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
 717                return rettype;
 718
 719        if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
 720                struct page *page;
 721
 722                page = pfn_to_page(paddr >> PAGE_SHIFT);
 723                return get_page_memtype(page);
 724        }
 725
 726        spin_lock(&memtype_lock);
 727
 728        entry = memtype_lookup(paddr);
 729        if (entry != NULL)
 730                rettype = entry->type;
 731        else
 732                rettype = _PAGE_CACHE_MODE_UC_MINUS;
 733
 734        spin_unlock(&memtype_lock);
 735
 736        return rettype;
 737}
 738
 739/**
 740 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 741 * of @pfn cannot be overridden by UC MTRR memory type.
 742 *
 743 * Only to be called when PAT is enabled.
 744 *
 745 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
 746 * Returns false in other cases.
 747 */
 748bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
 749{
 750        enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));
 751
 752        return cm == _PAGE_CACHE_MODE_UC ||
 753               cm == _PAGE_CACHE_MODE_UC_MINUS ||
 754               cm == _PAGE_CACHE_MODE_WC;
 755}
 756EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
 757
 758/**
 759 * memtype_reserve_io - Request a memory type mapping for a region of memory
 760 * @start: start (physical address) of the region
 761 * @end: end (physical address) of the region
 762 * @type: A pointer to memtype, with the requested type. On success, the requested
 763 * type or any other compatible type that was available for the region is returned
 764 *
 765 * On success, returns 0
 766 * On failure, returns non-zero
 767 */
 768int memtype_reserve_io(resource_size_t start, resource_size_t end,
 769                        enum page_cache_mode *type)
 770{
 771        resource_size_t size = end - start;
 772        enum page_cache_mode req_type = *type;
 773        enum page_cache_mode new_type;
 774        int ret;
 775
 776        WARN_ON_ONCE(iomem_map_sanity_check(start, size));
 777
 778        ret = memtype_reserve(start, end, req_type, &new_type);
 779        if (ret)
 780                goto out_err;
 781
 782        if (!is_new_memtype_allowed(start, size, req_type, new_type))
 783                goto out_free;
 784
 785        if (memtype_kernel_map_sync(start, size, new_type) < 0)
 786                goto out_free;
 787
 788        *type = new_type;
 789        return 0;
 790
 791out_free:
 792        memtype_free(start, end);
 793        ret = -EBUSY;
 794out_err:
 795        return ret;
 796}
 797
 798/**
 799 * memtype_free_io - Release a memory type mapping for a region of memory
 800 * @start: start (physical address) of the region
 801 * @end: end (physical address) of the region
 802 */
 803void memtype_free_io(resource_size_t start, resource_size_t end)
 804{
 805        memtype_free(start, end);
 806}
 807
 808#ifdef CONFIG_X86_PAT
 809int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
 810{
 811        enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
 812
 813        return memtype_reserve_io(start, start + size, &type);
 814}
 815EXPORT_SYMBOL(arch_io_reserve_memtype_wc);
 816
 817void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
 818{
 819        memtype_free_io(start, start + size);
 820}
 821EXPORT_SYMBOL(arch_io_free_memtype_wc);
 822#endif
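/*
 * Hypothetical usage sketch for the two helpers above (not part of this
 * file): a PCI driver mapping a prefetchable BAR write-combined would do
 * roughly the following (assuming <linux/pci.h> and a 'bar' index):
 *
 *        base = pci_resource_start(pdev, bar);
 *        size = pci_resource_len(pdev, bar);
 *        ret = arch_io_reserve_memtype_wc(base, size);
 *        if (ret)
 *                return ret;
 *        regs = ioremap_wc(base, size);
 *        if (!regs) {
 *                arch_io_free_memtype_wc(base, size);
 *                return -ENOMEM;
 *        }
 */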
 823
 824pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 825                                unsigned long size, pgprot_t vma_prot)
 826{
 827        if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
 828                vma_prot = pgprot_decrypted(vma_prot);
 829
 830        return vma_prot;
 831}
 832
 833#ifdef CONFIG_STRICT_DEVMEM
 834/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
 835static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 836{
 837        return 1;
 838}
 839#else
 840/* This check is needed to avoid cache aliasing when PAT is enabled */
 841static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 842{
 843        u64 from = ((u64)pfn) << PAGE_SHIFT;
 844        u64 to = from + size;
 845        u64 cursor = from;
 846
 847        if (!pat_enabled())
 848                return 1;
 849
 850        while (cursor < to) {
 851                if (!devmem_is_allowed(pfn))
 852                        return 0;
 853                cursor += PAGE_SIZE;
 854                pfn++;
 855        }
 856        return 1;
 857}
 858#endif /* CONFIG_STRICT_DEVMEM */
 859
 860int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 861                                unsigned long size, pgprot_t *vma_prot)
 862{
 863        enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
 864
 865        if (!range_is_allowed(pfn, size))
 866                return 0;
 867
 868        if (file->f_flags & O_DSYNC)
 869                pcm = _PAGE_CACHE_MODE_UC_MINUS;
 870
 871        *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
 872                             cachemode2protval(pcm));
 873        return 1;
 874}
 875
 876/*
 877 * Change the memory type for the physical address range in kernel identity
 878 * mapping space if that range is part of the identity map.
 879 */
 880int memtype_kernel_map_sync(u64 base, unsigned long size,
 881                            enum page_cache_mode pcm)
 882{
 883        unsigned long id_sz;
 884
 885        if (base > __pa(high_memory-1))
 886                return 0;
 887
 888        /*
 889         * Some areas in the middle of the kernel identity range
 890         * are not mapped, for example the PCI space.
 891         */
 892        if (!page_is_ram(base >> PAGE_SHIFT))
 893                return 0;
 894
 895        id_sz = (__pa(high_memory-1) <= base + size) ?
 896                                __pa(high_memory) - base : size;
 897
 898        if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
 899                pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
 900                        current->comm, current->pid,
 901                        cattr_name(pcm),
 902                        base, (unsigned long long)(base + size-1));
 903                return -EINVAL;
 904        }
 905        return 0;
 906}
 907
 908/*
 909 * Internal interface to reserve a range of physical memory with prot.
 910 * Reserves non-RAM regions only. After a successful memtype_reserve(),
 911 * this function also keeps the identity mapping (if any) in sync with the new prot.
 912 */
 913static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 914                                int strict_prot)
 915{
 916        int is_ram = 0;
 917        int ret;
 918        enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
 919        enum page_cache_mode pcm = want_pcm;
 920
 921        is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 922
 923        /*
 924         * reserve_pfn_range() for RAM pages. We do not refcount to keep
 925         * track of the number of mappings of RAM pages. We can assert that
 926         * the type requested matches the type of the first page in the range.
 927         */
 928        if (is_ram) {
 929                if (!pat_enabled())
 930                        return 0;
 931
 932                pcm = lookup_memtype(paddr);
 933                if (want_pcm != pcm) {
 934                        pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 935                                current->comm, current->pid,
 936                                cattr_name(want_pcm),
 937                                (unsigned long long)paddr,
 938                                (unsigned long long)(paddr + size - 1),
 939                                cattr_name(pcm));
 940                        *vma_prot = __pgprot((pgprot_val(*vma_prot) &
 941                                             (~_PAGE_CACHE_MASK)) |
 942                                             cachemode2protval(pcm));
 943                }
 944                return 0;
 945        }
 946
 947        ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
 948        if (ret)
 949                return ret;
 950
 951        if (pcm != want_pcm) {
 952                if (strict_prot ||
 953                    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 954                        memtype_free(paddr, paddr + size);
 955                        pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
 956                               current->comm, current->pid,
 957                               cattr_name(want_pcm),
 958                               (unsigned long long)paddr,
 959                               (unsigned long long)(paddr + size - 1),
 960                               cattr_name(pcm));
 961                        return -EINVAL;
 962                }
 963                /*
 964                 * We allow returning a different type than the one requested in
 965                 * the non-strict case.
 966                 */
 967                *vma_prot = __pgprot((pgprot_val(*vma_prot) &
 968                                      (~_PAGE_CACHE_MASK)) |
 969                                     cachemode2protval(pcm));
 970        }
 971
 972        if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
 973                memtype_free(paddr, paddr + size);
 974                return -EINVAL;
 975        }
 976        return 0;
 977}
 978
 979/*
 980 * Internal interface to free a range of physical memory.
 981 * Frees non-RAM regions only.
 982 */
 983static void free_pfn_range(u64 paddr, unsigned long size)
 984{
 985        int is_ram;
 986
 987        is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 988        if (is_ram == 0)
 989                memtype_free(paddr, paddr + size);
 990}
 991
 992/*
 993 * track_pfn_copy is called when a vma covering a pfnmap gets
 994 * copied through copy_page_range().
 995 *
 996 * If the vma has a linear pfn mapping for the entire range, we get the prot
 997 * from the pte and reserve the entire vma range with a single reserve_pfn_range() call.
 998 */
 999int track_pfn_copy(struct vm_area_struct *vma)
1000{
1001        resource_size_t paddr;
1002        unsigned long prot;
1003        unsigned long vma_size = vma->vm_end - vma->vm_start;
1004        pgprot_t pgprot;
1005
1006        if (vma->vm_flags & VM_PAT) {
1007                /*
1008                 * reserve the whole chunk covered by vma. We need the
1009                 * starting address and protection from pte.
1010                 */
1011                if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
1012                        WARN_ON_ONCE(1);
1013                        return -EINVAL;
1014                }
1015                pgprot = __pgprot(prot);
1016                return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
1017        }
1018
1019        return 0;
1020}
1021
1022/*
1023 * prot is passed in as a parameter for the new mapping. If the vma has
1024 * a linear pfn mapping for the entire range, or no vma is provided,
1025 * reserve the entire pfn + size range with a single reserve_pfn_range()
1026 * call.
1027 */
1028int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
1029                    unsigned long pfn, unsigned long addr, unsigned long size)
1030{
1031        resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
1032        enum page_cache_mode pcm;
1033
1034        /* reserve the whole chunk starting from paddr */
1035        if (!vma || (addr == vma->vm_start
1036                                && size == (vma->vm_end - vma->vm_start))) {
1037                int ret;
1038
1039                ret = reserve_pfn_range(paddr, size, prot, 0);
1040                if (ret == 0 && vma)
1041                        vma->vm_flags |= VM_PAT;
1042                return ret;
1043        }
1044
1045        if (!pat_enabled())
1046                return 0;
1047
1048        /*
1049         * For anything smaller than the vma size we set prot based on the
1050         * lookup.
1051         */
1052        pcm = lookup_memtype(paddr);
1053
1054        /* Check memtype for the remaining pages */
1055        while (size > PAGE_SIZE) {
1056                size -= PAGE_SIZE;
1057                paddr += PAGE_SIZE;
1058                if (pcm != lookup_memtype(paddr))
1059                        return -EINVAL;
1060        }
1061
1062        *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
1063                         cachemode2protval(pcm));
1064
1065        return 0;
1066}
1067
1068void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
1069{
1070        enum page_cache_mode pcm;
1071
1072        if (!pat_enabled())
1073                return;
1074
1075        /* Set prot based on lookup */
1076        pcm = lookup_memtype(pfn_t_to_phys(pfn));
1077        *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
1078                         cachemode2protval(pcm));
1079}
1080
1081/*
1082 * untrack_pfn is called while unmapping a pfnmap for a region.
1083 * untrack can be called for a specific region indicated by pfn and size, or
1084 * for the entire vma (in which case pfn and size are zero).
1085 */
1086void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
1087                 unsigned long size)
1088{
1089        resource_size_t paddr;
1090        unsigned long prot;
1091
1092        if (vma && !(vma->vm_flags & VM_PAT))
1093                return;
1094
1095        /* free the chunk starting from pfn or the whole chunk */
1096        paddr = (resource_size_t)pfn << PAGE_SHIFT;
1097        if (!paddr && !size) {
1098                if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
1099                        WARN_ON_ONCE(1);
1100                        return;
1101                }
1102
1103                size = vma->vm_end - vma->vm_start;
1104        }
1105        free_pfn_range(paddr, size);
1106        if (vma)
1107                vma->vm_flags &= ~VM_PAT;
1108}
1109
1110/*
1111 * untrack_pfn_moved is called while mremapping a pfnmap for a new region,
1112 * with the old vma after its pfnmap page table has been removed.  The new
1113 * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
1114 */
1115void untrack_pfn_moved(struct vm_area_struct *vma)
1116{
1117        vma->vm_flags &= ~VM_PAT;
1118}
1119
1120pgprot_t pgprot_writecombine(pgprot_t prot)
1121{
1122        return __pgprot(pgprot_val(prot) |
1123                                cachemode2protval(_PAGE_CACHE_MODE_WC));
1124}
1125EXPORT_SYMBOL_GPL(pgprot_writecombine);
1126
1127pgprot_t pgprot_writethrough(pgprot_t prot)
1128{
1129        return __pgprot(pgprot_val(prot) |
1130                                cachemode2protval(_PAGE_CACHE_MODE_WT));
1131}
1132EXPORT_SYMBOL_GPL(pgprot_writethrough);
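/*
 * A minimal, hypothetical sketch (not used in this file) of how a driver's
 * mmap() handler would use pgprot_writecombine() to give userspace a WC
 * mapping of device memory; the pfn is assumed to come from the driver:
 */
static int __maybe_unused example_wc_mmap(struct vm_area_struct *vma,
                                          unsigned long pfn)
{
        vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

        /* io_remap_pfn_range() ends up calling track_pfn_remap() above. */
        return io_remap_pfn_range(vma, vma->vm_start, pfn,
                                  vma->vm_end - vma->vm_start,
                                  vma->vm_page_prot);
}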
1133
1134#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
1135
1136/*
1137 * We are allocating a temporary printout-entry to be passed
1138 * between seq_start()/next() and seq_show():
1139 */
1140static struct memtype *memtype_get_idx(loff_t pos)
1141{
1142        struct memtype *entry_print;
1143        int ret;
1144
1145        entry_print  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
1146        if (!entry_print)
1147                return NULL;
1148
1149        spin_lock(&memtype_lock);
1150        ret = memtype_copy_nth_element(entry_print, pos);
1151        spin_unlock(&memtype_lock);
1152
1153        /* Free it on error: */
1154        if (ret) {
1155                kfree(entry_print);
1156                return NULL;
1157        }
1158
1159        return entry_print;
1160}
1161
1162static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
1163{
1164        if (*pos == 0) {
1165                ++*pos;
1166                seq_puts(seq, "PAT memtype list:\n");
1167        }
1168
1169        return memtype_get_idx(*pos);
1170}
1171
1172static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1173{
1174        kfree(v);
1175        ++*pos;
1176        return memtype_get_idx(*pos);
1177}
1178
1179static void memtype_seq_stop(struct seq_file *seq, void *v)
1180{
1181        kfree(v);
1182}
1183
1184static int memtype_seq_show(struct seq_file *seq, void *v)
1185{
1186        struct memtype *entry_print = (struct memtype *)v;
1187
1188        seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
1189                        entry_print->start,
1190                        entry_print->end,
1191                        cattr_name(entry_print->type));
1192
1193        return 0;
1194}
1195
1196static const struct seq_operations memtype_seq_ops = {
1197        .start = memtype_seq_start,
1198        .next  = memtype_seq_next,
1199        .stop  = memtype_seq_stop,
1200        .show  = memtype_seq_show,
1201};
1202
1203static int memtype_seq_open(struct inode *inode, struct file *file)
1204{
1205        return seq_open(file, &memtype_seq_ops);
1206}
1207
1208static const struct file_operations memtype_fops = {
1209        .open    = memtype_seq_open,
1210        .read    = seq_read,
1211        .llseek  = seq_lseek,
1212        .release = seq_release,
1213};
1214
1215static int __init pat_memtype_list_init(void)
1216{
1217        if (pat_enabled()) {
1218                debugfs_create_file("pat_memtype_list", S_IRUSR,
1219                                    arch_debugfs_dir, NULL, &memtype_fops);
1220        }
1221        return 0;
1222}
1223late_initcall(pat_memtype_list_init);
1224
1225#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
1226