linux/arch/x86/mm/pat.c
/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */

#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>

#include "pat_internal.h"
#include "mm_internal.h"

#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;

static inline void pat_disable(const char *reason)
{
	pat_enabled = 0;
	printk(KERN_INFO "%s\n", reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled.");
	return 0;
}
early_param("nopat", nopat);
#else
static inline void pat_disable(const char *reason)
{
	(void)reason;
}
#endif


int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);

static u64 __read_mostly boot_pat_state;

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses the page flags PG_arch_1 and PG_uncached together to keep
 * track of the memory type of pages that have a backing page struct. X86 PAT
 * supports 3 different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC
 * and _PAGE_CACHE_MODE_UC_MINUS, and a fourth state where the page's memory
 * type has not been changed from its default (a value of -1 is used to
 * denote this).
 * Note we do not support _PAGE_CACHE_MODE_UC here.
 */

#define _PGMT_DEFAULT		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_DEFAULT)
		return -1;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WB;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WB:
		memtype_flags = _PGMT_WB;
		break;
	default:
		memtype_flags = _PGMT_DEFAULT;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif
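
/*
 * Illustrative summary (not in the original source): the two page flags used
 * above encode the tracked memtype as follows:
 *
 *	PG_uncached  PG_arch_1	tracked type
 *	     0           0	default (-1, page not reserved)
 *	     0           1	_PAGE_CACHE_MODE_WC
 *	     1           0	_PAGE_CACHE_MODE_UC_MINUS
 *	     1           1	_PAGE_CACHE_MODE_WB
 */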

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to the PAT
 * configuration.
 * Using lower indices is preferred, so we start with the highest index.
 */
void pat_init_cache_modes(void)
{
	int i;
	enum page_cache_mode cache;
	char pat_msg[33];
	u64 pat;

	rdmsrl(MSR_IA32_CR_PAT, pat);
	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
}

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

void pat_init(void)
{
	u64 pat;
	bool boot_cpu = !boot_pat_state;

	if (!pat_enabled)
		return;

	if (!cpu_has_pat) {
		if (!boot_pat_state) {
			pat_disable("PAT not supported by CPU.");
			return;
		} else {
			/*
			 * If this happens we are on a secondary CPU, but
			 * switched to PAT on the boot CPU. We have no way to
			 * undo PAT.
			 */
			printk(KERN_ERR "PAT enabled, but not supported by secondary CPU\n");
			BUG();
		}
	}

	/* Set PWT to Write-Combining. All other bits stay the same */
	/*
	 * PTE encoding used in Linux:
	 *      PAT
	 *      |PCD
	 *      ||PWT
	 *      |||
	 *      000 WB          _PAGE_CACHE_WB
	 *      001 WC          _PAGE_CACHE_WC
	 *      010 UC-         _PAGE_CACHE_UC_MINUS
	 *      011 UC          _PAGE_CACHE_UC
	 * PAT bit unused
	 */
	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

	/* Boot CPU check */
	if (!boot_pat_state) {
		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
		if (!boot_pat_state) {
			pat_disable("PAT read returns always zero, disabled.");
			return;
		}
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);

	if (boot_cpu)
		pat_init_cache_modes();
}
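
/*
 * Worked example (illustrative, not in the original source): with the PAT_*
 * values defined above (WB=6, WC=1, UC_MINUS=7, UC=0), the value written to
 * MSR_IA32_CR_PAT above works out to
 *
 *	pat = 0x0007010600070106
 *
 * i.e. PAT entries 0-7 are WB, WC, UC-, UC, WB, WC, UC-, UC, which matches
 * the PTE encoding table in the comment above (PWT selects entry 1, PCD
 * entry 2, PCD|PWT entry 3).
 */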

#undef PAT

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Does the intersection of the PAT memory type and the MTRR memory type and
 * returns the resulting memory type as PAT understands it.
 * (The same type does not have the same numeric value in PAT and MTRR.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for MTRR hint to get the effective type in case where PAT
	 * request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type;

		mtrr_type = mtrr_type_lookup(start, end);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}
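
/*
 * Example (illustrative, not in the original source): a WB request over a
 * range that the MTRRs mark as UC or WC comes back as UC-, while WC, UC- and
 * UC requests pass through unchanged; the MTRR hint is only consulted for WB
 * requests.
 */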

struct pagerange_state {
	unsigned long		cur_pfn;
	int			ram;
	int			not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram  |= initial_pfn > state->cur_pfn;
	state->ram      |= total_nr_pages > 0;
	state->cur_pfn   = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

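/*
 * Note (descriptive, not in the original source): pat_pagerange_is_ram()
 * below returns 1 when the whole range is System RAM, 0 when none of it is,
 * and -1 when the range mixes RAM and non-RAM pages (the callers treat the
 * mixed case as an error).
 */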
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, physical address range in the legacy ISA
	 * region is tracked as non-RAM. This will allow users of
	 * /dev/mem to map portions of legacy ISA region, even when
	 * some of those portions are listed (or not even listed) with
	 * different e820 types (RAM/reserved/..)
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}

/*
 * For RAM pages, we use page flags to mark the pages with appropriate type.
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != -1) {
			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, -1);
	}
	return 0;
}

/*
 * req_type typically has one of the following values:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 *
 * If new_type is NULL, the function will return an error if it cannot
 * reserve the region with req_type. If new_type is non-NULL, the function
 * will return the available type via new_type when there is no error. In
 * case of any error it will return a negative value.
 */
int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	BUG_ON(start >= end); /* end is exclusive */

	if (!pat_enabled) {
		/* This is identical to page table setting without PAT */
		if (new_type) {
			if (req_type == _PAGE_CACHE_MODE_WC)
				*new_type = _PAGE_CACHE_MODE_UC_MINUS;
			else
				*new_type = req_type;
		}
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	new  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start      = start;
	new->end        = end;
	new->type       = actual_type;

	spin_lock(&memtype_lock);

	err = rbt_memtype_check_insert(new, new_type);
	if (err) {
		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
		       start, end - 1,
		       cattr_name(new->type), cattr_name(req_type));
		kfree(new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

int free_memtype(u64 start, u64 end)
{
	int err = -EINVAL;
	int is_range_ram;
	struct memtype *entry;

	if (!pat_enabled)
		return 0;

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = free_ram_pages_type(start, end);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	spin_lock(&memtype_lock);
	entry = rbt_memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (!entry) {
		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
		       current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry);

	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which the memory type needs to be looked up
 *
 * Only to be called when PAT is enabled.
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_UC
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;
		page = pfn_to_page(paddr >> PAGE_SHIFT);
		rettype = get_page_memtype(page);
		/*
		 * -1 from get_page_memtype() implies RAM page is in its
		 * default state and not reserved, and hence of type WB
		 */
		if (rettype == -1)
			rettype = _PAGE_CACHE_MODE_WB;

		return rettype;
	}

	spin_lock(&memtype_lock);

	entry = rbt_memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);
	return rettype;
}

/**
 * io_reserve_memtype - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: a pointer to the requested memtype. On success, it is updated with
 * the requested type or any other compatible type that was available for the
 * region.
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int io_reserve_memtype(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = reserve_memtype(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (kernel_map_sync_memtype(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	free_memtype(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * io_free_memtype - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void io_free_memtype(resource_size_t start, resource_size_t end)
{
	free_memtype(start, end);
}
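
/*
 * Usage sketch (illustrative, not in the original source): a caller that
 * wants a write-combining mapping of an MMIO region would do roughly:
 *
 *	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
 *
 *	if (io_reserve_memtype(phys, phys + size, &pcm))
 *		return NULL;
 *	vaddr = ... map the range using cachemode2protval(pcm) ...;
 *	...
 *	io_free_memtype(phys, phys + size);
 *
 * Note that pcm may come back as UC- when WC cannot be honoured for the
 * range; the caller must use the returned mode, not the requested one.
 */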

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled)
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn)) {
			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
				current->comm, from, to - 1);
			return 0;
		}
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

#ifdef CONFIG_X86_32
	/*
	 * On the PPro and successors, the MTRRs are used to set
	 * memory types for physical addresses outside main memory,
	 * so blindly setting UC or PWT on those pages is wrong.
	 * For Pentiums and earlier, the surround logic should disable
	 * caching for the high addresses through the KEN pin, but
	 * we maintain the tradition of paranoia in this code.
	 */
	if (!pat_enabled &&
	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
		pcm = _PAGE_CACHE_MODE_UC;
	}
#endif

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of the identity map.
 */
int kernel_map_sync_memtype(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, like the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base :
				size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only. After a successful reserve_memtype, this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled)
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					     (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			free_memtype(paddr, paddr + size);
			printk(KERN_ERR "%s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
		free_memtype(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		free_memtype(paddr, paddr + size);
}

/*
 * track_pfn_copy is called when a vma that covers the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has a
 * linear pfn mapping for the entire range, reserve the entire vma range with
 * a single reserve_pfn_range call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (!ret)
			vma->vm_flags |= VM_PAT;
		return ret;
	}

	if (!pat_enabled)
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
		     unsigned long pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled)
		return 0;

	/* Set prot based on lookup */
	pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * for the entire vma (in which case pfn and size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size)
{
	resource_size_t paddr;
	unsigned long prot;

	if (!(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	vma->vm_flags &= ~VM_PAT;
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	if (pat_enabled)
		return __pgprot(pgprot_val(prot) |
				cachemode2protval(_PAGE_CACHE_MODE_WC));
	else
		return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
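
/*
 * Usage sketch (illustrative, not in the original source): a driver mmap
 * handler typically applies this helper before remapping, e.g.:
 *
 *	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *	return vm_iomap_memory(vma, phys, size);
 *
 * When PAT is disabled the helper falls back to pgprot_noncached(), i.e. the
 * mapping is made uncached rather than write-combining.
 */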

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *print_entry;
	int ret;

	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	ret = rbt_memtype_copy_nth_element(print_entry, pos);
	spin_unlock(&memtype_lock);

	if (!ret) {
		return print_entry;
	} else {
		kfree(print_entry);
		return NULL;
	}
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *print_entry = (struct memtype *)v;

	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
			print_entry->start, print_entry->end);
	kfree(print_entry);

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}

late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */