linux/arch/x86/mm/pat.c
/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */

#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>

#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;

static inline void pat_disable(const char *reason)
{
	pat_enabled = 0;
	printk(KERN_INFO "%s\n", reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled.");
	return 0;
}
early_param("nopat", nopat);
#else
static inline void pat_disable(const char *reason)
{
	(void)reason;
}
#endif


static int debug_enable;

static int __init pat_debug_setup(char *str)
{
	debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);

#define dprintk(fmt, arg...) \
	do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)


static u64 __read_mostly boot_pat_state;

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

void pat_init(void)
{
	u64 pat;
	bool boot_cpu = !boot_pat_state;

	if (!pat_enabled)
		return;

	if (!cpu_has_pat) {
		if (!boot_pat_state) {
			pat_disable("PAT not supported by CPU.");
			return;
		} else {
			/*
			 * If this happens we are on a secondary CPU, but
			 * switched to PAT on the boot CPU. We have no way to
			 * undo PAT.
			 */
			printk(KERN_ERR "PAT enabled, "
			       "but not supported by secondary CPU\n");
			BUG();
		}
	}

	/* Set PWT to Write-Combining. All other bits stay the same */
	/*
	 * PTE encoding used in Linux:
	 *      PAT
	 *      |PCD
	 *      ||PWT
	 *      |||
	 *      000 WB          _PAGE_CACHE_WB
	 *      001 WC          _PAGE_CACHE_WC
	 *      010 UC-         _PAGE_CACHE_UC_MINUS
	 *      011 UC          _PAGE_CACHE_UC
	 * PAT bit unused
	 */
	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

	/* Boot CPU check */
	if (!boot_pat_state)
		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);

	wrmsrl(MSR_IA32_CR_PAT, pat);

	if (boot_cpu)
		printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
		       smp_processor_id(), boot_pat_state, pat);
}

#undef PAT
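
/*
 * For reference, a worked example of the PAT() encoding above: each byte of
 * the MSR holds one PAT entry, so the value written to MSR_IA32_CR_PAT is
 *
 *	PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
 *	PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC)
 *	  = 0x0007010600070106ULL
 *
 * i.e. WB=0x06, WC=0x01, UC-=0x07, UC=0x00 in slots 0-3, repeated for slots
 * 4-7 so that the PAT bit in the PTE is effectively unused.
 */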

static char *cattr_name(unsigned long flags)
{
	switch (flags & _PAGE_CACHE_MASK) {
	case _PAGE_CACHE_UC:		return "uncached";
	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
	case _PAGE_CACHE_WB:		return "write-back";
	case _PAGE_CACHE_WC:		return "write-combining";
	default:			return "broken";
	}
}

/*
 * The global memtype list keeps track of the memory type for specific
 * physical memory areas. Conflicting memory types in different
 * mappings can cause CPU cache corruption. To avoid this we keep track
 * of the requested memory type for every region.
 *
 * The list is sorted based on starting address and can contain multiple
 * entries for each address (this allows reference counting for overlapping
 * areas). All the aliases have the same cache attributes, of course.
 * Zero attributes are represented as holes.
 *
 * The data structure is a list that is also organized as an rbtree,
 * sorted on the start address of the memtype range.
 *
 * memtype_lock protects both the linear list and the rbtree.
 */

struct memtype {
	u64			start;
	u64			end;
	unsigned long		type;
	struct list_head	nd;
	struct rb_node		rb;
};

static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list and rbtree */
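
/*
 * A small example of what the tracking looks like (addresses hypothetical,
 * purely illustrative): two successful reserve_memtype() calls for
 * 0xd0000000-0xd0010000 WC and 0xd0008000-0xd0010000 WC leave two entries,
 * sorted by start address:
 *
 *	{ start = 0xd0000000, end = 0xd0010000, type = _PAGE_CACHE_WC }
 *	{ start = 0xd0008000, end = 0xd0010000, type = _PAGE_CACHE_WC }
 *
 * Overlapping entries act as reference counts for the aliased range. A later
 * request for a different type over the same range is either mapped to the
 * existing type (when the caller can accept an alternative) or rejected with
 * -EBUSY by chk_conflict() below.
 */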

static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
{
	struct rb_node *node = root->rb_node;
	struct memtype *last_lower = NULL;

	while (node) {
		struct memtype *data = container_of(node, struct memtype, rb);

		if (data->start < start) {
			last_lower = data;
			node = node->rb_right;
		} else if (data->start > start) {
			node = node->rb_left;
		} else
			return data;
	}

	/* Will return NULL if there is no entry with its start <= start */
	return last_lower;
}

static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
{
	struct rb_node **new = &(root->rb_node);
	struct rb_node *parent = NULL;

	while (*new) {
		struct memtype *this = container_of(*new, struct memtype, rb);

		parent = *new;
		if (data->start <= this->start)
			new = &((*new)->rb_left);
		else if (data->start > this->start)
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->rb, parent, new);
	rb_insert_color(&data->rb, root);
}

/*
 * Does the intersection of the PAT memory type and the MTRR memory type,
 * and returns the resulting memory type as PAT understands it.
 * (The type values used by PAT and by MTRR are not the same.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
	/*
	 * Look for the MTRR hint to get the effective type in the case
	 * where the PAT request is for WB.
	 */
	if (req_type == _PAGE_CACHE_WB) {
		u8 mtrr_type;

		mtrr_type = mtrr_type_lookup(start, end);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_UC_MINUS;

		return _PAGE_CACHE_WB;
	}

	return req_type;
}
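
/*
 * Example (illustrative): a /dev/mem mapper that requests _PAGE_CACHE_WB over
 * an aperture which the MTRRs mark as UC or WC gets _PAGE_CACHE_UC_MINUS back
 * from pat_x_mtrr_type(); only when the MTRRs report MTRR_TYPE_WRBACK does
 * the WB request stay WB. Requests other than WB pass through unchanged.
 */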

static int
chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
	if (new->type != entry->type) {
		if (type) {
			new->type = entry->type;
			*type = entry->type;
		} else
			goto conflict;
	}

	/* check overlaps with more than one entry in the list */
	list_for_each_entry_continue(entry, &memtype_list, nd) {
		if (new->end <= entry->start)
			break;
		else if (new->type != entry->type)
			goto conflict;
	}
	return 0;

 conflict:
	printk(KERN_INFO "%s:%d conflicting memory types "
	       "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
	       new->end, cattr_name(new->type), cattr_name(entry->type));
	return -EBUSY;
}

static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
	int ram_page = 0, not_ram_page = 0;
	unsigned long page_nr;

	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
	     ++page_nr) {
		/*
		 * For legacy reasons, physical address ranges in the legacy
		 * ISA region are tracked as non-RAM. This allows users of
		 * /dev/mem to map portions of the legacy ISA region, even when
		 * some of those portions are listed (or not even listed) with
		 * different e820 types (RAM/reserved/...).
		 */
		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
		    page_is_ram(page_nr))
			ram_page = 1;
		else
			not_ram_page = 1;

		if (ram_page == not_ram_page)
			return -1;
	}

	return ram_page;
}
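
/*
 * For reference, the return values of pat_pagerange_is_ram():
 *	 1  - every page in the range is RAM (and above the legacy ISA region)
 *	 0  - no page in the range is tracked as RAM
 *	-1  - the range mixes RAM and non-RAM pages, which the callers below
 *	      treat as an error (-EINVAL)
 */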

/*
 * For RAM pages, we use the page flags to mark the pages with the appropriate
 * memory type. Here we make two passes:
 * - Find the memtype of all the pages in the range, and look for any conflicts.
 * - If there are no conflicts, set the new memtype for the pages in the range.
 *
 * The caller must hold memtype_lock for atomicity.
 */
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
				  unsigned long *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		unsigned long type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != -1) {
			printk(KERN_INFO "reserve_ram_pages_type failed "
				"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
				start, end, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, -1);
	}
	return 0;
}

/*
 * req_type typically has one of the following values:
 * - _PAGE_CACHE_WB
 * - _PAGE_CACHE_WC
 * - _PAGE_CACHE_UC_MINUS
 * - _PAGE_CACHE_UC
 *
 * req_type can also have the special value '-1', when the requester wants to
 * inherit the memory type from the MTRR (if WB) or from an existing PAT
 * mapping, defaulting to UC_MINUS.
 *
 * If new_type is NULL, the function returns an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, the function returns the
 * available type in *new_type when there is no error. On any error it
 * returns a negative value.
 */
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
		    unsigned long *new_type)
{
	struct memtype *new, *entry;
	unsigned long actual_type;
	struct list_head *where;
	int is_range_ram;
	int err = 0;

	BUG_ON(start >= end); /* end is exclusive */

	if (!pat_enabled) {
		/* This is identical to page table setting without PAT */
		if (new_type) {
			if (req_type == -1)
				*new_type = _PAGE_CACHE_WB;
			else if (req_type == _PAGE_CACHE_WC)
				*new_type = _PAGE_CACHE_UC_MINUS;
			else
				*new_type = req_type & _PAGE_CACHE_MASK;
		}
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (is_ISA_range(start, end - 1)) {
		if (new_type)
			*new_type = _PAGE_CACHE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		spin_lock(&memtype_lock);
		err = reserve_ram_pages_type(start, end, req_type, new_type);
		spin_unlock(&memtype_lock);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	new  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start	= start;
	new->end	= end;
	new->type	= actual_type;

	spin_lock(&memtype_lock);

	/* Search for existing mapping that overlaps the current range */
	where = NULL;
	list_for_each_entry(entry, &memtype_list, nd) {
		if (end <= entry->start) {
			where = entry->nd.prev;
			break;
		} else if (start <= entry->start) { /* end > entry->start */
			err = chk_conflict(new, entry, new_type);
			if (!err) {
				dprintk("Overlap at 0x%Lx-0x%Lx\n",
					entry->start, entry->end);
				where = entry->nd.prev;
			}
			break;
		} else if (start < entry->end) { /* start > entry->start */
			err = chk_conflict(new, entry, new_type);
			if (!err) {
				dprintk("Overlap at 0x%Lx-0x%Lx\n",
					entry->start, entry->end);

				/*
				 * Move to right position in the linked
				 * list to add this new entry
				 */
				list_for_each_entry_continue(entry,
							&memtype_list, nd) {
					if (start <= entry->start) {
						where = entry->nd.prev;
						break;
					}
				}
			}
			break;
		}
	}

	if (err) {
		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
		       "track %s, req %s\n",
		       start, end, cattr_name(new->type), cattr_name(req_type));
		kfree(new);
		spin_unlock(&memtype_lock);

		return err;
	}

	if (where)
		list_add(&new->nd, where);
	else
		list_add_tail(&new->nd, &memtype_list);

	memtype_rb_insert(&memtype_rbroot, new);

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
		start, end, cattr_name(new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}
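
/*
 * Typical usage sketch (illustrative only; the addresses are hypothetical):
 * a caller wanting uncached-minus access to a 64 KB MMIO window at
 * 0xfed00000 pairs the two calls like this:
 *
 *	unsigned long got;
 *	int ret;
 *
 *	ret = reserve_memtype(0xfed00000, 0xfed00000 + 0x10000,
 *			      _PAGE_CACHE_UC_MINUS, &got);
 *	if (ret)
 *		return ret;
 *	... map and use the region with the type returned in 'got' ...
 *	free_memtype(0xfed00000, 0xfed00000 + 0x10000);
 *
 * The io_reserve_memtype()/io_free_memtype() helpers below wrap this pair
 * for ioremap-style users.
 */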

int free_memtype(u64 start, u64 end)
{
	struct memtype *entry, *saved_entry;
	int err = -EINVAL;
	int is_range_ram;

	if (!pat_enabled)
		return 0;

	/* Low ISA region is always mapped WB. No need to track */
	if (is_ISA_range(start, end - 1))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		spin_lock(&memtype_lock);
		err = free_ram_pages_type(start, end);
		spin_unlock(&memtype_lock);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	spin_lock(&memtype_lock);

	entry = memtype_rb_search(&memtype_rbroot, start);
	if (unlikely(entry == NULL))
		goto unlock_ret;

	/*
	 * The saved entry points to an entry whose start is the same as or
	 * less than what we searched for. Now go through the list in both
	 * directions to look for the entry that matches both start and end,
	 * with the list sorted by start address.
	 */
	saved_entry = entry;
	list_for_each_entry_from(entry, &memtype_list, nd) {
		if (entry->start == start && entry->end == end) {
			rb_erase(&entry->rb, &memtype_rbroot);
			list_del(&entry->nd);
			kfree(entry);
			err = 0;
			break;
		} else if (entry->start > start) {
			break;
		}
	}

	if (!err)
		goto unlock_ret;

	entry = saved_entry;
	list_for_each_entry_reverse(entry, &memtype_list, nd) {
		if (entry->start == start && entry->end == end) {
			rb_erase(&entry->rb, &memtype_rbroot);
			list_del(&entry->nd);
			kfree(entry);
			err = 0;
			break;
		} else if (entry->start < start) {
			break;
		}
	}
unlock_ret:
	spin_unlock(&memtype_lock);

	if (err) {
		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
			current->comm, current->pid, start, end);
	}

	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);

	return err;
}


/**
 * lookup_memtype - Look up the memory type for a physical address
 * @paddr: physical address whose memory type needs to be looked up
 *
 * Only to be called when PAT is enabled.
 *
 * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
 * _PAGE_CACHE_UC
 */
static unsigned long lookup_memtype(u64 paddr)
{
	int rettype = _PAGE_CACHE_WB;
	struct memtype *entry;

	if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;
		spin_lock(&memtype_lock);
		page = pfn_to_page(paddr >> PAGE_SHIFT);
		rettype = get_page_memtype(page);
		spin_unlock(&memtype_lock);
		/*
		 * -1 from get_page_memtype() implies RAM page is in its
		 * default state and not reserved, and hence of type WB
		 */
		if (rettype == -1)
			rettype = _PAGE_CACHE_WB;

		return rettype;
	}

	spin_lock(&memtype_lock);

	entry = memtype_rb_search(&memtype_rbroot, paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_UC_MINUS;

	spin_unlock(&memtype_lock);
	return rettype;
}

/**
 * io_reserve_memtype - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype holding the requested type. On success, the
 * requested type or another compatible type that was available for the
 * region is returned here.
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int io_reserve_memtype(resource_size_t start, resource_size_t end,
			unsigned long *type)
{
	resource_size_t size = end - start;
	unsigned long req_type = *type;
	unsigned long new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = reserve_memtype(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (kernel_map_sync_memtype(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	free_memtype(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * io_free_memtype - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void io_free_memtype(resource_size_t start, resource_size_t end)
{
	free_memtype(start, end);
}
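
/*
 * Usage sketch for the io_* helpers (illustrative; 'phys' and 'len' are
 * hypothetical): an ioremap-style caller that prefers write-combining but
 * can accept whatever compatible type the region already carries:
 *
 *	unsigned long type = _PAGE_CACHE_WC;
 *	int ret;
 *
 *	ret = io_reserve_memtype(phys, phys + len, &type);
 *	if (ret)
 *		return ret;
 *	... set up the mapping using 'type', which may have been changed
 *	    to a compatible type by the reservation ...
 *	io_free_memtype(phys, phys + len);
 */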

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in the case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled)
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn)) {
			printk(KERN_INFO
		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
				current->comm, from, to);
			return 0;
		}
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	unsigned long flags = _PAGE_CACHE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_SYNC) {
		flags = _PAGE_CACHE_UC_MINUS;
	}

#ifdef CONFIG_X86_32
	/*
	 * On the PPro and successors, the MTRRs are used to set
	 * memory types for physical addresses outside main memory,
	 * so blindly setting UC or PWT on those pages is wrong.
	 * For Pentiums and earlier, the surround logic should disable
	 * caching for the high addresses through the KEN pin, but
	 * we maintain the tradition of paranoia in this code.
	 */
	if (!pat_enabled &&
	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
		flags = _PAGE_CACHE_UC;
	}
#endif

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     flags);
	return 1;
}


/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space, if that range is a part of the identity map.
 */
int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
	unsigned long id_sz;

	if (base >= __pa(high_memory))
		return 0;

	id_sz = (__pa(high_memory) < base + size) ?
				__pa(high_memory) - base :
				size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
		printk(KERN_INFO
			"%s:%d ioremap_change_attr failed %s "
			"for %Lx-%Lx\n",
			current->comm, current->pid,
			cattr_name(flags),
			base, (unsigned long long)(base + size));
		return -EINVAL;
	}
	return 0;
}


/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only; after a successful reserve_memtype, this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
	unsigned long flags = want_flags;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled)
			return 0;

		flags = lookup_memtype(paddr);
		if (want_flags != flags) {
			printk(KERN_WARNING
			"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
				current->comm, current->pid,
				cattr_name(want_flags),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size),
				cattr_name(flags));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					      (~_PAGE_CACHE_MASK)) |
					     flags);
		}
		return 0;
	}

	ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
	if (ret)
		return ret;

	if (flags != want_flags) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
			free_memtype(paddr, paddr + size);
			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
				" for %Lx-%Lx, got %s\n",
				current->comm, current->pid,
				cattr_name(want_flags),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size),
				cattr_name(flags));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     flags);
	}

	if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
		free_memtype(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}


/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		free_memtype(paddr, paddr + size);
}

/*
 * track_pfn_vma_copy is called when a vma covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range call.
 */
int track_pfn_vma_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (is_linear_pfn_mapping(vma)) {
		/*
		 * reserve the whole chunk covered by the vma. We need the
		 * starting address and protection from the pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * track_pfn_vma_new is called when a _new_ pfn mapping is being established
 * for the physical range indicated by pfn and size.
 *
 * prot is passed in as a parameter for the new mapping. If the vma has a
 * linear pfn mapping for the entire range, reserve the entire vma range with
 * a single reserve_pfn_range call.
 */
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
			unsigned long pfn, unsigned long size)
{
	unsigned long flags;
	resource_size_t paddr;
	unsigned long vma_size = vma->vm_end - vma->vm_start;

	if (is_linear_pfn_mapping(vma)) {
		/* reserve the whole chunk starting from vm_pgoff */
		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
		return reserve_pfn_range(paddr, vma_size, prot, 0);
	}

	if (!pat_enabled)
		return 0;

	/* for vm_insert_pfn and friends, we set prot based on lookup */
	flags = lookup_memtype(pfn << PAGE_SHIFT);
	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
			 flags);

	return 0;
}

/*
 * untrack_pfn_vma is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case size can be zero).
 */
void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
			unsigned long size)
{
	resource_size_t paddr;
	unsigned long vma_size = vma->vm_end - vma->vm_start;

	if (is_linear_pfn_mapping(vma)) {
		/* free the whole chunk starting from vm_pgoff */
		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
		free_pfn_range(paddr, vma_size);
		return;
	}
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	if (pat_enabled)
		return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
	else
		return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
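
/*
 * Usage sketch (illustrative; the driver and its mmap handler are
 * hypothetical): a driver mapping a frame buffer into userspace would
 * typically do
 *
 *	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *	return remap_pfn_range(vma, vma->vm_start, pfn, size,
 *			       vma->vm_page_prot);
 *
 * On PAT-capable systems this yields a write-combining mapping; without PAT
 * it falls back to an uncached mapping via pgprot_noncached().
 */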

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *list_node, *print_entry;
	int i = 1;

	print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	list_for_each_entry(list_node, &memtype_list, nd) {
		if (pos == i) {
			*print_entry = *list_node;
			spin_unlock(&memtype_lock);
			return print_entry;
		}
		++i;
	}
	spin_unlock(&memtype_lock);
	kfree(print_entry);

	return NULL;
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_printf(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *print_entry = (struct memtype *)v;

	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
			print_entry->start, print_entry->end);
	kfree(print_entry);

	return 0;
}


static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
				NULL, &memtype_fops);
	return 0;
}

late_initcall(pat_memtype_list_init);
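
/*
 * Reading the resulting debugfs file dumps the current reservations, one
 * line per memtype entry in the format printed by memtype_seq_show(), e.g.
 * (hypothetical contents):
 *
 *	# cat /sys/kernel/debug/x86/pat_memtype_list
 *	PAT memtype list:
 *	write-combining @ 0xd0000000-0xd1000000
 *	uncached-minus @ 0xfed00000-0xfed01000
 */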

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */