linux/kernel/memremap.c
/*
 * Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/radix-tree.h>
#include <linux/device.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
{
        return ioremap(offset, size);
}
#endif

#ifndef arch_memremap_wb
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
        return (__force void *)ioremap_cache(offset, size);
}
#endif

#ifndef arch_memremap_can_ram_remap
static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
                                        unsigned long flags)
{
        return true;
}
#endif

static void *try_ram_remap(resource_size_t offset, size_t size,
                           unsigned long flags)
{
        unsigned long pfn = PHYS_PFN(offset);

        /* In the simple case just return the existing linear address */
        if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
            arch_memremap_can_ram_remap(offset, size, flags))
                return __va(offset);

        return NULL; /* fallback to arch_memremap_wb */
}

/**
 * memremap() - remap an iomem_resource as cacheable memory
 * @offset: iomem resource start address
 * @size: size of remap
 * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
 *                MEMREMAP_ENC, MEMREMAP_DEC
 *
 * memremap() is "ioremap" for cases where it is known that the resource
 * being mapped does not have i/o side effects and the __iomem
 * annotation is not applicable. When multiple flags are specified, the
 * different mapping types will be attempted in the order listed below
 * until one of them succeeds.
 *
 * MEMREMAP_WB - matches the default mapping for System RAM on
 * the architecture.  This is usually a read-allocate write-back cache.
 * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM,
 * memremap() will bypass establishing a new mapping and instead return
 * a pointer into the direct map.
 *
 * MEMREMAP_WT - establish a mapping whereby writes either bypass the
 * cache or are written through to memory and never exist in a
 * cache-dirty state with respect to program visibility.  Attempts to
 * map System RAM with this mapping type will fail.
 *
 * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
 * be coalesced together (e.g. in the CPU's write buffers), but the mapping
 * is otherwise uncached.  Attempts to map System RAM with this mapping
 * type will fail.
 */
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
        int is_ram = region_intersects(offset, size,
                                       IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
        void *addr = NULL;

        if (!flags)
                return NULL;

        if (is_ram == REGION_MIXED) {
                WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
                                &offset, (unsigned long) size);
                return NULL;
        }

        /* Try all mapping types requested until one returns non-NULL */
        if (flags & MEMREMAP_WB) {
                /*
                 * MEMREMAP_WB is special in that it can be satisfied
                 * from the direct map.  Some archs depend on the
                 * capability of memremap() to autodetect cases where
                 * the requested range is potentially in System RAM.
                 */
                if (is_ram == REGION_INTERSECTS)
                        addr = try_ram_remap(offset, size, flags);
                if (!addr)
                        addr = arch_memremap_wb(offset, size);
        }

        /*
         * If we don't have a mapping yet and other request flags are
         * present then we will be attempting to establish a new virtual
         * address mapping.  Enforce that this mapping is not aliasing
         * System RAM.
         */
        if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
                WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
                                &offset, (unsigned long) size);
                return NULL;
        }

        if (!addr && (flags & MEMREMAP_WT))
                addr = ioremap_wt(offset, size);

        if (!addr && (flags & MEMREMAP_WC))
                addr = ioremap_wc(offset, size);

        return addr;
}
EXPORT_SYMBOL(memremap);

void memunmap(void *addr)
{
        if (is_vmalloc_addr(addr))
                iounmap((void __iomem *) addr);
}
EXPORT_SYMBOL(memunmap);

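/*
 * Usage sketch (illustrative only, not part of this file): a driver that
 * knows its region has no I/O side effects could ask for a write-back
 * mapping and fall back to write-combine when the architecture cannot
 * provide WB for the range.  The resource 'res' is assumed to come from
 * the driver's probe path.
 *
 *        void *va = memremap(res->start, resource_size(res),
 *                        MEMREMAP_WB | MEMREMAP_WC);
 *        if (!va)
 *                return -ENOMEM;
 *        ...
 *        memunmap(va);
 */
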
static void devm_memremap_release(struct device *dev, void *res)
{
        memunmap(*(void **)res);
}

static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
        return *(void **)res == match_data;
}

void *devm_memremap(struct device *dev, resource_size_t offset,
                size_t size, unsigned long flags)
{
        void **ptr, *addr;

        ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
                        dev_to_node(dev));
        if (!ptr)
                return ERR_PTR(-ENOMEM);

        addr = memremap(offset, size, flags);
        if (addr) {
                *ptr = addr;
                devres_add(dev, ptr);
        } else {
                devres_free(ptr);
                return ERR_PTR(-ENXIO);
        }

        return addr;
}
EXPORT_SYMBOL(devm_memremap);

void devm_memunmap(struct device *dev, void *addr)
{
        WARN_ON(devres_release(dev, devm_memremap_release,
                                devm_memremap_match, addr));
}
EXPORT_SYMBOL(devm_memunmap);

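/*
 * Usage sketch (hypothetical caller, not part of this file): the devres
 * managed variant ties the mapping lifetime to @dev, so the error return
 * is an ERR_PTR() rather than NULL, and an explicit devm_memunmap() is
 * only needed for early teardown.  'pdev' and 'res' are assumed driver
 * state.
 *
 *        void *va = devm_memremap(&pdev->dev, res->start,
 *                        resource_size(res), MEMREMAP_WB);
 *        if (IS_ERR(va))
 *                return PTR_ERR(va);
 */
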
#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)

static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
        unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
        unsigned long nr_pages, mask;

        nr_pages = PHYS_PFN(resource_size(res));
        if (nr_pages == pgoff)
                return ULONG_MAX;

        /*
         * What is the largest aligned power-of-2 range available from
         * this resource pgoff to the end of the resource range,
         * considering the alignment of the current pgoff?
         */
        mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
        if (!mask)
                return ULONG_MAX;

        return find_first_bit(&mask, BITS_PER_LONG);
}

#define foreach_order_pgoff(res, order, pgoff) \
        for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
                        pgoff += 1UL << order, order = order_at((res), pgoff))

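/*
 * Worked example for order_at()/foreach_order_pgoff() (numbers are only
 * illustrative): with a current physical pgoff of 0x208 and 0x100 pages
 * remaining in the resource, mask = 0x208 | 0x100 = 0x308, whose lowest
 * set bit is bit 3, so the next radix-tree entry covers a 2^3 == 8 page
 * chunk -- the largest power-of-2 range that is both aligned to the
 * current pgoff and contained in the remaining resource.
 */
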
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
int device_private_entry_fault(struct vm_area_struct *vma,
                       unsigned long addr,
                       swp_entry_t entry,
                       unsigned int flags,
                       pmd_t *pmdp)
{
        struct page *page = device_private_entry_to_page(entry);

        /*
         * The page_fault() callback must migrate the page back to system
         * memory so that the CPU can access it. This might fail for various
         * reasons (device issue, device was unsafely unplugged, ...). When
         * such error conditions happen, the callback must return
         * VM_FAULT_SIGBUS.
         *
         * Note that because memory cgroup charges are accounted to the device
         * memory, this should never fail because of memory restrictions (but
         * allocation of a regular system page might still fail because we are
         * out of memory).
         *
         * There is a more in-depth description of what that callback can and
         * cannot do in include/linux/memremap.h.
         */
        return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
}
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */

static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
{
        unsigned long pgoff, order;

        mutex_lock(&pgmap_lock);
        foreach_order_pgoff(res, order, pgoff) {
                if (pgoff >= end_pgoff)
                        break;
                radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
        }
        mutex_unlock(&pgmap_lock);

        synchronize_rcu();
}

static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
        const struct resource *res = &pgmap->res;
        struct vmem_altmap *altmap = &pgmap->altmap;
        unsigned long pfn;

        pfn = res->start >> PAGE_SHIFT;
        if (pgmap->altmap_valid)
                pfn += vmem_altmap_offset(altmap);
        return pfn;
}

static unsigned long pfn_end(struct dev_pagemap *pgmap)
{
        const struct resource *res = &pgmap->res;

        return (res->start + resource_size(res)) >> PAGE_SHIFT;
}

static unsigned long pfn_next(unsigned long pfn)
{
        if (pfn % 1024 == 0)
                cond_resched();
        return pfn + 1;
}

#define for_each_device_pfn(pfn, map) \
        for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))

static void devm_memremap_pages_release(void *data)
{
        struct dev_pagemap *pgmap = data;
        struct device *dev = pgmap->dev;
        struct resource *res = &pgmap->res;
        resource_size_t align_start, align_size;
        unsigned long pfn;

        for_each_device_pfn(pfn, pgmap)
                put_page(pfn_to_page(pfn));

        if (percpu_ref_tryget_live(pgmap->ref)) {
                dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
                percpu_ref_put(pgmap->ref);
        }

        /* pages are dead and unused, undo the arch mapping */
        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
                - align_start;

        mem_hotplug_begin();
        arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
                        &pgmap->altmap : NULL);
        mem_hotplug_done();

        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
        pgmap_radix_release(res, -1);
        dev_WARN_ONCE(dev, pgmap->altmap.alloc,
                      "%s: failed to free all reserved pages\n", __func__);
}

/**
 * devm_memremap_pages - remap and provide memmap backing for the given resource
 * @dev: hosting device for @pgmap
 * @pgmap: pointer to a struct dev_pagemap
 *
 * Notes:
 * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
 *    by the caller before passing it to this function.
 *
 * 2/ The altmap field may optionally be initialized, in which case altmap_valid
 *    must be set to true.
 *
 * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
 *    time (or the devm release event). The expected order of events is that
 *    ref has been through percpu_ref_kill() before devm_memremap_pages_release();
 *    waiting for all references to be dropped and calling percpu_ref_exit()
 *    must occur after devm_memremap_pages_release().
 *
 * 4/ res is expected to be a host memory range that could feasibly be
 *    treated as a "System RAM" range, i.e. not a device mmio range, but
 *    this is not enforced.
 */
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
        resource_size_t align_start, align_size, align_end;
        struct vmem_altmap *altmap = pgmap->altmap_valid ?
                        &pgmap->altmap : NULL;
        struct resource *res = &pgmap->res;
        unsigned long pfn, pgoff, order;
        pgprot_t pgprot = PAGE_KERNEL;
        int error, nid, is_ram;

        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
                - align_start;
        is_ram = region_intersects(align_start, align_size,
                IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);

        if (is_ram == REGION_MIXED) {
                WARN_ONCE(1, "%s attempted on mixed region %pr\n",
                                __func__, res);
                return ERR_PTR(-ENXIO);
        }

        if (is_ram == REGION_INTERSECTS)
                return __va(res->start);

        if (!pgmap->ref)
                return ERR_PTR(-EINVAL);

        pgmap->dev = dev;

        mutex_lock(&pgmap_lock);
        error = 0;
        align_end = align_start + align_size - 1;

        foreach_order_pgoff(res, order, pgoff) {
                error = __radix_tree_insert(&pgmap_radix,
                                PHYS_PFN(res->start) + pgoff, order, pgmap);
                if (error) {
                        dev_err(dev, "%s: failed: %d\n", __func__, error);
                        break;
                }
        }
        mutex_unlock(&pgmap_lock);
        if (error)
                goto err_radix;

        nid = dev_to_node(dev);
        if (nid < 0)
                nid = numa_mem_id();

        error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
                        align_size);
        if (error)
                goto err_pfn_remap;

        mem_hotplug_begin();
        error = arch_add_memory(nid, align_start, align_size, altmap, false);
        if (!error)
                move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
                                        align_start >> PAGE_SHIFT,
                                        align_size >> PAGE_SHIFT, altmap);
        mem_hotplug_done();
        if (error)
                goto err_add_memory;

        for_each_device_pfn(pfn, pgmap) {
                struct page *page = pfn_to_page(pfn);

                /*
                 * ZONE_DEVICE pages union ->lru with a ->pgmap back
                 * pointer.  It is a bug if a ZONE_DEVICE page is ever
                 * freed or placed on a driver-private list.  Seed the
                 * storage with LIST_POISON* values.
                 */
                list_del(&page->lru);
                page->pgmap = pgmap;
                percpu_ref_get(pgmap->ref);
        }

        devm_add_action(dev, devm_memremap_pages_release, pgmap);

        return __va(res->start);

 err_add_memory:
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
 err_pfn_remap:
 err_radix:
        pgmap_radix_release(res, pgoff);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);

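/*
 * Usage sketch (hypothetical pmem-style caller, not part of this file):
 * per the notes above, the caller fills in at least res and ref (already
 * percpu_ref_init()'d) before the call, and checks for an ERR_PTR() on
 * return.  'pmem' is an assumed driver structure.
 *
 *        pmem->pgmap.res = *res;
 *        pmem->pgmap.ref = &pmem->ref;
 *        pmem->pgmap.altmap_valid = false;
 *        addr = devm_memremap_pages(dev, &pmem->pgmap);
 *        if (IS_ERR(addr))
 *                return PTR_ERR(addr);
 */
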
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        /* number of pfns from base where pfn_to_page() is valid */
        return altmap->reserve + altmap->free;
}

void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
{
        altmap->alloc -= nr_pfns;
}

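/*
 * Illustrative numbers (not from this file): an altmap with reserve == 2
 * and free == 1024 makes vmem_altmap_offset() report 1026, so pfn_first()
 * skips the first 1026 pfns of the range -- the space the driver reserved
 * plus the space handed out for storing the struct pages themselves, with
 * altmap->alloc tracking how much of 'free' has actually been consumed.
 */
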
/**
 * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
 * @pfn: page frame number to look up a dev_pagemap for
 * @pgmap: optional known pgmap that already has a reference
 *
 * If @pgmap is non-NULL and covers @pfn it will be returned as-is.  If @pgmap
 * is non-NULL but does not cover @pfn the reference to it will be released.
 */
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap)
{
        resource_size_t phys = PFN_PHYS(pfn);

        /*
         * In the cached case we're already holding a live reference.
         */
        if (pgmap) {
                if (phys >= pgmap->res.start && phys <= pgmap->res.end)
                        return pgmap;
                put_dev_pagemap(pgmap);
        }

        /* fall back to slow path lookup */
        rcu_read_lock();
        pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
        if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
                pgmap = NULL;
        rcu_read_unlock();

        return pgmap;
}
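
/*
 * Usage sketch (hypothetical caller, not part of this file): when walking
 * a run of pfns, the pgmap returned for the previous pfn can be passed
 * back in so the radix-tree lookup is skipped while the pfn stays within
 * the same dev_pagemap:
 *
 *        struct dev_pagemap *pgmap = NULL;
 *
 *        for (pfn = start; pfn < end; pfn++) {
 *                pgmap = get_dev_pagemap(pfn, pgmap);
 *                if (!pgmap)
 *                        break;
 *                ...
 *        }
 *        if (pgmap)
 *                put_dev_pagemap(pgmap);
 */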
#endif /* CONFIG_ZONE_DEVICE */

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page(struct page *page)
{
        int count = page_ref_dec_return(page);

        /*
         * If the refcount is 1 then the page is free and its refcount is
         * stable as nobody else holds a reference on it.
         */
        if (count == 1) {
                /* Clear the Active bit in case of a parallel mark_page_accessed() */
                __ClearPageActive(page);
                __ClearPageWaiters(page);

                page->mapping = NULL;
                mem_cgroup_uncharge(page);

                page->pgmap->page_free(page, page->pgmap->data);
        } else if (!count)
                __put_page(page);
}
EXPORT_SYMBOL(put_zone_device_private_or_public_page);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */