linux/arch/x86/kernel/e820.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Low level x86 E820 memory map handling functions.
   4 *
   5 * The firmware and bootloader pass us the "E820 table", which is the primary
   6 * physical memory layout description available about x86 systems.
   7 *
   8 * The kernel takes the E820 memory layout and optionally modifies it with
   9 * quirks and other tweaks, and feeds that into the generic Linux memory
  10 * allocation code routines via a platform independent interface (memblock, etc.).
  11 */
  12#include <linux/crash_dump.h>
  13#include <linux/memblock.h>
  14#include <linux/suspend.h>
  15#include <linux/acpi.h>
  16#include <linux/firmware-map.h>
  17#include <linux/sort.h>
  18#include <linux/memory_hotplug.h>
  19
  20#include <asm/e820/api.h>
  21#include <asm/setup.h>
  22
  23/*
  24 * We organize the E820 table into three main data structures:
  25 *
  26 * - 'e820_table_firmware': the original firmware version passed to us by the
  27 *   bootloader - not modified by the kernel. It is composed of two parts:
  28 *   the first 128 E820 memory entries in boot_params.e820_table and the remaining
  29 *   (if any) entries of the SETUP_E820_EXT nodes. We use this to:
  30 *
  31 *       - inform the user about the firmware's notion of memory layout
  32 *         via /sys/firmware/memmap
  33 *
  34 *       - the hibernation code uses it to generate a kernel-independent MD5
  35 *         fingerprint of the physical memory layout of a system.
  36 *
  37 * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
  38 *   passed to us by the bootloader - the major difference between
  39 *   e820_table_firmware[] and this one is that, the latter marks the setup_data
  40 *   list created by the EFI boot stub as reserved, so that kexec can reuse the
  41 *   setup_data information in the second kernel. Besides, e820_table_kexec[]
  42 *   might also be modified by the kexec itself to fake a mptable.
  43 *   We use this to:
  44 *
  45 *       - kexec, which is a bootloader in disguise, uses the original E820
  46 *         layout to pass to the kexec-ed kernel. This way the original kernel
  47 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
  48 *         can have access to full memory - etc.
  49 *
  50 * - 'e820_table': this is the main E820 table that is massaged by the
  51 *   low level x86 platform code, or modified by boot parameters, before
  52 *   passed on to higher level MM layers.
  53 *
  54 * Once the E820 map has been converted to the standard Linux memory layout
  55 * information its role stops - modifying it has no effect and does not get
  56 * re-propagated. So its main role is a temporary bootstrap storage of firmware
  57 * specific memory layout data during early bootup.
  58 */
  59static struct e820_table e820_table_init                __initdata;
  60static struct e820_table e820_table_kexec_init          __initdata;
  61static struct e820_table e820_table_firmware_init       __initdata;
  62
  63struct e820_table *e820_table __refdata                 = &e820_table_init;
  64struct e820_table *e820_table_kexec __refdata           = &e820_table_kexec_init;
  65struct e820_table *e820_table_firmware __refdata        = &e820_table_firmware_init;
  66
  /*
   * Start address for PCI or other memory-mapped resources. The initial
   * value is replaced by e820__setup_pci_gap().
   */
  unsigned long pci_mem_start = 0xaeedbabe;
  #if defined(CONFIG_PCI)
  EXPORT_SYMBOL(pci_mem_start);
  #endif
  72
  73/*
  74 * This function checks if any part of the range <start,end> is mapped
  75 * with type.
  76 */
  77static bool _e820__mapped_any(struct e820_table *table,
  78                              u64 start, u64 end, enum e820_type type)
  79{
  80        int i;
  81
  82        for (i = 0; i < table->nr_entries; i++) {
  83                struct e820_entry *entry = &table->entries[i];
  84
  85                if (type && entry->type != type)
  86                        continue;
  87                if (entry->addr >= end || entry->addr + entry->size <= start)
  88                        continue;
  89                return true;
  90        }
  91        return false;
  92}
  93
  94bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
  95{
  96        return _e820__mapped_any(e820_table_firmware, start, end, type);
  97}
  98EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
  99
 100bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
 101{
 102        return _e820__mapped_any(e820_table, start, end, type);
 103}
 104EXPORT_SYMBOL_GPL(e820__mapped_any);
 105
 106/*
 107 * This function checks if the entire <start,end> range is mapped with 'type'.
 108 *
 109 * Note: this function only works correctly once the E820 table is sorted and
 110 * not-overlapping (at least for the range specified), which is the case normally.
 111 */
 112static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
 113                                             enum e820_type type)
 114{
 115        int i;
 116
 117        for (i = 0; i < e820_table->nr_entries; i++) {
 118                struct e820_entry *entry = &e820_table->entries[i];
 119
 120                if (type && entry->type != type)
 121                        continue;
 122
 123                /* Is the region (part) in overlap with the current region? */
 124                if (entry->addr >= end || entry->addr + entry->size <= start)
 125                        continue;
 126
 127                /*
 128                 * If the region is at the beginning of <start,end> we move
 129                 * 'start' to the end of the region since it's ok until there
 130                 */
 131                if (entry->addr <= start)
 132                        start = entry->addr + entry->size;
 133
 134                /*
 135                 * If 'start' is now at or beyond 'end', we're done, full
 136                 * coverage of the desired range exists:
 137                 */
 138                if (start >= end)
 139                        return entry;
 140        }
 141
 142        return NULL;
 143}
 144
 145/*
 146 * This function checks if the entire range <start,end> is mapped with type.
 147 */
 148bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
 149{
 150        return __e820__mapped_all(start, end, type);
 151}
 152
 153/*
 154 * This function returns the type associated with the range <start,end>.
 155 */
 156int e820__get_entry_type(u64 start, u64 end)
 157{
 158        struct e820_entry *entry = __e820__mapped_all(start, end, 0);
 159
 160        return entry ? entry->type : -EINVAL;
 161}
 162
 163/*
 164 * Add a memory region to the kernel E820 map.
 165 */
 166static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
 167{
 168        int x = table->nr_entries;
 169
 170        if (x >= ARRAY_SIZE(table->entries)) {
 171                pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
 172                       start, start + size - 1);
 173                return;
 174        }
 175
 176        table->entries[x].addr = start;
 177        table->entries[x].size = size;
 178        table->entries[x].type = type;
 179        table->nr_entries++;
 180}
 181
 182void __init e820__range_add(u64 start, u64 size, enum e820_type type)
 183{
 184        __e820__range_add(e820_table, start, size, type);
 185}
 186
 187static void __init e820_print_type(enum e820_type type)
 188{
 189        switch (type) {
 190        case E820_TYPE_RAM:             /* Fall through: */
 191        case E820_TYPE_RESERVED_KERN:   pr_cont("usable");                      break;
 192        case E820_TYPE_RESERVED:        pr_cont("reserved");                    break;
 193        case E820_TYPE_ACPI:            pr_cont("ACPI data");                   break;
 194        case E820_TYPE_NVS:             pr_cont("ACPI NVS");                    break;
 195        case E820_TYPE_UNUSABLE:        pr_cont("unusable");                    break;
 196        case E820_TYPE_PMEM:            /* Fall through: */
 197        case E820_TYPE_PRAM:            pr_cont("persistent (type %u)", type);  break;
 198        default:                        pr_cont("type %u", type);               break;
 199        }
 200}
 201
 202void __init e820__print_table(char *who)
 203{
 204        int i;
 205
 206        for (i = 0; i < e820_table->nr_entries; i++) {
 207                pr_info("%s: [mem %#018Lx-%#018Lx] ",
 208                        who,
 209                        e820_table->entries[i].addr,
 210                        e820_table->entries[i].addr + e820_table->entries[i].size - 1);
 211
 212                e820_print_type(e820_table->entries[i].type);
 213                pr_cont("\n");
 214        }
 215}
 216
 217/*
 218 * Sanitize an E820 map.
 219 *
 220 * Some E820 layouts include overlapping entries. The following
 221 * replaces the original E820 map with a new one, removing overlaps,
 222 * and resolving conflicting memory types in favor of highest
 223 * numbered type.
 224 *
 225 * The input parameter 'entries' points to an array of 'struct
 226 * e820_entry' which on entry has elements in the range [0, *nr_entries)
 227 * valid, and which has space for up to max_nr_entries entries.
 228 * On return, the original entries will have been overwritten in place by
 229 * the resulting sanitized E820 map, starting at 'entries'.
 230 *
 231 * The integer pointed to by nr_entries must be valid on entry (the
 232 * current number of valid entries located at 'entries'). If the
 233 * sanitizing succeeds the *nr_entries will be updated with the new
 234 * number of valid entries (something no more than max_nr_entries).
 235 *
 236 * The return value from e820__update_table() is zero if it
 237 * successfully 'sanitized' the map entries passed in, and is -1
 238 * if it did nothing, which can happen if either of (1) it was
 239 * only passed one map entry, or (2) any of the input map entries
 240 * were invalid (start + size < start, meaning that the size was
 241 * so big the described memory range wrapped around through zero.)
 242 *
 243 *      Visually we're performing the following
 244 *      (1,2,3,4 = memory types)...
 245 *
 246 *      Sample memory map (w/overlaps):
 247 *         ____22__________________
 248 *         ______________________4_
 249 *         ____1111________________
 250 *         _44_____________________
 251 *         11111111________________
 252 *         ____________________33__
 253 *         ___________44___________
 254 *         __________33333_________
 255 *         ______________22________
 256 *         ___________________2222_
 257 *         _________111111111______
 258 *         _____________________11_
 259 *         _________________4______
 260 *
 261 *      Sanitized equivalent (no overlap):
 262 *         1_______________________
 263 *         _44_____________________
 264 *         ___1____________________
 265 *         ____22__________________
 266 *         ______11________________
 267 *         _________1______________
 268 *         __________3_____________
 269 *         ___________44___________
 270 *         _____________33_________
 271 *         _______________2________
 272 *         ________________1_______
 273 *         _________________4______
 274 *         ___________________2____
 275 *         ____________________33__
 276 *         ______________________4_
 277 */
 /*
  * One boundary (start or end address) of an E820 entry, as collected and
  * sorted by e820__update_table().
  */
 struct change_member {
         struct e820_entry       *entry; /* Original entry this boundary belongs to */
         unsigned long long      addr;   /* Address of this change point */
 };
 284
 285static struct change_member     change_point_list[2*E820_MAX_ENTRIES]   __initdata;
 286static struct change_member     *change_point[2*E820_MAX_ENTRIES]       __initdata;
 287static struct e820_entry        *overlap_list[E820_MAX_ENTRIES]         __initdata;
 288static struct e820_entry        new_entries[E820_MAX_ENTRIES]           __initdata;
 289
 290static int __init cpcompare(const void *a, const void *b)
 291{
 292        struct change_member * const *app = a, * const *bpp = b;
 293        const struct change_member *ap = *app, *bp = *bpp;
 294
 295        /*
 296         * Inputs are pointers to two elements of change_point[].  If their
 297         * addresses are not equal, their difference dominates.  If the addresses
 298         * are equal, then consider one that represents the end of its region
 299         * to be greater than one that does not.
 300         */
 301        if (ap->addr != bp->addr)
 302                return ap->addr > bp->addr ? 1 : -1;
 303
 304        return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
 305}
 306
 307int __init e820__update_table(struct e820_table *table)
 308{
 309        struct e820_entry *entries = table->entries;
 310        u32 max_nr_entries = ARRAY_SIZE(table->entries);
 311        enum e820_type current_type, last_type;
 312        unsigned long long last_addr;
 313        u32 new_nr_entries, overlap_entries;
 314        u32 i, chg_idx, chg_nr;
 315
 316        /* If there's only one memory region, don't bother: */
 317        if (table->nr_entries < 2)
 318                return -1;
 319
 320        BUG_ON(table->nr_entries > max_nr_entries);
 321
 322        /* Bail out if we find any unreasonable addresses in the map: */
 323        for (i = 0; i < table->nr_entries; i++) {
 324                if (entries[i].addr + entries[i].size < entries[i].addr)
 325                        return -1;
 326        }
 327
 328        /* Create pointers for initial change-point information (for sorting): */
 329        for (i = 0; i < 2 * table->nr_entries; i++)
 330                change_point[i] = &change_point_list[i];
 331
 332        /*
 333         * Record all known change-points (starting and ending addresses),
 334         * omitting empty memory regions:
 335         */
 336        chg_idx = 0;
 337        for (i = 0; i < table->nr_entries; i++) {
 338                if (entries[i].size != 0) {
 339                        change_point[chg_idx]->addr     = entries[i].addr;
 340                        change_point[chg_idx++]->entry  = &entries[i];
 341                        change_point[chg_idx]->addr     = entries[i].addr + entries[i].size;
 342                        change_point[chg_idx++]->entry  = &entries[i];
 343                }
 344        }
 345        chg_nr = chg_idx;
 346
 347        /* Sort change-point list by memory addresses (low -> high): */
 348        sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
 349
 350        /* Create a new memory map, removing overlaps: */
 351        overlap_entries = 0;     /* Number of entries in the overlap table */
 352        new_nr_entries = 0;      /* Index for creating new map entries */
 353        last_type = 0;           /* Start with undefined memory type */
 354        last_addr = 0;           /* Start with 0 as last starting address */
 355
 356        /* Loop through change-points, determining effect on the new map: */
 357        for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
 358                /* Keep track of all overlapping entries */
 359                if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
 360                        /* Add map entry to overlap list (> 1 entry implies an overlap) */
 361                        overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
 362                } else {
 363                        /* Remove entry from list (order independent, so swap with last): */
 364                        for (i = 0; i < overlap_entries; i++) {
 365                                if (overlap_list[i] == change_point[chg_idx]->entry)
 366                                        overlap_list[i] = overlap_list[overlap_entries-1];
 367                        }
 368                        overlap_entries--;
 369                }
 370                /*
 371                 * If there are overlapping entries, decide which
 372                 * "type" to use (larger value takes precedence --
 373                 * 1=usable, 2,3,4,4+=unusable)
 374                 */
 375                current_type = 0;
 376                for (i = 0; i < overlap_entries; i++) {
 377                        if (overlap_list[i]->type > current_type)
 378                                current_type = overlap_list[i]->type;
 379                }
 380
 381                /* Continue building up new map based on this information: */
 382                if (current_type != last_type || current_type == E820_TYPE_PRAM) {
 383                        if (last_type != 0)      {
 384                                new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
 385                                /* Move forward only if the new size was non-zero: */
 386                                if (new_entries[new_nr_entries].size != 0)
 387                                        /* No more space left for new entries? */
 388                                        if (++new_nr_entries >= max_nr_entries)
 389                                                break;
 390                        }
 391                        if (current_type != 0)  {
 392                                new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
 393                                new_entries[new_nr_entries].type = current_type;
 394                                last_addr = change_point[chg_idx]->addr;
 395                        }
 396                        last_type = current_type;
 397                }
 398        }
 399
 400        /* Copy the new entries into the original location: */
 401        memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
 402        table->nr_entries = new_nr_entries;
 403
 404        return 0;
 405}
 406
 407static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 408{
 409        struct boot_e820_entry *entry = entries;
 410
 411        while (nr_entries) {
 412                u64 start = entry->addr;
 413                u64 size = entry->size;
 414                u64 end = start + size - 1;
 415                u32 type = entry->type;
 416
 417                /* Ignore the entry on 64-bit overflow: */
 418                if (start > end && likely(size))
 419                        return -1;
 420
 421                e820__range_add(start, size, type);
 422
 423                entry++;
 424                nr_entries--;
 425        }
 426        return 0;
 427}
 428
 429/*
 430 * Copy the BIOS E820 map into a safe place.
 431 *
 432 * Sanity-check it while we're at it..
 433 *
 434 * If we're lucky and live on a modern system, the setup code
 435 * will have given us a memory map that we can use to properly
 436 * set up memory.  If we aren't, we'll fake a memory map.
 437 */
 438static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 439{
 440        /* Only one memory region (or negative)? Ignore it */
 441        if (nr_entries < 2)
 442                return -1;
 443
 444        return __append_e820_table(entries, nr_entries);
 445}
 446
 447static u64 __init
 448__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 449{
 450        u64 end;
 451        unsigned int i;
 452        u64 real_updated_size = 0;
 453
 454        BUG_ON(old_type == new_type);
 455
 456        if (size > (ULLONG_MAX - start))
 457                size = ULLONG_MAX - start;
 458
 459        end = start + size;
 460        printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
 461        e820_print_type(old_type);
 462        pr_cont(" ==> ");
 463        e820_print_type(new_type);
 464        pr_cont("\n");
 465
 466        for (i = 0; i < table->nr_entries; i++) {
 467                struct e820_entry *entry = &table->entries[i];
 468                u64 final_start, final_end;
 469                u64 entry_end;
 470
 471                if (entry->type != old_type)
 472                        continue;
 473
 474                entry_end = entry->addr + entry->size;
 475
 476                /* Completely covered by new range? */
 477                if (entry->addr >= start && entry_end <= end) {
 478                        entry->type = new_type;
 479                        real_updated_size += entry->size;
 480                        continue;
 481                }
 482
 483                /* New range is completely covered? */
 484                if (entry->addr < start && entry_end > end) {
 485                        __e820__range_add(table, start, size, new_type);
 486                        __e820__range_add(table, end, entry_end - end, entry->type);
 487                        entry->size = start - entry->addr;
 488                        real_updated_size += size;
 489                        continue;
 490                }
 491
 492                /* Partially covered: */
 493                final_start = max(start, entry->addr);
 494                final_end = min(end, entry_end);
 495                if (final_start >= final_end)
 496                        continue;
 497
 498                __e820__range_add(table, final_start, final_end - final_start, new_type);
 499
 500                real_updated_size += final_end - final_start;
 501
 502                /*
 503                 * Left range could be head or tail, so need to update
 504                 * its size first:
 505                 */
 506                entry->size -= final_end - final_start;
 507                if (entry->addr < final_start)
 508                        continue;
 509
 510                entry->addr = final_end;
 511        }
 512        return real_updated_size;
 513}
 514
 515u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 516{
 517        return __e820__range_update(e820_table, start, size, old_type, new_type);
 518}
 519
 520static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
 521{
 522        return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
 523}
 524
 525/* Remove a range of memory from the E820 table: */
 526u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
 527{
 528        int i;
 529        u64 end;
 530        u64 real_removed_size = 0;
 531
 532        if (size > (ULLONG_MAX - start))
 533                size = ULLONG_MAX - start;
 534
 535        end = start + size;
 536        printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
 537        if (check_type)
 538                e820_print_type(old_type);
 539        pr_cont("\n");
 540
 541        for (i = 0; i < e820_table->nr_entries; i++) {
 542                struct e820_entry *entry = &e820_table->entries[i];
 543                u64 final_start, final_end;
 544                u64 entry_end;
 545
 546                if (check_type && entry->type != old_type)
 547                        continue;
 548
 549                entry_end = entry->addr + entry->size;
 550
 551                /* Completely covered? */
 552                if (entry->addr >= start && entry_end <= end) {
 553                        real_removed_size += entry->size;
 554                        memset(entry, 0, sizeof(*entry));
 555                        continue;
 556                }
 557
 558                /* Is the new range completely covered? */
 559                if (entry->addr < start && entry_end > end) {
 560                        e820__range_add(end, entry_end - end, entry->type);
 561                        entry->size = start - entry->addr;
 562                        real_removed_size += size;
 563                        continue;
 564                }
 565
 566                /* Partially covered: */
 567                final_start = max(start, entry->addr);
 568                final_end = min(end, entry_end);
 569                if (final_start >= final_end)
 570                        continue;
 571
 572                real_removed_size += final_end - final_start;
 573
 574                /*
 575                 * Left range could be head or tail, so need to update
 576                 * the size first:
 577                 */
 578                entry->size -= final_end - final_start;
 579                if (entry->addr < final_start)
 580                        continue;
 581
 582                entry->addr = final_end;
 583        }
 584        return real_removed_size;
 585}
 586
 587void __init e820__update_table_print(void)
 588{
 589        if (e820__update_table(e820_table))
 590                return;
 591
 592        pr_info("modified physical RAM map:\n");
 593        e820__print_table("modified");
 594}
 595
 596static void __init e820__update_table_kexec(void)
 597{
 598        e820__update_table(e820_table_kexec);
 599}
 600
 601#define MAX_GAP_END 0x100000000ull
 602
 603/*
 604 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
 605 */
 606static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
 607{
 608        unsigned long long last = MAX_GAP_END;
 609        int i = e820_table->nr_entries;
 610        int found = 0;
 611
 612        while (--i >= 0) {
 613                unsigned long long start = e820_table->entries[i].addr;
 614                unsigned long long end = start + e820_table->entries[i].size;
 615
 616                /*
 617                 * Since "last" is at most 4GB, we know we'll
 618                 * fit in 32 bits if this condition is true:
 619                 */
 620                if (last > end) {
 621                        unsigned long gap = last - end;
 622
 623                        if (gap >= *gapsize) {
 624                                *gapsize = gap;
 625                                *gapstart = end;
 626                                found = 1;
 627                        }
 628                }
 629                if (start < last)
 630                        last = start;
 631        }
 632        return found;
 633}
 634
 635/*
 636 * Search for the biggest gap in the low 32 bits of the E820
 637 * memory space. We pass this space to the PCI subsystem, so
 638 * that it can assign MMIO resources for hotplug or
 639 * unconfigured devices in.
 640 *
 641 * Hopefully the BIOS let enough space left.
 642 */
 643__init void e820__setup_pci_gap(void)
 644{
 645        unsigned long gapstart, gapsize;
 646        int found;
 647
 648        gapsize = 0x400000;
 649        found  = e820_search_gap(&gapstart, &gapsize);
 650
 651        if (!found) {
 652#ifdef CONFIG_X86_64
 653                gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 654                pr_err("Cannot find an available gap in the 32-bit address range\n");
 655                pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
 656#else
 657                gapstart = 0x10000000;
 658#endif
 659        }
 660
 661        /*
 662         * e820__reserve_resources_late() protects stolen RAM already:
 663         */
 664        pci_mem_start = gapstart;
 665
 666        pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
 667                gapstart, gapstart + gapsize - 1);
 668}
 669
 670/*
 671 * Called late during init, in free_initmem().
 672 *
 673 * Initial e820_table and e820_table_kexec are largish __initdata arrays.
 674 *
 675 * Copy them to a (usually much smaller) dynamically allocated area that is
 676 * sized precisely after the number of e820 entries.
 677 *
 678 * This is done after we've performed all the fixes and tweaks to the tables.
 679 * All functions which modify them are __init functions, which won't exist
 680 * after free_initmem().
 681 */
 682__init void e820__reallocate_tables(void)
 683{
 684        struct e820_table *n;
 685        int size;
 686
 687        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
 688        n = kmemdup(e820_table, size, GFP_KERNEL);
 689        BUG_ON(!n);
 690        e820_table = n;
 691
 692        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
 693        n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
 694        BUG_ON(!n);
 695        e820_table_kexec = n;
 696
 697        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
 698        n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
 699        BUG_ON(!n);
 700        e820_table_firmware = n;
 701}
 702
 703/*
 704 * Because of the small fixed size of struct boot_params, only the first
 705 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 706 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 707 * struct setup_data, which is parsed here.
 708 */
 709void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
 710{
 711        int entries;
 712        struct boot_e820_entry *extmap;
 713        struct setup_data *sdata;
 714
 715        sdata = early_memremap(phys_addr, data_len);
 716        entries = sdata->len / sizeof(*extmap);
 717        extmap = (struct boot_e820_entry *)(sdata->data);
 718
 719        __append_e820_table(extmap, entries);
 720        e820__update_table(e820_table);
 721
 722        memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
 723        memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 724
 725        early_memunmap(sdata, data_len);
 726        pr_info("extended physical RAM map:\n");
 727        e820__print_table("extended");
 728}
 729
 730/*
 731 * Find the ranges of physical addresses that do not correspond to
 732 * E820 RAM areas and register the corresponding pages as 'nosave' for
 733 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
 734 *
 735 * This function requires the E820 map to be sorted and without any
 736 * overlapping entries.
 737 */
 738void __init e820__register_nosave_regions(unsigned long limit_pfn)
 739{
 740        int i;
 741        unsigned long pfn = 0;
 742
 743        for (i = 0; i < e820_table->nr_entries; i++) {
 744                struct e820_entry *entry = &e820_table->entries[i];
 745
 746                if (pfn < PFN_UP(entry->addr))
 747                        register_nosave_region(pfn, PFN_UP(entry->addr));
 748
 749                pfn = PFN_DOWN(entry->addr + entry->size);
 750
 751                if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
 752                        register_nosave_region(PFN_UP(entry->addr), pfn);
 753
 754                if (pfn >= limit_pfn)
 755                        break;
 756        }
 757}
 758
 #ifdef CONFIG_ACPI
 /*
  * Register ACPI NVS memory regions, so that we can save/restore them during
  * hibernation and the subsequent resume.
  *
  * Runs as a core_initcall and always returns 0.
  */
 static int __init e820__register_nvs_regions(void)
 {
         struct e820_entry *e = e820_table->entries;
         struct e820_entry *stop = e + e820_table->nr_entries;

         for (; e < stop; e++) {
                 if (e->type != E820_TYPE_NVS)
                         continue;

                 acpi_nvs_register(e->addr, e->size);
         }

         return 0;
 }
 core_initcall(e820__register_nvs_regions);
 #endif
 779
 780/*
 781 * Allocate the requested number of bytes with the requsted alignment
 782 * and return (the physical address) to the caller. Also register this
 783 * range in the 'kexec' E820 table as a reserved range.
 784 *
 785 * This allows kexec to fake a new mptable, as if it came from the real
 786 * system.
 787 */
 788u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 789{
 790        u64 addr;
 791
 792        addr = memblock_phys_alloc(size, align);
 793        if (addr) {
 794                e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
 795                pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
 796                e820__update_table_kexec();
 797        }
 798
 799        return addr;
 800}
 801
 802#ifdef CONFIG_X86_32
 803# ifdef CONFIG_X86_PAE
 804#  define MAX_ARCH_PFN          (1ULL<<(36-PAGE_SHIFT))
 805# else
 806#  define MAX_ARCH_PFN          (1ULL<<(32-PAGE_SHIFT))
 807# endif
 808#else /* CONFIG_X86_32 */
 809# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
 810#endif
 811
 812/*
 813 * Find the highest page frame number we have available
 814 */
 815static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
 816{
 817        int i;
 818        unsigned long last_pfn = 0;
 819        unsigned long max_arch_pfn = MAX_ARCH_PFN;
 820
 821        for (i = 0; i < e820_table->nr_entries; i++) {
 822                struct e820_entry *entry = &e820_table->entries[i];
 823                unsigned long start_pfn;
 824                unsigned long end_pfn;
 825
 826                if (entry->type != type)
 827                        continue;
 828
 829                start_pfn = entry->addr >> PAGE_SHIFT;
 830                end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
 831
 832                if (start_pfn >= limit_pfn)
 833                        continue;
 834                if (end_pfn > limit_pfn) {
 835                        last_pfn = limit_pfn;
 836                        break;
 837                }
 838                if (end_pfn > last_pfn)
 839                        last_pfn = end_pfn;
 840        }
 841
 842        if (last_pfn > max_arch_pfn)
 843                last_pfn = max_arch_pfn;
 844
 845        pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
 846                last_pfn, max_arch_pfn);
 847        return last_pfn;
 848}
 849
 850unsigned long __init e820__end_of_ram_pfn(void)
 851{
 852        return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
 853}
 854
 855unsigned long __init e820__end_of_low_ram_pfn(void)
 856{
 857        return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
 858}
 859
 860static void __init early_panic(char *msg)
 861{
 862        early_printk(msg);
 863        panic(msg);
 864}
 865
 866static int userdef __initdata;
 867
 868/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
 869static int __init parse_memopt(char *p)
 870{
 871        u64 mem_size;
 872
 873        if (!p)
 874                return -EINVAL;
 875
 876        if (!strcmp(p, "nopentium")) {
 877#ifdef CONFIG_X86_32
 878                setup_clear_cpu_cap(X86_FEATURE_PSE);
 879                return 0;
 880#else
 881                pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
 882                return -EINVAL;
 883#endif
 884        }
 885
 886        userdef = 1;
 887        mem_size = memparse(p, &p);
 888
 889        /* Don't remove all memory when getting "mem={invalid}" parameter: */
 890        if (mem_size == 0)
 891                return -EINVAL;
 892
 893        e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 894
 895#ifdef CONFIG_MEMORY_HOTPLUG
 896        max_mem_size = mem_size;
 897#endif
 898
 899        return 0;
 900}
 901early_param("mem", parse_memopt);
 902
 903static int __init parse_memmap_one(char *p)
 904{
 905        char *oldp;
 906        u64 start_at, mem_size;
 907
 908        if (!p)
 909                return -EINVAL;
 910
 911        if (!strncmp(p, "exactmap", 8)) {
 912#ifdef CONFIG_CRASH_DUMP
 913                /*
 914                 * If we are doing a crash dump, we still need to know
 915                 * the real memory size before the original memory map is
 916                 * reset.
 917                 */
 918                saved_max_pfn = e820__end_of_ram_pfn();
 919#endif
 920                e820_table->nr_entries = 0;
 921                userdef = 1;
 922                return 0;
 923        }
 924
 925        oldp = p;
 926        mem_size = memparse(p, &p);
 927        if (p == oldp)
 928                return -EINVAL;
 929
 930        userdef = 1;
 931        if (*p == '@') {
 932                start_at = memparse(p+1, &p);
 933                e820__range_add(start_at, mem_size, E820_TYPE_RAM);
 934        } else if (*p == '#') {
 935                start_at = memparse(p+1, &p);
 936                e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
 937        } else if (*p == '$') {
 938                start_at = memparse(p+1, &p);
 939                e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
 940        } else if (*p == '!') {
 941                start_at = memparse(p+1, &p);
 942                e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
 943        } else if (*p == '%') {
 944                enum e820_type from = 0, to = 0;
 945
 946                start_at = memparse(p + 1, &p);
 947                if (*p == '-')
 948                        from = simple_strtoull(p + 1, &p, 0);
 949                if (*p == '+')
 950                        to = simple_strtoull(p + 1, &p, 0);
 951                if (*p != '\0')
 952                        return -EINVAL;
 953                if (from && to)
 954                        e820__range_update(start_at, mem_size, from, to);
 955                else if (to)
 956                        e820__range_add(start_at, mem_size, to);
 957                else if (from)
 958                        e820__range_remove(start_at, mem_size, from, 1);
 959                else
 960                        e820__range_remove(start_at, mem_size, 0, 0);
 961        } else {
 962                e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 963        }
 964
 965        return *p == '\0' ? 0 : -EINVAL;
 966}
 967
 968static int __init parse_memmap_opt(char *str)
 969{
 970        while (str) {
 971                char *k = strchr(str, ',');
 972
 973                if (k)
 974                        *k++ = 0;
 975
 976                parse_memmap_one(str);
 977                str = k;
 978        }
 979
 980        return 0;
 981}
 982early_param("memmap", parse_memmap_opt);
 983
 984/*
 985 * Reserve all entries from the bootloader's extensible data nodes list,
 986 * because if present we are going to use it later on to fetch e820
 987 * entries from it:
 988 */
 989void __init e820__reserve_setup_data(void)
 990{
 991        struct setup_data *data;
 992        u64 pa_data;
 993
 994        pa_data = boot_params.hdr.setup_data;
 995        if (!pa_data)
 996                return;
 997
 998        while (pa_data) {
 999                data = early_memremap(pa_data, sizeof(*data));
1000                e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1001                e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1002                pa_data = data->next;
1003                early_memunmap(data, sizeof(*data));
1004        }
1005
1006        e820__update_table(e820_table);
1007        e820__update_table(e820_table_kexec);
1008
1009        pr_info("extended physical RAM map:\n");
1010        e820__print_table("reserve setup_data");
1011}
1012
1013/*
1014 * Called after parse_early_param(), after early parameters (such as mem=)
1015 * have been processed, in which case we already have an E820 table filled in
1016 * via the parameter callback function(s), but it's not sorted and printed yet:
1017 */
1018void __init e820__finish_early_params(void)
1019{
1020        if (userdef) {
1021                if (e820__update_table(e820_table) < 0)
1022                        early_panic("Invalid user supplied memory map");
1023
1024                pr_info("user-defined physical RAM map:\n");
1025                e820__print_table("user");
1026        }
1027}
1028
1029static const char *__init e820_type_to_string(struct e820_entry *entry)
1030{
1031        switch (entry->type) {
1032        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1033        case E820_TYPE_RAM:             return "System RAM";
1034        case E820_TYPE_ACPI:            return "ACPI Tables";
1035        case E820_TYPE_NVS:             return "ACPI Non-volatile Storage";
1036        case E820_TYPE_UNUSABLE:        return "Unusable memory";
1037        case E820_TYPE_PRAM:            return "Persistent Memory (legacy)";
1038        case E820_TYPE_PMEM:            return "Persistent Memory";
1039        case E820_TYPE_RESERVED:        return "Reserved";
1040        default:                        return "Unknown E820 type";
1041        }
1042}
1043
1044static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
1045{
1046        switch (entry->type) {
1047        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1048        case E820_TYPE_RAM:             return IORESOURCE_SYSTEM_RAM;
1049        case E820_TYPE_ACPI:            /* Fall-through: */
1050        case E820_TYPE_NVS:             /* Fall-through: */
1051        case E820_TYPE_UNUSABLE:        /* Fall-through: */
1052        case E820_TYPE_PRAM:            /* Fall-through: */
1053        case E820_TYPE_PMEM:            /* Fall-through: */
1054        case E820_TYPE_RESERVED:        /* Fall-through: */
1055        default:                        return IORESOURCE_MEM;
1056        }
1057}
1058
1059static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
1060{
1061        switch (entry->type) {
1062        case E820_TYPE_ACPI:            return IORES_DESC_ACPI_TABLES;
1063        case E820_TYPE_NVS:             return IORES_DESC_ACPI_NV_STORAGE;
1064        case E820_TYPE_PMEM:            return IORES_DESC_PERSISTENT_MEMORY;
1065        case E820_TYPE_PRAM:            return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
1066        case E820_TYPE_RESERVED:        return IORES_DESC_RESERVED;
1067        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1068        case E820_TYPE_RAM:             /* Fall-through: */
1069        case E820_TYPE_UNUSABLE:        /* Fall-through: */
1070        default:                        return IORES_DESC_NONE;
1071        }
1072}
1073
1074static bool __init do_mark_busy(enum e820_type type, struct resource *res)
1075{
1076        /* this is the legacy bios/dos rom-shadow + mmio region */
1077        if (res->start < (1ULL<<20))
1078                return true;
1079
1080        /*
1081         * Treat persistent memory like device memory, i.e. reserve it
1082         * for exclusive use of a driver
1083         */
1084        switch (type) {
1085        case E820_TYPE_RESERVED:
1086        case E820_TYPE_PRAM:
1087        case E820_TYPE_PMEM:
1088                return false;
1089        case E820_TYPE_RESERVED_KERN:
1090        case E820_TYPE_RAM:
1091        case E820_TYPE_ACPI:
1092        case E820_TYPE_NVS:
1093        case E820_TYPE_UNUSABLE:
1094        default:
1095                return true;
1096        }
1097}
1098
1099/*
1100 * Mark E820 reserved areas as busy for the resource manager:
1101 */
1102
1103static struct resource __initdata *e820_res;
1104
1105void __init e820__reserve_resources(void)
1106{
1107        int i;
1108        struct resource *res;
1109        u64 end;
1110
1111        res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
1112                             SMP_CACHE_BYTES);
1113        if (!res)
1114                panic("%s: Failed to allocate %zu bytes\n", __func__,
1115                      sizeof(*res) * e820_table->nr_entries);
1116        e820_res = res;
1117
1118        for (i = 0; i < e820_table->nr_entries; i++) {
1119                struct e820_entry *entry = e820_table->entries + i;
1120
1121                end = entry->addr + entry->size - 1;
1122                if (end != (resource_size_t)end) {
1123                        res++;
1124                        continue;
1125                }
1126                res->start = entry->addr;
1127                res->end   = end;
1128                res->name  = e820_type_to_string(entry);
1129                res->flags = e820_type_to_iomem_type(entry);
1130                res->desc  = e820_type_to_iores_desc(entry);
1131
1132                /*
1133                 * Don't register the region that could be conflicted with
1134                 * PCI device BAR resources and insert them later in
1135                 * pcibios_resource_survey():
1136                 */
1137                if (do_mark_busy(entry->type, res)) {
1138                        res->flags |= IORESOURCE_BUSY;
1139                        insert_resource(&iomem_resource, res);
1140                }
1141                res++;
1142        }
1143
1144        /* Expose the bootloader-provided memory layout to the sysfs. */
1145        for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1146                struct e820_entry *entry = e820_table_firmware->entries + i;
1147
1148                firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1149        }
1150}
1151
1152/*
1153 * How much should we pad the end of RAM, depending on where it is?
1154 */
1155static unsigned long __init ram_alignment(resource_size_t pos)
1156{
1157        unsigned long mb = pos >> 20;
1158
1159        /* To 64kB in the first megabyte */
1160        if (!mb)
1161                return 64*1024;
1162
1163        /* To 1MB in the first 16MB */
1164        if (mb < 16)
1165                return 1024*1024;
1166
1167        /* To 64MB for anything above that */
1168        return 64*1024*1024;
1169}
1170
1171#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1172
1173void __init e820__reserve_resources_late(void)
1174{
1175        int i;
1176        struct resource *res;
1177
1178        res = e820_res;
1179        for (i = 0; i < e820_table->nr_entries; i++) {
1180                if (!res->parent && res->end)
1181                        insert_resource_expand_to_fit(&iomem_resource, res);
1182                res++;
1183        }
1184
1185        /*
1186         * Try to bump up RAM regions to reasonable boundaries, to
1187         * avoid stolen RAM:
1188         */
1189        for (i = 0; i < e820_table->nr_entries; i++) {
1190                struct e820_entry *entry = &e820_table->entries[i];
1191                u64 start, end;
1192
1193                if (entry->type != E820_TYPE_RAM)
1194                        continue;
1195
1196                start = entry->addr + entry->size;
1197                end = round_up(start, ram_alignment(start)) - 1;
1198                if (end > MAX_RESOURCE_SIZE)
1199                        end = MAX_RESOURCE_SIZE;
1200                if (start >= end)
1201                        continue;
1202
1203                printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1204                reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1205        }
1206}
1207
1208/*
1209 * Pass the firmware (bootloader) E820 map to the kernel and process it:
1210 */
1211char *__init e820__memory_setup_default(void)
1212{
1213        char *who = "BIOS-e820";
1214
1215        /*
1216         * Try to copy the BIOS-supplied E820-map.
1217         *
1218         * Otherwise fake a memory map; one section from 0k->640k,
1219         * the next section from 1mb->appropriate_mem_k
1220         */
1221        if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1222                u64 mem_size;
1223
1224                /* Compare results from other methods and take the one that gives more RAM: */
1225                if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1226                        mem_size = boot_params.screen_info.ext_mem_k;
1227                        who = "BIOS-88";
1228                } else {
1229                        mem_size = boot_params.alt_mem_k;
1230                        who = "BIOS-e801";
1231                }
1232
1233                e820_table->nr_entries = 0;
1234                e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
1235                e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1236        }
1237
1238        /* We just appended a lot of ranges, sanitize the table: */
1239        e820__update_table(e820_table);
1240
1241        return who;
1242}
1243
1244/*
1245 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
1246 * E820 map - with an optional platform quirk available for virtual platforms
1247 * to override this method of boot environment processing:
1248 */
1249void __init e820__memory_setup(void)
1250{
1251        char *who;
1252
1253        /* This is a firmware interface ABI - make sure we don't break it: */
1254        BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
1255
1256        who = x86_init.resources.memory_setup();
1257
1258        memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
1259        memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1260
1261        pr_info("BIOS-provided physical RAM map:\n");
1262        e820__print_table(who);
1263}
1264
1265void __init e820__memblock_setup(void)
1266{
1267        int i;
1268        u64 end;
1269
1270        /*
1271         * The bootstrap memblock region count maximum is 128 entries
1272         * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
1273         * than that - so allow memblock resizing.
1274         *
1275         * This is safe, because this call happens pretty late during x86 setup,
1276         * so we know about reserved memory regions already. (This is important
1277         * so that memblock resizing does no stomp over reserved areas.)
1278         */
1279        memblock_allow_resize();
1280
1281        for (i = 0; i < e820_table->nr_entries; i++) {
1282                struct e820_entry *entry = &e820_table->entries[i];
1283
1284                end = entry->addr + entry->size;
1285                if (end != (resource_size_t)end)
1286                        continue;
1287
1288                if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1289                        continue;
1290
1291                memblock_add(entry->addr, entry->size);
1292        }
1293
1294        /* Throw away partial pages: */
1295        memblock_trim_memory(PAGE_SIZE);
1296
1297        memblock_dump_all();
1298}
1299