linux/arch/x86/kernel/e820.c
<<
>>
Prefs
   1/*
   2 * Low level x86 E820 memory map handling functions.
   3 *
   4 * The firmware and bootloader pass us the "E820 table", which is the primary
   5 * physical memory layout description available about x86 systems.
   6 *
   7 * The kernel takes the E820 memory layout and optionally modifies it with
   8 * quirks and other tweaks, and feeds that into the generic Linux memory
   9 * allocation code routines via a platform independent interface (memblock, etc.).
  10 */
  11#include <linux/crash_dump.h>
  12#include <linux/memblock.h>
  13#include <linux/suspend.h>
  14#include <linux/acpi.h>
  15#include <linux/firmware-map.h>
  16#include <linux/sort.h>
  17#include <linux/memory_hotplug.h>
  18
  19#include <asm/e820/api.h>
  20#include <asm/setup.h>
  21
  22/*
  23 * We organize the E820 table into three main data structures:
  24 *
  25 * - 'e820_table_firmware': the original firmware version passed to us by the
  26 *   bootloader - not modified by the kernel. It is composed of two parts:
  27 *   the first 128 E820 memory entries in boot_params.e820_table and the remaining
  28 *   (if any) entries of the SETUP_E820_EXT nodes. We use this to:
  29 *
  30 *       - inform the user about the firmware's notion of memory layout
  31 *         via /sys/firmware/memmap
  32 *
  33 *       - the hibernation code uses it to generate a kernel-independent MD5
  34 *         fingerprint of the physical memory layout of a system.
  35 *
  36 * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
  37 *   passed to us by the bootloader - the major difference between
  38 *   e820_table_firmware[] and this one is that, the latter marks the setup_data
  39 *   list created by the EFI boot stub as reserved, so that kexec can reuse the
  40 *   setup_data information in the second kernel. Besides, e820_table_kexec[]
  41 *   might also be modified by the kexec itself to fake a mptable.
  42 *   We use this to:
  43 *
  44 *       - kexec, which is a bootloader in disguise, uses the original E820
  45 *         layout to pass to the kexec-ed kernel. This way the original kernel
  46 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
  47 *         can have access to full memory - etc.
  48 *
  49 * - 'e820_table': this is the main E820 table that is massaged by the
  50 *   low level x86 platform code, or modified by boot parameters, before
  51 *   passed on to higher level MM layers.
  52 *
  53 * Once the E820 map has been converted to the standard Linux memory layout
  54 * information its role stops - modifying it has no effect and does not get
  55 * re-propagated. So its main role is a temporary bootstrap storage of firmware
  56 * specific memory layout data during early bootup.
  57 */
  58static struct e820_table e820_table_init                __initdata;
  59static struct e820_table e820_table_kexec_init          __initdata;
  60static struct e820_table e820_table_firmware_init       __initdata;
  61
  62struct e820_table *e820_table __refdata                 = &e820_table_init;
  63struct e820_table *e820_table_kexec __refdata           = &e820_table_kexec_init;
  64struct e820_table *e820_table_firmware __refdata        = &e820_table_firmware_init;
  65
  66/* For PCI or other memory-mapped resources */
  67unsigned long pci_mem_start = 0xaeedbabe;
  68#ifdef CONFIG_PCI
  69EXPORT_SYMBOL(pci_mem_start);
  70#endif
  71
  72/*
  73 * This function checks if any part of the range <start,end> is mapped
  74 * with type.
  75 */
  76static bool _e820__mapped_any(struct e820_table *table,
  77                              u64 start, u64 end, enum e820_type type)
  78{
  79        int i;
  80
  81        for (i = 0; i < table->nr_entries; i++) {
  82                struct e820_entry *entry = &table->entries[i];
  83
  84                if (type && entry->type != type)
  85                        continue;
  86                if (entry->addr >= end || entry->addr + entry->size <= start)
  87                        continue;
  88                return 1;
  89        }
  90        return 0;
  91}
  92
  93bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
  94{
  95        return _e820__mapped_any(e820_table_firmware, start, end, type);
  96}
  97EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
  98
  99bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
 100{
 101        return _e820__mapped_any(e820_table, start, end, type);
 102}
 103EXPORT_SYMBOL_GPL(e820__mapped_any);
 104
 105/*
 106 * This function checks if the entire <start,end> range is mapped with 'type'.
 107 *
 108 * Note: this function only works correctly once the E820 table is sorted and
 109 * not-overlapping (at least for the range specified), which is the case normally.
 110 */
 111static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
 112                                             enum e820_type type)
 113{
 114        int i;
 115
 116        for (i = 0; i < e820_table->nr_entries; i++) {
 117                struct e820_entry *entry = &e820_table->entries[i];
 118
 119                if (type && entry->type != type)
 120                        continue;
 121
 122                /* Is the region (part) in overlap with the current region? */
 123                if (entry->addr >= end || entry->addr + entry->size <= start)
 124                        continue;
 125
 126                /*
 127                 * If the region is at the beginning of <start,end> we move
 128                 * 'start' to the end of the region since it's ok until there
 129                 */
 130                if (entry->addr <= start)
 131                        start = entry->addr + entry->size;
 132
 133                /*
 134                 * If 'start' is now at or beyond 'end', we're done, full
 135                 * coverage of the desired range exists:
 136                 */
 137                if (start >= end)
 138                        return entry;
 139        }
 140
 141        return NULL;
 142}
 143
 144/*
 145 * This function checks if the entire range <start,end> is mapped with type.
 146 */
 147bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
 148{
 149        return __e820__mapped_all(start, end, type);
 150}
 151
 152/*
 153 * This function returns the type associated with the range <start,end>.
 154 */
 155int e820__get_entry_type(u64 start, u64 end)
 156{
 157        struct e820_entry *entry = __e820__mapped_all(start, end, 0);
 158
 159        return entry ? entry->type : -EINVAL;
 160}
 161
 162/*
 163 * Add a memory region to the kernel E820 map.
 164 */
 165static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
 166{
 167        int x = table->nr_entries;
 168
 169        if (x >= ARRAY_SIZE(table->entries)) {
 170                pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
 171                       start, start + size - 1);
 172                return;
 173        }
 174
 175        table->entries[x].addr = start;
 176        table->entries[x].size = size;
 177        table->entries[x].type = type;
 178        table->nr_entries++;
 179}
 180
 181void __init e820__range_add(u64 start, u64 size, enum e820_type type)
 182{
 183        __e820__range_add(e820_table, start, size, type);
 184}
 185
 186static void __init e820_print_type(enum e820_type type)
 187{
 188        switch (type) {
 189        case E820_TYPE_RAM:             /* Fall through: */
 190        case E820_TYPE_RESERVED_KERN:   pr_cont("usable");                      break;
 191        case E820_TYPE_RESERVED:        pr_cont("reserved");                    break;
 192        case E820_TYPE_SOFT_RESERVED:   pr_cont("soft reserved");               break;
 193        case E820_TYPE_ACPI:            pr_cont("ACPI data");                   break;
 194        case E820_TYPE_NVS:             pr_cont("ACPI NVS");                    break;
 195        case E820_TYPE_UNUSABLE:        pr_cont("unusable");                    break;
 196        case E820_TYPE_PMEM:            /* Fall through: */
 197        case E820_TYPE_PRAM:            pr_cont("persistent (type %u)", type);  break;
 198        default:                        pr_cont("type %u", type);               break;
 199        }
 200}
 201
 202void __init e820__print_table(char *who)
 203{
 204        int i;
 205
 206        for (i = 0; i < e820_table->nr_entries; i++) {
 207                pr_info("%s: [mem %#018Lx-%#018Lx] ",
 208                        who,
 209                        e820_table->entries[i].addr,
 210                        e820_table->entries[i].addr + e820_table->entries[i].size - 1);
 211
 212                e820_print_type(e820_table->entries[i].type);
 213                pr_cont("\n");
 214        }
 215}
 216
 217/*
 218 * Sanitize an E820 map.
 219 *
 220 * Some E820 layouts include overlapping entries. The following
 221 * replaces the original E820 map with a new one, removing overlaps,
 222 * and resolving conflicting memory types in favor of highest
 223 * numbered type.
 224 *
 225 * The input parameter 'entries' points to an array of 'struct
 226 * e820_entry' which on entry has elements in the range [0, *nr_entries)
 227 * valid, and which has space for up to max_nr_entries entries.
 228 * On return, the resulting sanitized E820 map entries will be
 229 * overwritten in the same location, starting at 'entries'.
 230 *
 231 * The integer pointed to by nr_entries must be valid on entry (the
 232 * current number of valid entries located at 'entries'). If the
 233 * sanitizing succeeds the *nr_entries will be updated with the new
 234 * number of valid entries (something no more than max_nr_entries).
 235 *
 236 * The return value from e820__update_table() is zero if it
 237 * successfully 'sanitized' the map entries passed in, and is -1
 238 * if it did nothing, which can happen if either of (1) it was
 239 * only passed one map entry, or (2) any of the input map entries
 240 * were invalid (start + size < start, meaning that the size was
 241 * so big the described memory range wrapped around through zero.)
 242 *
 243 *      Visually we're performing the following
 244 *      (1,2,3,4 = memory types)...
 245 *
 246 *      Sample memory map (w/overlaps):
 247 *         ____22__________________
 248 *         ______________________4_
 249 *         ____1111________________
 250 *         _44_____________________
 251 *         11111111________________
 252 *         ____________________33__
 253 *         ___________44___________
 254 *         __________33333_________
 255 *         ______________22________
 256 *         ___________________2222_
 257 *         _________111111111______
 258 *         _____________________11_
 259 *         _________________4______
 260 *
 261 *      Sanitized equivalent (no overlap):
 262 *         1_______________________
 263 *         _44_____________________
 264 *         ___1____________________
 265 *         ____22__________________
 266 *         ______11________________
 267 *         _________1______________
 268 *         __________3_____________
 269 *         ___________44___________
 270 *         _____________33_________
 271 *         _______________2________
 272 *         ________________1_______
 273 *         _________________4______
 274 *         ___________________2____
 275 *         ____________________33__
 276 *         ______________________4_
 277 */
 278struct change_member {
 279        /* Pointer to the original entry: */
 280        struct e820_entry       *entry;
 281        /* Address for this change point: */
 282        unsigned long long      addr;
 283};
 284
 285static struct change_member     change_point_list[2*E820_MAX_ENTRIES]   __initdata;
 286static struct change_member     *change_point[2*E820_MAX_ENTRIES]       __initdata;
 287static struct e820_entry        *overlap_list[E820_MAX_ENTRIES]         __initdata;
 288static struct e820_entry        new_entries[E820_MAX_ENTRIES]           __initdata;
 289
 290static int __init cpcompare(const void *a, const void *b)
 291{
 292        struct change_member * const *app = a, * const *bpp = b;
 293        const struct change_member *ap = *app, *bp = *bpp;
 294
 295        /*
 296         * Inputs are pointers to two elements of change_point[].  If their
 297         * addresses are not equal, their difference dominates.  If the addresses
 298         * are equal, then consider one that represents the end of its region
 299         * to be greater than one that does not.
 300         */
 301        if (ap->addr != bp->addr)
 302                return ap->addr > bp->addr ? 1 : -1;
 303
 304        return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
 305}
 306
 307int __init e820__update_table(struct e820_table *table)
 308{
 309        struct e820_entry *entries = table->entries;
 310        u32 max_nr_entries = ARRAY_SIZE(table->entries);
 311        enum e820_type current_type, last_type;
 312        unsigned long long last_addr;
 313        u32 new_nr_entries, overlap_entries;
 314        u32 i, chg_idx, chg_nr;
 315
 316        /* If there's only one memory region, don't bother: */
 317        if (table->nr_entries < 2)
 318                return -1;
 319
 320        BUG_ON(table->nr_entries > max_nr_entries);
 321
 322        /* Bail out if we find any unreasonable addresses in the map: */
 323        for (i = 0; i < table->nr_entries; i++) {
 324                if (entries[i].addr + entries[i].size < entries[i].addr)
 325                        return -1;
 326        }
 327
 328        /* Create pointers for initial change-point information (for sorting): */
 329        for (i = 0; i < 2 * table->nr_entries; i++)
 330                change_point[i] = &change_point_list[i];
 331
 332        /*
 333         * Record all known change-points (starting and ending addresses),
 334         * omitting empty memory regions:
 335         */
 336        chg_idx = 0;
 337        for (i = 0; i < table->nr_entries; i++) {
 338                if (entries[i].size != 0) {
 339                        change_point[chg_idx]->addr     = entries[i].addr;
 340                        change_point[chg_idx++]->entry  = &entries[i];
 341                        change_point[chg_idx]->addr     = entries[i].addr + entries[i].size;
 342                        change_point[chg_idx++]->entry  = &entries[i];
 343                }
 344        }
 345        chg_nr = chg_idx;
 346
 347        /* Sort change-point list by memory addresses (low -> high): */
 348        sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
 349
 350        /* Create a new memory map, removing overlaps: */
 351        overlap_entries = 0;     /* Number of entries in the overlap table */
 352        new_nr_entries = 0;      /* Index for creating new map entries */
 353        last_type = 0;           /* Start with undefined memory type */
 354        last_addr = 0;           /* Start with 0 as last starting address */
 355
 356        /* Loop through change-points, determining effect on the new map: */
 357        for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
 358                /* Keep track of all overlapping entries */
 359                if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
 360                        /* Add map entry to overlap list (> 1 entry implies an overlap) */
 361                        overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
 362                } else {
 363                        /* Remove entry from list (order independent, so swap with last): */
 364                        for (i = 0; i < overlap_entries; i++) {
 365                                if (overlap_list[i] == change_point[chg_idx]->entry)
 366                                        overlap_list[i] = overlap_list[overlap_entries-1];
 367                        }
 368                        overlap_entries--;
 369                }
 370                /*
 371                 * If there are overlapping entries, decide which
 372                 * "type" to use (larger value takes precedence --
 373                 * 1=usable, 2,3,4,4+=unusable)
 374                 */
 375                current_type = 0;
 376                for (i = 0; i < overlap_entries; i++) {
 377                        if (overlap_list[i]->type > current_type)
 378                                current_type = overlap_list[i]->type;
 379                }
 380
 381                /* Continue building up new map based on this information: */
 382                if (current_type != last_type || current_type == E820_TYPE_PRAM) {
 383                        if (last_type != 0)      {
 384                                new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
 385                                /* Move forward only if the new size was non-zero: */
 386                                if (new_entries[new_nr_entries].size != 0)
 387                                        /* No more space left for new entries? */
 388                                        if (++new_nr_entries >= max_nr_entries)
 389                                                break;
 390                        }
 391                        if (current_type != 0)  {
 392                                new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
 393                                new_entries[new_nr_entries].type = current_type;
 394                                last_addr = change_point[chg_idx]->addr;
 395                        }
 396                        last_type = current_type;
 397                }
 398        }
 399
 400        /* Copy the new entries into the original location: */
 401        memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
 402        table->nr_entries = new_nr_entries;
 403
 404        return 0;
 405}
 406
 407static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 408{
 409        struct boot_e820_entry *entry = entries;
 410
 411        while (nr_entries) {
 412                u64 start = entry->addr;
 413                u64 size = entry->size;
 414                u64 end = start + size - 1;
 415                u32 type = entry->type;
 416
 417                /* Ignore the entry on 64-bit overflow: */
 418                if (start > end && likely(size))
 419                        return -1;
 420
 421                e820__range_add(start, size, type);
 422
 423                entry++;
 424                nr_entries--;
 425        }
 426        return 0;
 427}
 428
 429/*
 430 * Copy the BIOS E820 map into a safe place.
 431 *
 432 * Sanity-check it while we're at it..
 433 *
 434 * If we're lucky and live on a modern system, the setup code
 435 * will have given us a memory map that we can use to properly
 436 * set up memory.  If we aren't, we'll fake a memory map.
 437 */
 438static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 439{
 440        /* Only one memory region (or negative)? Ignore it */
 441        if (nr_entries < 2)
 442                return -1;
 443
 444        return __append_e820_table(entries, nr_entries);
 445}
 446
 447static u64 __init
 448__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 449{
 450        u64 end;
 451        unsigned int i;
 452        u64 real_updated_size = 0;
 453
 454        BUG_ON(old_type == new_type);
 455
 456        if (size > (ULLONG_MAX - start))
 457                size = ULLONG_MAX - start;
 458
 459        end = start + size;
 460        printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
 461        e820_print_type(old_type);
 462        pr_cont(" ==> ");
 463        e820_print_type(new_type);
 464        pr_cont("\n");
 465
 466        for (i = 0; i < table->nr_entries; i++) {
 467                struct e820_entry *entry = &table->entries[i];
 468                u64 final_start, final_end;
 469                u64 entry_end;
 470
 471                if (entry->type != old_type)
 472                        continue;
 473
 474                entry_end = entry->addr + entry->size;
 475
 476                /* Completely covered by new range? */
 477                if (entry->addr >= start && entry_end <= end) {
 478                        entry->type = new_type;
 479                        real_updated_size += entry->size;
 480                        continue;
 481                }
 482
 483                /* New range is completely covered? */
 484                if (entry->addr < start && entry_end > end) {
 485                        __e820__range_add(table, start, size, new_type);
 486                        __e820__range_add(table, end, entry_end - end, entry->type);
 487                        entry->size = start - entry->addr;
 488                        real_updated_size += size;
 489                        continue;
 490                }
 491
 492                /* Partially covered: */
 493                final_start = max(start, entry->addr);
 494                final_end = min(end, entry_end);
 495                if (final_start >= final_end)
 496                        continue;
 497
 498                __e820__range_add(table, final_start, final_end - final_start, new_type);
 499
 500                real_updated_size += final_end - final_start;
 501
 502                /*
 503                 * Left range could be head or tail, so need to update
 504                 * its size first:
 505                 */
 506                entry->size -= final_end - final_start;
 507                if (entry->addr < final_start)
 508                        continue;
 509
 510                entry->addr = final_end;
 511        }
 512        return real_updated_size;
 513}
 514
 515u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 516{
 517        return __e820__range_update(e820_table, start, size, old_type, new_type);
 518}
 519
 520static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
 521{
 522        return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
 523}
 524
 525/* Remove a range of memory from the E820 table: */
 526u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
 527{
 528        int i;
 529        u64 end;
 530        u64 real_removed_size = 0;
 531
 532        if (size > (ULLONG_MAX - start))
 533                size = ULLONG_MAX - start;
 534
 535        end = start + size;
 536        printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
 537        if (check_type)
 538                e820_print_type(old_type);
 539        pr_cont("\n");
 540
 541        for (i = 0; i < e820_table->nr_entries; i++) {
 542                struct e820_entry *entry = &e820_table->entries[i];
 543                u64 final_start, final_end;
 544                u64 entry_end;
 545
 546                if (check_type && entry->type != old_type)
 547                        continue;
 548
 549                entry_end = entry->addr + entry->size;
 550
 551                /* Completely covered? */
 552                if (entry->addr >= start && entry_end <= end) {
 553                        real_removed_size += entry->size;
 554                        memset(entry, 0, sizeof(*entry));
 555                        continue;
 556                }
 557
 558                /* Is the new range completely covered? */
 559                if (entry->addr < start && entry_end > end) {
 560                        e820__range_add(end, entry_end - end, entry->type);
 561                        entry->size = start - entry->addr;
 562                        real_removed_size += size;
 563                        continue;
 564                }
 565
 566                /* Partially covered: */
 567                final_start = max(start, entry->addr);
 568                final_end = min(end, entry_end);
 569                if (final_start >= final_end)
 570                        continue;
 571
 572                real_removed_size += final_end - final_start;
 573
 574                /*
 575                 * Left range could be head or tail, so need to update
 576                 * the size first:
 577                 */
 578                entry->size -= final_end - final_start;
 579                if (entry->addr < final_start)
 580                        continue;
 581
 582                entry->addr = final_end;
 583        }
 584        return real_removed_size;
 585}
 586
 587void __init e820__update_table_print(void)
 588{
 589        if (e820__update_table(e820_table))
 590                return;
 591
 592        pr_info("modified physical RAM map:\n");
 593        e820__print_table("modified");
 594}
 595
 596static void __init e820__update_table_kexec(void)
 597{
 598        e820__update_table(e820_table_kexec);
 599}
 600
 601#define MAX_GAP_END 0x100000000ull
 602
 603/*
 604 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
 605 */
 606static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
 607{
 608        unsigned long long last = MAX_GAP_END;
 609        int i = e820_table->nr_entries;
 610        int found = 0;
 611
 612        while (--i >= 0) {
 613                unsigned long long start = e820_table->entries[i].addr;
 614                unsigned long long end = start + e820_table->entries[i].size;
 615
 616                /*
 617                 * Since "last" is at most 4GB, we know we'll
 618                 * fit in 32 bits if this condition is true:
 619                 */
 620                if (last > end) {
 621                        unsigned long gap = last - end;
 622
 623                        if (gap >= *gapsize) {
 624                                *gapsize = gap;
 625                                *gapstart = end;
 626                                found = 1;
 627                        }
 628                }
 629                if (start < last)
 630                        last = start;
 631        }
 632        return found;
 633}
 634
 635/*
 636 * Search for the biggest gap in the low 32 bits of the E820
 637 * memory space. We pass this space to the PCI subsystem, so
 638 * that it can assign MMIO resources for hotplug or
 639 * unconfigured devices in.
 640 *
 641 * Hopefully the BIOS let enough space left.
 642 */
 643__init void e820__setup_pci_gap(void)
 644{
 645        unsigned long gapstart, gapsize;
 646        int found;
 647
 648        gapsize = 0x400000;
 649        found  = e820_search_gap(&gapstart, &gapsize);
 650
 651        if (!found) {
 652#ifdef CONFIG_X86_64
 653                gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 654                pr_err("Cannot find an available gap in the 32-bit address range\n");
 655                pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
 656#else
 657                gapstart = 0x10000000;
 658#endif
 659        }
 660
 661        /*
 662         * e820__reserve_resources_late() protects stolen RAM already:
 663         */
 664        pci_mem_start = gapstart;
 665
 666        pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
 667                gapstart, gapstart + gapsize - 1);
 668}
 669
 670/*
 671 * Called late during init, in free_initmem().
 672 *
 673 * Initial e820_table and e820_table_kexec are largish __initdata arrays.
 674 *
 675 * Copy them to a (usually much smaller) dynamically allocated area that is
 676 * sized precisely after the number of e820 entries.
 677 *
 678 * This is done after we've performed all the fixes and tweaks to the tables.
 679 * All functions which modify them are __init functions, which won't exist
 680 * after free_initmem().
 681 */
 682__init void e820__reallocate_tables(void)
 683{
 684        struct e820_table *n;
 685        int size;
 686
 687        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
 688        n = kmalloc(size, GFP_KERNEL);
 689        BUG_ON(!n);
 690        memcpy(n, e820_table, size);
 691        e820_table = n;
 692
 693        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
 694        n = kmalloc(size, GFP_KERNEL);
 695        BUG_ON(!n);
 696        memcpy(n, e820_table_kexec, size);
 697        e820_table_kexec = n;
 698
 699        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
 700        n = kmalloc(size, GFP_KERNEL);
 701        BUG_ON(!n);
 702        memcpy(n, e820_table_firmware, size);
 703        e820_table_firmware = n;
 704}
 705
 706/*
 707 * Because of the small fixed size of struct boot_params, only the first
 708 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 709 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 710 * struct setup_data, which is parsed here.
 711 */
 712void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
 713{
 714        int entries;
 715        struct boot_e820_entry *extmap;
 716        struct setup_data *sdata;
 717
 718        sdata = early_memremap(phys_addr, data_len);
 719        entries = sdata->len / sizeof(*extmap);
 720        extmap = (struct boot_e820_entry *)(sdata->data);
 721
 722        __append_e820_table(extmap, entries);
 723        e820__update_table(e820_table);
 724
 725        memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
 726        memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 727
 728        early_memunmap(sdata, data_len);
 729        pr_info("extended physical RAM map:\n");
 730        e820__print_table("extended");
 731}
 732
 733/*
 734 * Find the ranges of physical addresses that do not correspond to
 735 * E820 RAM areas and register the corresponding pages as 'nosave' for
 736 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
 737 *
 738 * This function requires the E820 map to be sorted and without any
 739 * overlapping entries.
 740 */
 741void __init e820__register_nosave_regions(unsigned long limit_pfn)
 742{
 743        int i;
 744        unsigned long pfn = 0;
 745
 746        for (i = 0; i < e820_table->nr_entries; i++) {
 747                struct e820_entry *entry = &e820_table->entries[i];
 748
 749                if (pfn < PFN_UP(entry->addr))
 750                        register_nosave_region(pfn, PFN_UP(entry->addr));
 751
 752                pfn = PFN_DOWN(entry->addr + entry->size);
 753
 754                if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
 755                        register_nosave_region(PFN_UP(entry->addr), pfn);
 756
 757                if (pfn >= limit_pfn)
 758                        break;
 759        }
 760}
 761
 762#ifdef CONFIG_ACPI
 763/*
 764 * Register ACPI NVS memory regions, so that we can save/restore them during
 765 * hibernation and the subsequent resume:
 766 */
 767static int __init e820__register_nvs_regions(void)
 768{
 769        int i;
 770
 771        for (i = 0; i < e820_table->nr_entries; i++) {
 772                struct e820_entry *entry = &e820_table->entries[i];
 773
 774                if (entry->type == E820_TYPE_NVS)
 775                        acpi_nvs_register(entry->addr, entry->size);
 776        }
 777
 778        return 0;
 779}
 780core_initcall(e820__register_nvs_regions);
 781#endif
 782
 783/*
 784 * Allocate the requested number of bytes with the requsted alignment
 785 * and return (the physical address) to the caller. Also register this
 786 * range in the 'kexec' E820 table as a reserved range.
 787 *
 788 * This allows kexec to fake a new mptable, as if it came from the real
 789 * system.
 790 */
 791u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 792{
 793        u64 addr;
 794
 795        addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 796        if (addr) {
 797                e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
 798                pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
 799                e820__update_table_kexec();
 800        }
 801
 802        return addr;
 803}
 804
 805#ifdef CONFIG_X86_32
 806# ifdef CONFIG_X86_PAE
 807#  define MAX_ARCH_PFN          (1ULL<<(36-PAGE_SHIFT))
 808# else
 809#  define MAX_ARCH_PFN          (1ULL<<(32-PAGE_SHIFT))
 810# endif
 811#else /* CONFIG_X86_32 */
 812# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
 813#endif
 814
 815/*
 816 * Find the highest page frame number we have available
 817 */
 818static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
 819{
 820        int i;
 821        unsigned long last_pfn = 0;
 822        unsigned long max_arch_pfn = MAX_ARCH_PFN;
 823
 824        for (i = 0; i < e820_table->nr_entries; i++) {
 825                struct e820_entry *entry = &e820_table->entries[i];
 826                unsigned long start_pfn;
 827                unsigned long end_pfn;
 828
 829                if (entry->type != type)
 830                        continue;
 831
 832                start_pfn = entry->addr >> PAGE_SHIFT;
 833                end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
 834
 835                if (start_pfn >= limit_pfn)
 836                        continue;
 837                if (end_pfn > limit_pfn) {
 838                        last_pfn = limit_pfn;
 839                        break;
 840                }
 841                if (end_pfn > last_pfn)
 842                        last_pfn = end_pfn;
 843        }
 844
 845        if (last_pfn > max_arch_pfn)
 846                last_pfn = max_arch_pfn;
 847
 848        pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
 849                last_pfn, max_arch_pfn);
 850        return last_pfn;
 851}
 852
 853unsigned long __init e820__end_of_ram_pfn(void)
 854{
 855        return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
 856}
 857
 858unsigned long __init e820__end_of_low_ram_pfn(void)
 859{
 860        return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
 861}
 862
 863static void __init early_panic(char *msg)
 864{
 865        early_printk(msg);
 866        panic(msg);
 867}
 868
 869static int userdef __initdata;
 870
 871/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
 872static int __init parse_memopt(char *p)
 873{
 874        u64 mem_size;
 875
 876        if (!p)
 877                return -EINVAL;
 878
 879        if (!strcmp(p, "nopentium")) {
 880#ifdef CONFIG_X86_32
 881                setup_clear_cpu_cap(X86_FEATURE_PSE);
 882                return 0;
 883#else
 884                pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
 885                return -EINVAL;
 886#endif
 887        }
 888
 889        userdef = 1;
 890        mem_size = memparse(p, &p);
 891
 892        /* Don't remove all memory when getting "mem={invalid}" parameter: */
 893        if (mem_size == 0)
 894                return -EINVAL;
 895
 896        e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 897
 898#ifdef CONFIG_MEMORY_HOTPLUG
 899        max_mem_size = mem_size;
 900#endif
 901
 902        return 0;
 903}
 904early_param("mem", parse_memopt);
 905
 906static int __init parse_memmap_one(char *p)
 907{
 908        char *oldp;
 909        u64 start_at, mem_size;
 910
 911        if (!p)
 912                return -EINVAL;
 913
 914        if (!strncmp(p, "exactmap", 8)) {
 915#ifdef CONFIG_CRASH_DUMP
 916                /*
 917                 * If we are doing a crash dump, we still need to know
 918                 * the real memory size before the original memory map is
 919                 * reset.
 920                 */
 921                saved_max_pfn = e820__end_of_ram_pfn();
 922#endif
 923                e820_table->nr_entries = 0;
 924                userdef = 1;
 925                return 0;
 926        }
 927
 928        oldp = p;
 929        mem_size = memparse(p, &p);
 930        if (p == oldp)
 931                return -EINVAL;
 932
 933        userdef = 1;
 934        if (*p == '@') {
 935                start_at = memparse(p+1, &p);
 936                e820__range_add(start_at, mem_size, E820_TYPE_RAM);
 937        } else if (*p == '#') {
 938                start_at = memparse(p+1, &p);
 939                e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
 940        } else if (*p == '$') {
 941                start_at = memparse(p+1, &p);
 942                e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
 943        } else if (*p == '!') {
 944                start_at = memparse(p+1, &p);
 945                e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
 946        } else if (*p == '%') {
 947                enum e820_type from = 0, to = 0;
 948
 949                start_at = memparse(p + 1, &p);
 950                if (*p == '-')
 951                        from = simple_strtoull(p + 1, &p, 0);
 952                if (*p == '+')
 953                        to = simple_strtoull(p + 1, &p, 0);
 954                if (*p != '\0')
 955                        return -EINVAL;
 956                if (from && to)
 957                        e820__range_update(start_at, mem_size, from, to);
 958                else if (to)
 959                        e820__range_add(start_at, mem_size, to);
 960                else if (from)
 961                        e820__range_remove(start_at, mem_size, from, 1);
 962                else
 963                        e820__range_remove(start_at, mem_size, 0, 0);
 964        } else {
 965                e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 966        }
 967
 968        return *p == '\0' ? 0 : -EINVAL;
 969}
 970
 971static int __init parse_memmap_opt(char *str)
 972{
 973        while (str) {
 974                char *k = strchr(str, ',');
 975
 976                if (k)
 977                        *k++ = 0;
 978
 979                parse_memmap_one(str);
 980                str = k;
 981        }
 982
 983        return 0;
 984}
 985early_param("memmap", parse_memmap_opt);
 986
 987/*
 988 * Reserve all entries from the bootloader's extensible data nodes list,
 989 * because if present we are going to use it later on to fetch e820
 990 * entries from it:
 991 */
 992void __init e820__reserve_setup_data(void)
 993{
 994        struct setup_data *data;
 995        u64 pa_data;
 996
 997        pa_data = boot_params.hdr.setup_data;
 998        if (!pa_data)
 999                return;
1000
1001        while (pa_data) {
1002                data = early_memremap(pa_data, sizeof(*data));
1003                e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1004                e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
1005                pa_data = data->next;
1006                early_memunmap(data, sizeof(*data));
1007        }
1008
1009        e820__update_table(e820_table);
1010        e820__update_table(e820_table_kexec);
1011
1012        pr_info("extended physical RAM map:\n");
1013        e820__print_table("reserve setup_data");
1014}
1015
1016/*
1017 * Called after parse_early_param(), after early parameters (such as mem=)
1018 * have been processed, in which case we already have an E820 table filled in
1019 * via the parameter callback function(s), but it's not sorted and printed yet:
1020 */
1021void __init e820__finish_early_params(void)
1022{
1023        if (userdef) {
1024                if (e820__update_table(e820_table) < 0)
1025                        early_panic("Invalid user supplied memory map");
1026
1027                pr_info("user-defined physical RAM map:\n");
1028                e820__print_table("user");
1029        }
1030}
1031
1032static const char *__init e820_type_to_string(struct e820_entry *entry)
1033{
1034        switch (entry->type) {
1035        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1036        case E820_TYPE_RAM:             return "System RAM";
1037        case E820_TYPE_ACPI:            return "ACPI Tables";
1038        case E820_TYPE_NVS:             return "ACPI Non-volatile Storage";
1039        case E820_TYPE_UNUSABLE:        return "Unusable memory";
1040        case E820_TYPE_PRAM:            return "Persistent Memory (legacy)";
1041        case E820_TYPE_PMEM:            return "Persistent Memory";
1042        case E820_TYPE_RESERVED:        return "Reserved";
1043        case E820_TYPE_SOFT_RESERVED:   return "Soft Reserved";
1044        default:                        return "Unknown E820 type";
1045        }
1046}
1047
1048static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
1049{
1050        switch (entry->type) {
1051        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1052        case E820_TYPE_RAM:             return IORESOURCE_SYSTEM_RAM;
1053        case E820_TYPE_ACPI:            /* Fall-through: */
1054        case E820_TYPE_NVS:             /* Fall-through: */
1055        case E820_TYPE_UNUSABLE:        /* Fall-through: */
1056        case E820_TYPE_PRAM:            /* Fall-through: */
1057        case E820_TYPE_PMEM:            /* Fall-through: */
1058        case E820_TYPE_RESERVED:        /* Fall-through: */
1059        case E820_TYPE_SOFT_RESERVED:   /* Fall-through: */
1060        default:                        return IORESOURCE_MEM;
1061        }
1062}
1063
1064static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
1065{
1066        switch (entry->type) {
1067        case E820_TYPE_ACPI:            return IORES_DESC_ACPI_TABLES;
1068        case E820_TYPE_NVS:             return IORES_DESC_ACPI_NV_STORAGE;
1069        case E820_TYPE_PMEM:            return IORES_DESC_PERSISTENT_MEMORY;
1070        case E820_TYPE_PRAM:            return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
1071        case E820_TYPE_RESERVED:        return IORES_DESC_RESERVED;
1072        case E820_TYPE_SOFT_RESERVED:   return IORES_DESC_SOFT_RESERVED;
1073        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1074        case E820_TYPE_RAM:             /* Fall-through: */
1075        case E820_TYPE_UNUSABLE:        /* Fall-through: */
1076        default:                        return IORES_DESC_NONE;
1077        }
1078}
1079
1080static bool __init do_mark_busy(enum e820_type type, struct resource *res)
1081{
1082        /* this is the legacy bios/dos rom-shadow + mmio region */
1083        if (res->start < (1ULL<<20))
1084                return true;
1085
1086        /*
1087         * Treat persistent memory and other special memory ranges like
1088         * device memory, i.e. reserve it for exclusive use of a driver
1089         */
1090        switch (type) {
1091        case E820_TYPE_RESERVED:
1092        case E820_TYPE_SOFT_RESERVED:
1093        case E820_TYPE_PRAM:
1094        case E820_TYPE_PMEM:
1095                return false;
1096        case E820_TYPE_RESERVED_KERN:
1097        case E820_TYPE_RAM:
1098        case E820_TYPE_ACPI:
1099        case E820_TYPE_NVS:
1100        case E820_TYPE_UNUSABLE:
1101        default:
1102                return true;
1103        }
1104}
1105
1106/*
1107 * Mark E820 reserved areas as busy for the resource manager:
1108 */
1109
1110static struct resource __initdata *e820_res;
1111
1112void __init e820__reserve_resources(void)
1113{
1114        int i;
1115        struct resource *res;
1116        u64 end;
1117
1118        res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
1119                             SMP_CACHE_BYTES);
1120        if (!res)
1121                panic("%s: Failed to allocate %zu bytes\n", __func__,
1122                      sizeof(*res) * e820_table->nr_entries);
1123        e820_res = res;
1124
1125        for (i = 0; i < e820_table->nr_entries; i++) {
1126                struct e820_entry *entry = e820_table->entries + i;
1127
1128                end = entry->addr + entry->size - 1;
1129                if (end != (resource_size_t)end) {
1130                        res++;
1131                        continue;
1132                }
1133                res->start = entry->addr;
1134                res->end   = end;
1135                res->name  = e820_type_to_string(entry);
1136                res->flags = e820_type_to_iomem_type(entry);
1137                res->desc  = e820_type_to_iores_desc(entry);
1138
1139                /*
1140                 * Don't register the region that could be conflicted with
1141                 * PCI device BAR resources and insert them later in
1142                 * pcibios_resource_survey():
1143                 */
1144                if (do_mark_busy(entry->type, res)) {
1145                        res->flags |= IORESOURCE_BUSY;
1146                        insert_resource(&iomem_resource, res);
1147                }
1148                res++;
1149        }
1150
1151        /* Expose the bootloader-provided memory layout to the sysfs. */
1152        for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1153                struct e820_entry *entry = e820_table_firmware->entries + i;
1154
1155                firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1156        }
1157}
1158
1159/*
1160 * How much should we pad the end of RAM, depending on where it is?
1161 */
1162static unsigned long __init ram_alignment(resource_size_t pos)
1163{
1164        unsigned long mb = pos >> 20;
1165
1166        /* To 64kB in the first megabyte */
1167        if (!mb)
1168                return 64*1024;
1169
1170        /* To 1MB in the first 16MB */
1171        if (mb < 16)
1172                return 1024*1024;
1173
1174        /* To 64MB for anything above that */
1175        return 64*1024*1024;
1176}
1177
1178#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1179
1180void __init e820__reserve_resources_late(void)
1181{
1182        int i;
1183        struct resource *res;
1184
1185        res = e820_res;
1186        for (i = 0; i < e820_table->nr_entries; i++) {
1187                if (!res->parent && res->end)
1188                        insert_resource_expand_to_fit(&iomem_resource, res);
1189                res++;
1190        }
1191
1192        /*
1193         * Try to bump up RAM regions to reasonable boundaries, to
1194         * avoid stolen RAM:
1195         */
1196        for (i = 0; i < e820_table->nr_entries; i++) {
1197                struct e820_entry *entry = &e820_table->entries[i];
1198                u64 start, end;
1199
1200                if (entry->type != E820_TYPE_RAM)
1201                        continue;
1202
1203                start = entry->addr + entry->size;
1204                end = round_up(start, ram_alignment(start)) - 1;
1205                if (end > MAX_RESOURCE_SIZE)
1206                        end = MAX_RESOURCE_SIZE;
1207                if (start >= end)
1208                        continue;
1209
1210                printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1211                reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1212        }
1213}
1214
1215/*
1216 * Pass the firmware (bootloader) E820 map to the kernel and process it:
1217 */
1218char *__init e820__memory_setup_default(void)
1219{
1220        char *who = "BIOS-e820";
1221
1222        /*
1223         * Try to copy the BIOS-supplied E820-map.
1224         *
1225         * Otherwise fake a memory map; one section from 0k->640k,
1226         * the next section from 1mb->appropriate_mem_k
1227         */
1228        if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1229                u64 mem_size;
1230
1231                /* Compare results from other methods and take the one that gives more RAM: */
1232                if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1233                        mem_size = boot_params.screen_info.ext_mem_k;
1234                        who = "BIOS-88";
1235                } else {
1236                        mem_size = boot_params.alt_mem_k;
1237                        who = "BIOS-e801";
1238                }
1239
1240                e820_table->nr_entries = 0;
1241                e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
1242                e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1243        }
1244
1245        /* We just appended a lot of ranges, sanitize the table: */
1246        e820__update_table(e820_table);
1247
1248        return who;
1249}
1250
1251/*
1252 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
1253 * E820 map - with an optional platform quirk available for virtual platforms
1254 * to override this method of boot environment processing:
1255 */
1256void __init e820__memory_setup(void)
1257{
1258        char *who;
1259
1260        /* This is a firmware interface ABI - make sure we don't break it: */
1261        BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
1262
1263        who = x86_init.resources.memory_setup();
1264
1265        memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
1266        memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1267
1268        pr_info("BIOS-provided physical RAM map:\n");
1269        e820__print_table(who);
1270}
1271
1272void __init e820__memblock_setup(void)
1273{
1274        int i;
1275        u64 end;
1276
1277        /*
1278         * The bootstrap memblock region count maximum is 128 entries
1279         * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
1280         * than that - so allow memblock resizing.
1281         *
1282         * This is safe, because this call happens pretty late during x86 setup,
1283         * so we know about reserved memory regions already. (This is important
1284         * so that memblock resizing does no stomp over reserved areas.)
1285         */
1286        memblock_allow_resize();
1287
1288        for (i = 0; i < e820_table->nr_entries; i++) {
1289                struct e820_entry *entry = &e820_table->entries[i];
1290
1291                end = entry->addr + entry->size;
1292                if (end != (resource_size_t)end)
1293                        continue;
1294
1295                if (entry->type == E820_TYPE_SOFT_RESERVED)
1296                        memblock_reserve(entry->addr, entry->size);
1297
1298                if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1299                        continue;
1300
1301                memblock_add(entry->addr, entry->size);
1302        }
1303
1304        /* Throw away partial pages: */
1305        memblock_trim_memory(PAGE_SIZE);
1306
1307        memblock_dump_all();
1308}
1309