linux/arch/x86/kernel/e820.c
<<
>>
Prefs
   1/*
   2 * Low level x86 E820 memory map handling functions.
   3 *
   4 * The firmware and bootloader passes us the "E820 table", which is the primary
   5 * physical memory layout description available about x86 systems.
   6 *
   7 * The kernel takes the E820 memory layout and optionally modifies it with
   8 * quirks and other tweaks, and feeds that into the generic Linux memory
   9 * allocation code routines via a platform independent interface (memblock, etc.).
  10 */
  11#include <linux/crash_dump.h>
  12#include <linux/bootmem.h>
  13#include <linux/suspend.h>
  14#include <linux/acpi.h>
  15#include <linux/firmware-map.h>
  16#include <linux/memblock.h>
  17#include <linux/sort.h>
  18
  19#include <asm/e820/api.h>
  20#include <asm/setup.h>
  21
  22/*
  23 * We organize the E820 table into three main data structures:
  24 *
  25 * - 'e820_table_firmware': the original firmware version passed to us by the
  26 *   bootloader - not modified by the kernel. It is composed of two parts:
  27 *   the first 128 E820 memory entries in boot_params.e820_table and the remaining
  28 *   (if any) entries of the SETUP_E820_EXT nodes. We use this to:
  29 *
  30 *       - inform the user about the firmware's notion of memory layout
  31 *         via /sys/firmware/memmap
  32 *
  33 *       - the hibernation code uses it to generate a kernel-independent MD5
  34 *         fingerprint of the physical memory layout of a system.
  35 *
  36 * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
  37 *   passed to us by the bootloader - the major difference between
  38 *   e820_table_firmware[] and this one is that, the latter marks the setup_data
  39 *   list created by the EFI boot stub as reserved, so that kexec can reuse the
  40 *   setup_data information in the second kernel. Besides, e820_table_kexec[]
  41 *   might also be modified by the kexec itself to fake a mptable.
  42 *   We use this to:
  43 *
  44 *       - kexec, which is a bootloader in disguise, uses the original E820
  45 *         layout to pass to the kexec-ed kernel. This way the original kernel
  46 *         can have a restricted E820 map while the kexec()-ed kexec-kernel
  47 *         can have access to full memory - etc.
  48 *
  49 * - 'e820_table': this is the main E820 table that is massaged by the
  50 *   low level x86 platform code, or modified by boot parameters, before
  51 *   passed on to higher level MM layers.
  52 *
  53 * Once the E820 map has been converted to the standard Linux memory layout
  54 * information its role stops - modifying it has no effect and does not get
   55 * re-propagated. So its main role is a temporary bootstrap storage of firmware
  56 * specific memory layout data during early bootup.
  57 */
  58static struct e820_table e820_table_init                __initdata;
  59static struct e820_table e820_table_kexec_init          __initdata;
  60static struct e820_table e820_table_firmware_init       __initdata;
  61
  62struct e820_table *e820_table __refdata                 = &e820_table_init;
  63struct e820_table *e820_table_kexec __refdata           = &e820_table_kexec_init;
  64struct e820_table *e820_table_firmware __refdata        = &e820_table_firmware_init;
  65
  66/* For PCI or other memory-mapped resources */
  67unsigned long pci_mem_start = 0xaeedbabe;
  68#ifdef CONFIG_PCI
  69EXPORT_SYMBOL(pci_mem_start);
  70#endif
  71
  72/*
  73 * This function checks if any part of the range <start,end> is mapped
  74 * with type.
  75 */
  76bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
  77{
  78        int i;
  79
  80        for (i = 0; i < e820_table->nr_entries; i++) {
  81                struct e820_entry *entry = &e820_table->entries[i];
  82
  83                if (type && entry->type != type)
  84                        continue;
  85                if (entry->addr >= end || entry->addr + entry->size <= start)
  86                        continue;
  87                return 1;
  88        }
  89        return 0;
  90}
  91EXPORT_SYMBOL_GPL(e820__mapped_any);
  92
  93/*
  94 * This function checks if the entire <start,end> range is mapped with 'type'.
  95 *
  96 * Note: this function only works correctly once the E820 table is sorted and
  97 * not-overlapping (at least for the range specified), which is the case normally.
  98 */
  99bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
 100{
 101        int i;
 102
 103        for (i = 0; i < e820_table->nr_entries; i++) {
 104                struct e820_entry *entry = &e820_table->entries[i];
 105
 106                if (type && entry->type != type)
 107                        continue;
 108
 109                /* Is the region (part) in overlap with the current region? */
 110                if (entry->addr >= end || entry->addr + entry->size <= start)
 111                        continue;
 112
 113                /*
 114                 * If the region is at the beginning of <start,end> we move
 115                 * 'start' to the end of the region since it's ok until there
 116                 */
 117                if (entry->addr <= start)
 118                        start = entry->addr + entry->size;
 119
 120                /*
 121                 * If 'start' is now at or beyond 'end', we're done, full
 122                 * coverage of the desired range exists:
 123                 */
 124                if (start >= end)
 125                        return 1;
 126        }
 127        return 0;
 128}
 129
 130/*
 131 * Add a memory region to the kernel E820 map.
 132 */
 133static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
 134{
 135        int x = table->nr_entries;
 136
 137        if (x >= ARRAY_SIZE(table->entries)) {
 138                pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
 139                return;
 140        }
 141
 142        table->entries[x].addr = start;
 143        table->entries[x].size = size;
 144        table->entries[x].type = type;
 145        table->nr_entries++;
 146}
 147
 148void __init e820__range_add(u64 start, u64 size, enum e820_type type)
 149{
 150        __e820__range_add(e820_table, start, size, type);
 151}
 152
 153static void __init e820_print_type(enum e820_type type)
 154{
 155        switch (type) {
 156        case E820_TYPE_RAM:             /* Fall through: */
 157        case E820_TYPE_RESERVED_KERN:   pr_cont("usable");                      break;
 158        case E820_TYPE_RESERVED:        pr_cont("reserved");                    break;
 159        case E820_TYPE_ACPI:            pr_cont("ACPI data");                   break;
 160        case E820_TYPE_NVS:             pr_cont("ACPI NVS");                    break;
 161        case E820_TYPE_UNUSABLE:        pr_cont("unusable");                    break;
 162        case E820_TYPE_PMEM:            /* Fall through: */
 163        case E820_TYPE_PRAM:            pr_cont("persistent (type %u)", type);  break;
 164        default:                        pr_cont("type %u", type);               break;
 165        }
 166}
 167
 168void __init e820__print_table(char *who)
 169{
 170        int i;
 171
 172        for (i = 0; i < e820_table->nr_entries; i++) {
 173                pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
 174                       e820_table->entries[i].addr,
 175                       e820_table->entries[i].addr + e820_table->entries[i].size - 1);
 176
 177                e820_print_type(e820_table->entries[i].type);
 178                pr_cont("\n");
 179        }
 180}
 181
 182/*
 183 * Sanitize an E820 map.
 184 *
 185 * Some E820 layouts include overlapping entries. The following
 186 * replaces the original E820 map with a new one, removing overlaps,
 187 * and resolving conflicting memory types in favor of highest
 188 * numbered type.
 189 *
  190 * The input parameter 'table' points to a 'struct e820_table' whose
  191 * entries[] array has elements in the range [0, table->nr_entries)
  192 * valid on entry, and which has space for ARRAY_SIZE(table->entries)
  193 * entries. On return, the resulting sanitized E820 map entries will
  194 * be overwritten in the same location, starting at table->entries.
  195 *
  196 * table->nr_entries must be valid on entry (the current number of
  197 * valid entries in table->entries). If the sanitizing succeeds,
  198 * table->nr_entries will be updated with the new number of valid
  199 * entries (something no more than ARRAY_SIZE(table->entries)).
 200 *
 201 * The return value from e820__update_table() is zero if it
 202 * successfully 'sanitized' the map entries passed in, and is -1
 203 * if it did nothing, which can happen if either of (1) it was
 204 * only passed one map entry, or (2) any of the input map entries
 205 * were invalid (start + size < start, meaning that the size was
 206 * so big the described memory range wrapped around through zero.)
 207 *
 208 *      Visually we're performing the following
 209 *      (1,2,3,4 = memory types)...
 210 *
 211 *      Sample memory map (w/overlaps):
 212 *         ____22__________________
 213 *         ______________________4_
 214 *         ____1111________________
 215 *         _44_____________________
 216 *         11111111________________
 217 *         ____________________33__
 218 *         ___________44___________
 219 *         __________33333_________
 220 *         ______________22________
 221 *         ___________________2222_
 222 *         _________111111111______
 223 *         _____________________11_
 224 *         _________________4______
 225 *
 226 *      Sanitized equivalent (no overlap):
 227 *         1_______________________
 228 *         _44_____________________
 229 *         ___1____________________
 230 *         ____22__________________
 231 *         ______11________________
 232 *         _________1______________
 233 *         __________3_____________
 234 *         ___________44___________
 235 *         _____________33_________
 236 *         _______________2________
 237 *         ________________1_______
 238 *         _________________4______
 239 *         ___________________2____
 240 *         ____________________33__
 241 *         ______________________4_
 242 */
 243struct change_member {
 244        /* Pointer to the original entry: */
 245        struct e820_entry       *entry;
 246        /* Address for this change point: */
 247        unsigned long long      addr;
 248};
 249
 250static struct change_member     change_point_list[2*E820_MAX_ENTRIES]   __initdata;
 251static struct change_member     *change_point[2*E820_MAX_ENTRIES]       __initdata;
 252static struct e820_entry        *overlap_list[E820_MAX_ENTRIES]         __initdata;
 253static struct e820_entry        new_entries[E820_MAX_ENTRIES]           __initdata;
 254
 255static int __init cpcompare(const void *a, const void *b)
 256{
 257        struct change_member * const *app = a, * const *bpp = b;
 258        const struct change_member *ap = *app, *bp = *bpp;
 259
 260        /*
 261         * Inputs are pointers to two elements of change_point[].  If their
 262         * addresses are not equal, their difference dominates.  If the addresses
 263         * are equal, then consider one that represents the end of its region
 264         * to be greater than one that does not.
 265         */
 266        if (ap->addr != bp->addr)
 267                return ap->addr > bp->addr ? 1 : -1;
 268
 269        return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
 270}
 271
 272int __init e820__update_table(struct e820_table *table)
 273{
 274        struct e820_entry *entries = table->entries;
 275        u32 max_nr_entries = ARRAY_SIZE(table->entries);
 276        enum e820_type current_type, last_type;
 277        unsigned long long last_addr;
 278        u32 new_nr_entries, overlap_entries;
 279        u32 i, chg_idx, chg_nr;
 280
 281        /* If there's only one memory region, don't bother: */
 282        if (table->nr_entries < 2)
 283                return -1;
 284
 285        BUG_ON(table->nr_entries > max_nr_entries);
 286
 287        /* Bail out if we find any unreasonable addresses in the map: */
 288        for (i = 0; i < table->nr_entries; i++) {
 289                if (entries[i].addr + entries[i].size < entries[i].addr)
 290                        return -1;
 291        }
 292
 293        /* Create pointers for initial change-point information (for sorting): */
 294        for (i = 0; i < 2 * table->nr_entries; i++)
 295                change_point[i] = &change_point_list[i];
 296
 297        /*
 298         * Record all known change-points (starting and ending addresses),
 299         * omitting empty memory regions:
 300         */
 301        chg_idx = 0;
 302        for (i = 0; i < table->nr_entries; i++) {
 303                if (entries[i].size != 0) {
 304                        change_point[chg_idx]->addr     = entries[i].addr;
 305                        change_point[chg_idx++]->entry  = &entries[i];
 306                        change_point[chg_idx]->addr     = entries[i].addr + entries[i].size;
 307                        change_point[chg_idx++]->entry  = &entries[i];
 308                }
 309        }
 310        chg_nr = chg_idx;
 311
 312        /* Sort change-point list by memory addresses (low -> high): */
 313        sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
 314
 315        /* Create a new memory map, removing overlaps: */
 316        overlap_entries = 0;     /* Number of entries in the overlap table */
 317        new_nr_entries = 0;      /* Index for creating new map entries */
 318        last_type = 0;           /* Start with undefined memory type */
 319        last_addr = 0;           /* Start with 0 as last starting address */
 320
 321        /* Loop through change-points, determining effect on the new map: */
 322        for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
 323                /* Keep track of all overlapping entries */
 324                if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
 325                        /* Add map entry to overlap list (> 1 entry implies an overlap) */
 326                        overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
 327                } else {
 328                        /* Remove entry from list (order independent, so swap with last): */
 329                        for (i = 0; i < overlap_entries; i++) {
 330                                if (overlap_list[i] == change_point[chg_idx]->entry)
 331                                        overlap_list[i] = overlap_list[overlap_entries-1];
 332                        }
 333                        overlap_entries--;
 334                }
 335                /*
 336                 * If there are overlapping entries, decide which
 337                 * "type" to use (larger value takes precedence --
 338                 * 1=usable, 2,3,4,4+=unusable)
 339                 */
 340                current_type = 0;
 341                for (i = 0; i < overlap_entries; i++) {
 342                        if (overlap_list[i]->type > current_type)
 343                                current_type = overlap_list[i]->type;
 344                }
 345
 346                /* Continue building up new map based on this information: */
 347                if (current_type != last_type || current_type == E820_TYPE_PRAM) {
 348                        if (last_type != 0)      {
 349                                new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
 350                                /* Move forward only if the new size was non-zero: */
 351                                if (new_entries[new_nr_entries].size != 0)
 352                                        /* No more space left for new entries? */
 353                                        if (++new_nr_entries >= max_nr_entries)
 354                                                break;
 355                        }
 356                        if (current_type != 0)  {
 357                                new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
 358                                new_entries[new_nr_entries].type = current_type;
 359                                last_addr = change_point[chg_idx]->addr;
 360                        }
 361                        last_type = current_type;
 362                }
 363        }
 364
 365        /* Copy the new entries into the original location: */
 366        memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
 367        table->nr_entries = new_nr_entries;
 368
 369        return 0;
 370}
 371
 372static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 373{
 374        struct boot_e820_entry *entry = entries;
 375
 376        while (nr_entries) {
 377                u64 start = entry->addr;
 378                u64 size = entry->size;
 379                u64 end = start + size - 1;
 380                u32 type = entry->type;
 381
 382                /* Ignore the entry on 64-bit overflow: */
 383                if (start > end && likely(size))
 384                        return -1;
 385
 386                e820__range_add(start, size, type);
 387
 388                entry++;
 389                nr_entries--;
 390        }
 391        return 0;
 392}
 393
 394/*
 395 * Copy the BIOS E820 map into a safe place.
 396 *
 397 * Sanity-check it while we're at it..
 398 *
 399 * If we're lucky and live on a modern system, the setup code
 400 * will have given us a memory map that we can use to properly
 401 * set up memory.  If we aren't, we'll fake a memory map.
 402 */
 403static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 404{
 405        /* Only one memory region (or negative)? Ignore it */
 406        if (nr_entries < 2)
 407                return -1;
 408
 409        return __append_e820_table(entries, nr_entries);
 410}
 411
 412static u64 __init
 413__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 414{
 415        u64 end;
 416        unsigned int i;
 417        u64 real_updated_size = 0;
 418
 419        BUG_ON(old_type == new_type);
 420
 421        if (size > (ULLONG_MAX - start))
 422                size = ULLONG_MAX - start;
 423
 424        end = start + size;
 425        printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
 426        e820_print_type(old_type);
 427        pr_cont(" ==> ");
 428        e820_print_type(new_type);
 429        pr_cont("\n");
 430
 431        for (i = 0; i < table->nr_entries; i++) {
 432                struct e820_entry *entry = &table->entries[i];
 433                u64 final_start, final_end;
 434                u64 entry_end;
 435
 436                if (entry->type != old_type)
 437                        continue;
 438
 439                entry_end = entry->addr + entry->size;
 440
 441                /* Completely covered by new range? */
 442                if (entry->addr >= start && entry_end <= end) {
 443                        entry->type = new_type;
 444                        real_updated_size += entry->size;
 445                        continue;
 446                }
 447
 448                /* New range is completely covered? */
 449                if (entry->addr < start && entry_end > end) {
 450                        __e820__range_add(table, start, size, new_type);
 451                        __e820__range_add(table, end, entry_end - end, entry->type);
 452                        entry->size = start - entry->addr;
 453                        real_updated_size += size;
 454                        continue;
 455                }
 456
 457                /* Partially covered: */
 458                final_start = max(start, entry->addr);
 459                final_end = min(end, entry_end);
 460                if (final_start >= final_end)
 461                        continue;
 462
 463                __e820__range_add(table, final_start, final_end - final_start, new_type);
 464
 465                real_updated_size += final_end - final_start;
 466
 467                /*
 468                 * Left range could be head or tail, so need to update
 469                 * its size first:
 470                 */
 471                entry->size -= final_end - final_start;
 472                if (entry->addr < final_start)
 473                        continue;
 474
 475                entry->addr = final_end;
 476        }
 477        return real_updated_size;
 478}
 479
 480u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 481{
 482        return __e820__range_update(e820_table, start, size, old_type, new_type);
 483}
 484
 485static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
 486{
 487        return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
 488}
 489
 490/* Remove a range of memory from the E820 table: */
 491u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
 492{
 493        int i;
 494        u64 end;
 495        u64 real_removed_size = 0;
 496
 497        if (size > (ULLONG_MAX - start))
 498                size = ULLONG_MAX - start;
 499
 500        end = start + size;
 501        printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
 502        if (check_type)
 503                e820_print_type(old_type);
 504        pr_cont("\n");
 505
 506        for (i = 0; i < e820_table->nr_entries; i++) {
 507                struct e820_entry *entry = &e820_table->entries[i];
 508                u64 final_start, final_end;
 509                u64 entry_end;
 510
 511                if (check_type && entry->type != old_type)
 512                        continue;
 513
 514                entry_end = entry->addr + entry->size;
 515
 516                /* Completely covered? */
 517                if (entry->addr >= start && entry_end <= end) {
 518                        real_removed_size += entry->size;
 519                        memset(entry, 0, sizeof(*entry));
 520                        continue;
 521                }
 522
 523                /* Is the new range completely covered? */
 524                if (entry->addr < start && entry_end > end) {
 525                        e820__range_add(end, entry_end - end, entry->type);
 526                        entry->size = start - entry->addr;
 527                        real_removed_size += size;
 528                        continue;
 529                }
 530
 531                /* Partially covered: */
 532                final_start = max(start, entry->addr);
 533                final_end = min(end, entry_end);
 534                if (final_start >= final_end)
 535                        continue;
 536
 537                real_removed_size += final_end - final_start;
 538
 539                /*
 540                 * Left range could be head or tail, so need to update
 541                 * the size first:
 542                 */
 543                entry->size -= final_end - final_start;
 544                if (entry->addr < final_start)
 545                        continue;
 546
 547                entry->addr = final_end;
 548        }
 549        return real_removed_size;
 550}
 551
 552void __init e820__update_table_print(void)
 553{
 554        if (e820__update_table(e820_table))
 555                return;
 556
 557        pr_info("e820: modified physical RAM map:\n");
 558        e820__print_table("modified");
 559}
 560
 561static void __init e820__update_table_kexec(void)
 562{
 563        e820__update_table(e820_table_kexec);
 564}
 565
 566#define MAX_GAP_END 0x100000000ull
 567
 568/*
 569 * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
 570 */
 571static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
 572{
 573        unsigned long long last = MAX_GAP_END;
 574        int i = e820_table->nr_entries;
 575        int found = 0;
 576
 577        while (--i >= 0) {
 578                unsigned long long start = e820_table->entries[i].addr;
 579                unsigned long long end = start + e820_table->entries[i].size;
 580
 581                /*
 582                 * Since "last" is at most 4GB, we know we'll
 583                 * fit in 32 bits if this condition is true:
 584                 */
 585                if (last > end) {
 586                        unsigned long gap = last - end;
 587
 588                        if (gap >= *gapsize) {
 589                                *gapsize = gap;
 590                                *gapstart = end;
 591                                found = 1;
 592                        }
 593                }
 594                if (start < last)
 595                        last = start;
 596        }
 597        return found;
 598}
 599
 600/*
 601 * Search for the biggest gap in the low 32 bits of the E820
 602 * memory space. We pass this space to the PCI subsystem, so
 603 * that it can assign MMIO resources for hotplug or
 604 * unconfigured devices in.
 605 *
 606 * Hopefully the BIOS let enough space left.
 607 */
 608__init void e820__setup_pci_gap(void)
 609{
 610        unsigned long gapstart, gapsize;
 611        int found;
 612
 613        gapsize = 0x400000;
 614        found  = e820_search_gap(&gapstart, &gapsize);
 615
 616        if (!found) {
 617#ifdef CONFIG_X86_64
 618                gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 619                pr_err(
 620                        "e820: Cannot find an available gap in the 32-bit address range\n"
 621                        "e820: PCI devices with unassigned 32-bit BARs may not work!\n");
 622#else
 623                gapstart = 0x10000000;
 624#endif
 625        }
 626
 627        /*
 628         * e820__reserve_resources_late() protects stolen RAM already:
 629         */
 630        pci_mem_start = gapstart;
 631
 632        pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
 633}
 634
 635/*
 636 * Called late during init, in free_initmem().
 637 *
 638 * Initial e820_table and e820_table_kexec are largish __initdata arrays.
 639 *
 640 * Copy them to a (usually much smaller) dynamically allocated area that is
 641 * sized precisely after the number of e820 entries.
 642 *
 643 * This is done after we've performed all the fixes and tweaks to the tables.
 644 * All functions which modify them are __init functions, which won't exist
 645 * after free_initmem().
 646 */
 647__init void e820__reallocate_tables(void)
 648{
 649        struct e820_table *n;
 650        int size;
 651
 652        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
 653        n = kmalloc(size, GFP_KERNEL);
 654        BUG_ON(!n);
 655        memcpy(n, e820_table, size);
 656        e820_table = n;
 657
 658        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
 659        n = kmalloc(size, GFP_KERNEL);
 660        BUG_ON(!n);
 661        memcpy(n, e820_table_kexec, size);
 662        e820_table_kexec = n;
 663
 664        size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
 665        n = kmalloc(size, GFP_KERNEL);
 666        BUG_ON(!n);
 667        memcpy(n, e820_table_firmware, size);
 668        e820_table_firmware = n;
 669}
 670
 671/*
 672 * Because of the small fixed size of struct boot_params, only the first
 673 * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
 674 * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
 675 * struct setup_data, which is parsed here.
 676 */
 677void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
 678{
 679        int entries;
 680        struct boot_e820_entry *extmap;
 681        struct setup_data *sdata;
 682
 683        sdata = early_memremap(phys_addr, data_len);
 684        entries = sdata->len / sizeof(*extmap);
 685        extmap = (struct boot_e820_entry *)(sdata->data);
 686
 687        __append_e820_table(extmap, entries);
 688        e820__update_table(e820_table);
 689
 690        memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
 691        memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
 692
 693        early_memunmap(sdata, data_len);
 694        pr_info("e820: extended physical RAM map:\n");
 695        e820__print_table("extended");
 696}
 697
 698/*
 699 * Find the ranges of physical addresses that do not correspond to
 700 * E820 RAM areas and register the corresponding pages as 'nosave' for
 701 * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
 702 *
 703 * This function requires the E820 map to be sorted and without any
 704 * overlapping entries.
 705 */
 706void __init e820__register_nosave_regions(unsigned long limit_pfn)
 707{
 708        int i;
 709        unsigned long pfn = 0;
 710
 711        for (i = 0; i < e820_table->nr_entries; i++) {
 712                struct e820_entry *entry = &e820_table->entries[i];
 713
 714                if (pfn < PFN_UP(entry->addr))
 715                        register_nosave_region(pfn, PFN_UP(entry->addr));
 716
 717                pfn = PFN_DOWN(entry->addr + entry->size);
 718
 719                if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
 720                        register_nosave_region(PFN_UP(entry->addr), pfn);
 721
 722                if (pfn >= limit_pfn)
 723                        break;
 724        }
 725}
 726
 727#ifdef CONFIG_ACPI
 728/*
 729 * Register ACPI NVS memory regions, so that we can save/restore them during
 730 * hibernation and the subsequent resume:
 731 */
 732static int __init e820__register_nvs_regions(void)
 733{
 734        int i;
 735
 736        for (i = 0; i < e820_table->nr_entries; i++) {
 737                struct e820_entry *entry = &e820_table->entries[i];
 738
 739                if (entry->type == E820_TYPE_NVS)
 740                        acpi_nvs_register(entry->addr, entry->size);
 741        }
 742
 743        return 0;
 744}
 745core_initcall(e820__register_nvs_regions);
 746#endif
 747
 748/*
 749 * Allocate the requested number of bytes with the requsted alignment
 750 * and return (the physical address) to the caller. Also register this
 751 * range in the 'kexec' E820 table as a reserved range.
 752 *
 753 * This allows kexec to fake a new mptable, as if it came from the real
 754 * system.
 755 */
 756u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 757{
 758        u64 addr;
 759
 760        addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 761        if (addr) {
 762                e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
 763                pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n");
 764                e820__update_table_kexec();
 765        }
 766
 767        return addr;
 768}
 769
 770#ifdef CONFIG_X86_32
 771# ifdef CONFIG_X86_PAE
 772#  define MAX_ARCH_PFN          (1ULL<<(36-PAGE_SHIFT))
 773# else
 774#  define MAX_ARCH_PFN          (1ULL<<(32-PAGE_SHIFT))
 775# endif
 776#else /* CONFIG_X86_32 */
 777# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
 778#endif
 779
 780/*
 781 * Find the highest page frame number we have available
 782 */
 783static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
 784{
 785        int i;
 786        unsigned long last_pfn = 0;
 787        unsigned long max_arch_pfn = MAX_ARCH_PFN;
 788
 789        for (i = 0; i < e820_table->nr_entries; i++) {
 790                struct e820_entry *entry = &e820_table->entries[i];
 791                unsigned long start_pfn;
 792                unsigned long end_pfn;
 793
 794                if (entry->type != type)
 795                        continue;
 796
 797                start_pfn = entry->addr >> PAGE_SHIFT;
 798                end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
 799
 800                if (start_pfn >= limit_pfn)
 801                        continue;
 802                if (end_pfn > limit_pfn) {
 803                        last_pfn = limit_pfn;
 804                        break;
 805                }
 806                if (end_pfn > last_pfn)
 807                        last_pfn = end_pfn;
 808        }
 809
 810        if (last_pfn > max_arch_pfn)
 811                last_pfn = max_arch_pfn;
 812
 813        pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
 814                         last_pfn, max_arch_pfn);
 815        return last_pfn;
 816}
 817
 818unsigned long __init e820__end_of_ram_pfn(void)
 819{
 820        return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
 821}
 822
 823unsigned long __init e820__end_of_low_ram_pfn(void)
 824{
 825        return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
 826}
 827
 828static void __init early_panic(char *msg)
 829{
 830        early_printk(msg);
 831        panic(msg);
 832}
 833
 834static int userdef __initdata;
 835
 836/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
 837static int __init parse_memopt(char *p)
 838{
 839        u64 mem_size;
 840
 841        if (!p)
 842                return -EINVAL;
 843
 844        if (!strcmp(p, "nopentium")) {
 845#ifdef CONFIG_X86_32
 846                setup_clear_cpu_cap(X86_FEATURE_PSE);
 847                return 0;
 848#else
 849                pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
 850                return -EINVAL;
 851#endif
 852        }
 853
 854        userdef = 1;
 855        mem_size = memparse(p, &p);
 856
 857        /* Don't remove all memory when getting "mem={invalid}" parameter: */
 858        if (mem_size == 0)
 859                return -EINVAL;
 860
 861        e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 862
 863        return 0;
 864}
 865early_param("mem", parse_memopt);
 866
 867static int __init parse_memmap_one(char *p)
 868{
 869        char *oldp;
 870        u64 start_at, mem_size;
 871
 872        if (!p)
 873                return -EINVAL;
 874
 875        if (!strncmp(p, "exactmap", 8)) {
 876#ifdef CONFIG_CRASH_DUMP
 877                /*
 878                 * If we are doing a crash dump, we still need to know
 879                 * the real memory size before the original memory map is
 880                 * reset.
 881                 */
 882                saved_max_pfn = e820__end_of_ram_pfn();
 883#endif
 884                e820_table->nr_entries = 0;
 885                userdef = 1;
 886                return 0;
 887        }
 888
 889        oldp = p;
 890        mem_size = memparse(p, &p);
 891        if (p == oldp)
 892                return -EINVAL;
 893
 894        userdef = 1;
 895        if (*p == '@') {
 896                start_at = memparse(p+1, &p);
 897                e820__range_add(start_at, mem_size, E820_TYPE_RAM);
 898        } else if (*p == '#') {
 899                start_at = memparse(p+1, &p);
 900                e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
 901        } else if (*p == '$') {
 902                start_at = memparse(p+1, &p);
 903                e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
 904        } else if (*p == '!') {
 905                start_at = memparse(p+1, &p);
 906                e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
 907        } else {
 908                e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 909        }
 910
 911        return *p == '\0' ? 0 : -EINVAL;
 912}
 913
 914static int __init parse_memmap_opt(char *str)
 915{
 916        while (str) {
 917                char *k = strchr(str, ',');
 918
 919                if (k)
 920                        *k++ = 0;
 921
 922                parse_memmap_one(str);
 923                str = k;
 924        }
 925
 926        return 0;
 927}
 928early_param("memmap", parse_memmap_opt);
 929
 930/*
 931 * Reserve all entries from the bootloader's extensible data nodes list,
 932 * because if present we are going to use it later on to fetch e820
 933 * entries from it:
 934 */
 935void __init e820__reserve_setup_data(void)
 936{
 937        struct setup_data *data;
 938        u64 pa_data;
 939
 940        pa_data = boot_params.hdr.setup_data;
 941        if (!pa_data)
 942                return;
 943
 944        while (pa_data) {
 945                data = early_memremap(pa_data, sizeof(*data));
 946                e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
 947                e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
 948                pa_data = data->next;
 949                early_memunmap(data, sizeof(*data));
 950        }
 951
 952        e820__update_table(e820_table);
 953        e820__update_table(e820_table_kexec);
 954
 955        pr_info("extended physical RAM map:\n");
 956        e820__print_table("reserve setup_data");
 957}
 958
 959/*
 960 * Called after parse_early_param(), after early parameters (such as mem=)
 961 * have been processed, in which case we already have an E820 table filled in
 962 * via the parameter callback function(s), but it's not sorted and printed yet:
 963 */
 964void __init e820__finish_early_params(void)
 965{
 966        if (userdef) {
 967                if (e820__update_table(e820_table) < 0)
 968                        early_panic("Invalid user supplied memory map");
 969
 970                pr_info("e820: user-defined physical RAM map:\n");
 971                e820__print_table("user");
 972        }
 973}
 974
 975static const char *__init e820_type_to_string(struct e820_entry *entry)
 976{
 977        switch (entry->type) {
 978        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
 979        case E820_TYPE_RAM:             return "System RAM";
 980        case E820_TYPE_ACPI:            return "ACPI Tables";
 981        case E820_TYPE_NVS:             return "ACPI Non-volatile Storage";
 982        case E820_TYPE_UNUSABLE:        return "Unusable memory";
 983        case E820_TYPE_PRAM:            return "Persistent Memory (legacy)";
 984        case E820_TYPE_PMEM:            return "Persistent Memory";
 985        case E820_TYPE_RESERVED:        return "Reserved";
 986        default:                        return "Unknown E820 type";
 987        }
 988}
 989
 990static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
 991{
 992        switch (entry->type) {
 993        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
 994        case E820_TYPE_RAM:             return IORESOURCE_SYSTEM_RAM;
 995        case E820_TYPE_ACPI:            /* Fall-through: */
 996        case E820_TYPE_NVS:             /* Fall-through: */
 997        case E820_TYPE_UNUSABLE:        /* Fall-through: */
 998        case E820_TYPE_PRAM:            /* Fall-through: */
 999        case E820_TYPE_PMEM:            /* Fall-through: */
1000        case E820_TYPE_RESERVED:        /* Fall-through: */
1001        default:                        return IORESOURCE_MEM;
1002        }
1003}
1004
1005static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
1006{
1007        switch (entry->type) {
1008        case E820_TYPE_ACPI:            return IORES_DESC_ACPI_TABLES;
1009        case E820_TYPE_NVS:             return IORES_DESC_ACPI_NV_STORAGE;
1010        case E820_TYPE_PMEM:            return IORES_DESC_PERSISTENT_MEMORY;
1011        case E820_TYPE_PRAM:            return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
1012        case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
1013        case E820_TYPE_RAM:             /* Fall-through: */
1014        case E820_TYPE_UNUSABLE:        /* Fall-through: */
1015        case E820_TYPE_RESERVED:        /* Fall-through: */
1016        default:                        return IORES_DESC_NONE;
1017        }
1018}
1019
1020static bool __init do_mark_busy(enum e820_type type, struct resource *res)
1021{
1022        /* this is the legacy bios/dos rom-shadow + mmio region */
1023        if (res->start < (1ULL<<20))
1024                return true;
1025
1026        /*
1027         * Treat persistent memory like device memory, i.e. reserve it
1028         * for exclusive use of a driver
1029         */
1030        switch (type) {
1031        case E820_TYPE_RESERVED:
1032        case E820_TYPE_PRAM:
1033        case E820_TYPE_PMEM:
1034                return false;
1035        case E820_TYPE_RESERVED_KERN:
1036        case E820_TYPE_RAM:
1037        case E820_TYPE_ACPI:
1038        case E820_TYPE_NVS:
1039        case E820_TYPE_UNUSABLE:
1040        default:
1041                return true;
1042        }
1043}
1044
1045/*
1046 * Mark E820 reserved areas as busy for the resource manager:
1047 */
1048
1049static struct resource __initdata *e820_res;
1050
1051void __init e820__reserve_resources(void)
1052{
1053        int i;
1054        struct resource *res;
1055        u64 end;
1056
1057        res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
1058        e820_res = res;
1059
1060        for (i = 0; i < e820_table->nr_entries; i++) {
1061                struct e820_entry *entry = e820_table->entries + i;
1062
1063                end = entry->addr + entry->size - 1;
1064                if (end != (resource_size_t)end) {
1065                        res++;
1066                        continue;
1067                }
1068                res->start = entry->addr;
1069                res->end   = end;
1070                res->name  = e820_type_to_string(entry);
1071                res->flags = e820_type_to_iomem_type(entry);
1072                res->desc  = e820_type_to_iores_desc(entry);
1073
1074                /*
1075                 * Don't register the region that could be conflicted with
1076                 * PCI device BAR resources and insert them later in
1077                 * pcibios_resource_survey():
1078                 */
1079                if (do_mark_busy(entry->type, res)) {
1080                        res->flags |= IORESOURCE_BUSY;
1081                        insert_resource(&iomem_resource, res);
1082                }
1083                res++;
1084        }
1085
1086        /* Expose the bootloader-provided memory layout to the sysfs. */
1087        for (i = 0; i < e820_table_firmware->nr_entries; i++) {
1088                struct e820_entry *entry = e820_table_firmware->entries + i;
1089
1090                firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
1091        }
1092}
1093
1094/*
1095 * How much should we pad the end of RAM, depending on where it is?
1096 */
1097static unsigned long __init ram_alignment(resource_size_t pos)
1098{
1099        unsigned long mb = pos >> 20;
1100
1101        /* To 64kB in the first megabyte */
1102        if (!mb)
1103                return 64*1024;
1104
1105        /* To 1MB in the first 16MB */
1106        if (mb < 16)
1107                return 1024*1024;
1108
1109        /* To 64MB for anything above that */
1110        return 64*1024*1024;
1111}
1112
1113#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1114
1115void __init e820__reserve_resources_late(void)
1116{
1117        int i;
1118        struct resource *res;
1119
1120        res = e820_res;
1121        for (i = 0; i < e820_table->nr_entries; i++) {
1122                if (!res->parent && res->end)
1123                        insert_resource_expand_to_fit(&iomem_resource, res);
1124                res++;
1125        }
1126
1127        /*
1128         * Try to bump up RAM regions to reasonable boundaries, to
1129         * avoid stolen RAM:
1130         */
1131        for (i = 0; i < e820_table->nr_entries; i++) {
1132                struct e820_entry *entry = &e820_table->entries[i];
1133                u64 start, end;
1134
1135                if (entry->type != E820_TYPE_RAM)
1136                        continue;
1137
1138                start = entry->addr + entry->size;
1139                end = round_up(start, ram_alignment(start)) - 1;
1140                if (end > MAX_RESOURCE_SIZE)
1141                        end = MAX_RESOURCE_SIZE;
1142                if (start >= end)
1143                        continue;
1144
1145                printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
1146                reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
1147        }
1148}
1149
1150/*
1151 * Pass the firmware (bootloader) E820 map to the kernel and process it:
1152 */
1153char *__init e820__memory_setup_default(void)
1154{
1155        char *who = "BIOS-e820";
1156
1157        /*
1158         * Try to copy the BIOS-supplied E820-map.
1159         *
1160         * Otherwise fake a memory map; one section from 0k->640k,
1161         * the next section from 1mb->appropriate_mem_k
1162         */
1163        if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
1164                u64 mem_size;
1165
1166                /* Compare results from other methods and take the one that gives more RAM: */
1167                if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
1168                        mem_size = boot_params.screen_info.ext_mem_k;
1169                        who = "BIOS-88";
1170                } else {
1171                        mem_size = boot_params.alt_mem_k;
1172                        who = "BIOS-e801";
1173                }
1174
1175                e820_table->nr_entries = 0;
1176                e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
1177                e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
1178        }
1179
1180        /* We just appended a lot of ranges, sanitize the table: */
1181        e820__update_table(e820_table);
1182
1183        return who;
1184}
1185
1186/*
1187 * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
1188 * E820 map - with an optional platform quirk available for virtual platforms
1189 * to override this method of boot environment processing:
1190 */
1191void __init e820__memory_setup(void)
1192{
1193        char *who;
1194
1195        /* This is a firmware interface ABI - make sure we don't break it: */
1196        BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
1197
1198        who = x86_init.resources.memory_setup();
1199
1200        memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
1201        memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
1202
1203        pr_info("e820: BIOS-provided physical RAM map:\n");
1204        e820__print_table(who);
1205}
1206
1207void __init e820__memblock_setup(void)
1208{
1209        int i;
1210        u64 end;
1211
1212        /*
1213         * The bootstrap memblock region count maximum is 128 entries
1214         * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
1215         * than that - so allow memblock resizing.
1216         *
1217         * This is safe, because this call happens pretty late during x86 setup,
1218         * so we know about reserved memory regions already. (This is important
1219         * so that memblock resizing does no stomp over reserved areas.)
1220         */
1221        memblock_allow_resize();
1222
1223        for (i = 0; i < e820_table->nr_entries; i++) {
1224                struct e820_entry *entry = &e820_table->entries[i];
1225
1226                end = entry->addr + entry->size;
1227                if (end != (resource_size_t)end)
1228                        continue;
1229
1230                if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
1231                        continue;
1232
1233                memblock_add(entry->addr, entry->size);
1234        }
1235
1236        /* Throw away partial pages: */
1237        memblock_trim_memory(PAGE_SIZE);
1238
1239        memblock_dump_all();
1240}
1241