linux/kernel/kexec.c
   1/*
   2 * kexec.c - kexec system call
   3 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4 *
   5 * This source code is licensed under the GNU General Public License,
   6 * Version 2.  See the file COPYING for more details.
   7 */
   8
   9#include <linux/capability.h>
  10#include <linux/mm.h>
  11#include <linux/file.h>
  12#include <linux/slab.h>
  13#include <linux/fs.h>
  14#include <linux/kexec.h>
  15#include <linux/mutex.h>
  16#include <linux/list.h>
  17#include <linux/highmem.h>
  18#include <linux/syscalls.h>
  19#include <linux/reboot.h>
  20#include <linux/ioport.h>
  21#include <linux/hardirq.h>
  22#include <linux/elf.h>
  23#include <linux/elfcore.h>
  24#include <generated/utsrelease.h>
  25#include <linux/utsname.h>
  26#include <linux/numa.h>
  27#include <linux/suspend.h>
  28#include <linux/device.h>
  29#include <linux/freezer.h>
  30#include <linux/pm.h>
  31#include <linux/cpu.h>
  32#include <linux/console.h>
  33#include <linux/vmalloc.h>
  34#include <linux/swap.h>
  35#include <linux/kmsg_dump.h>
  36
  37#include <asm/page.h>
  38#include <asm/uaccess.h>
  39#include <asm/io.h>
  40#include <asm/system.h>
  41#include <asm/sections.h>
  42
  43/* Per cpu memory for storing cpu states in case of system crash. */
  44note_buf_t __percpu *crash_notes;
  45
  46/* vmcoreinfo stuff */
  47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
  48u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
  49size_t vmcoreinfo_size;
  50size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
  51
  52/* Location of the reserved area for the crash kernel */
  53struct resource crashk_res = {
  54        .name  = "Crash kernel",
  55        .start = 0,
  56        .end   = 0,
  57        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  58};
  59
  60int kexec_should_crash(struct task_struct *p)
  61{
  62        if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
  63                return 1;
  64        return 0;
  65}
  66
  67/*
  68 * When kexec transitions to the new kernel there is a one-to-one
  69 * mapping between physical and virtual addresses.  On processors
  70 * where you can disable the MMU this is trivial and easy.  For
  71 * others it is still a simple predictable page table to setup.
  72 *
  73 * In that environment kexec copies the new kernel to its final
  74 * resting place.  This means I can only support memory whose
  75 * physical address can fit in an unsigned long.  In particular
  76 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  77 * If the assembly stub has more restrictive requirements
  78 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  79 * defined more restrictively in <asm/kexec.h>.
  80 *
  81 * The code for the transition from the current kernel to the
  82 * new kernel is placed in the control_code_buffer, whose size
  83 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
  84 * page of memory is necessary, but some architectures require more.
  85 * Because this memory must be identity mapped in the transition from
  86 * virtual to physical addresses it must live in the range
  87 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  88 * modifiable.
  89 *
  90 * The assembly stub in the control code buffer is passed a linked list
  91 * of descriptor pages detailing the source pages of the new kernel,
  92 * and the destination addresses of those source pages.  As this data
  93 * structure is not used in the context of the current OS, it must
  94 * be self-contained.
  95 *
  96 * The code has been made to work with highmem pages and will use a
  97 * destination page in its final resting place (if it happens
  98 * to allocate it).  The end product of this is that most of the
  99 * physical address space, and most of RAM can be used.
 100 *
 101 * Future directions include:
 102 *  - allocating a page table with the control code buffer identity
 103 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 104 *    reliable.
 105 */
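/*
 * Illustrative sketch only (not used by this file): the entry list handed
 * to the assembly stub is a flat sequence of kimage_entry_t values, each a
 * page-aligned physical address tagged with one of the IND_* flags from
 * <linux/kexec.h>.  A minimal walker over that list, mirroring the
 * for_each_kimage_entry() helper defined further down, might look like:
 */
#if 0	/* example only, never compiled */
static unsigned long kimage_count_source_pages(struct kimage *image)
{
        kimage_entry_t *ptr, entry;
        unsigned long nr = 0;

        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
             ptr = (entry & IND_INDIRECTION) ?
                        phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
                if (entry & IND_SOURCE)
                        nr++;
        }
        return nr;
}
#endif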
 106
 107/*
 108 * KIMAGE_NO_DEST is an impossible destination address, used for
 109 * allocating pages whose destination address we do not care about.
 110 */
 111#define KIMAGE_NO_DEST (-1UL)
 112
 113static int kimage_is_destination_range(struct kimage *image,
 114                                       unsigned long start, unsigned long end);
 115static struct page *kimage_alloc_page(struct kimage *image,
 116                                       gfp_t gfp_mask,
 117                                       unsigned long dest);
 118
 119static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 120                            unsigned long nr_segments,
 121                            struct kexec_segment __user *segments)
 122{
 123        size_t segment_bytes;
 124        struct kimage *image;
 125        unsigned long i;
 126        int result;
 127
 128        /* Allocate a controlling structure */
 129        result = -ENOMEM;
 130        image = kzalloc(sizeof(*image), GFP_KERNEL);
 131        if (!image)
 132                goto out;
 133
 134        image->head = 0;
 135        image->entry = &image->head;
 136        image->last_entry = &image->head;
 137        image->control_page = ~0; /* By default this does not apply */
 138        image->start = entry;
 139        image->type = KEXEC_TYPE_DEFAULT;
 140
 141        /* Initialize the list of control pages */
 142        INIT_LIST_HEAD(&image->control_pages);
 143
 144        /* Initialize the list of destination pages */
 145        INIT_LIST_HEAD(&image->dest_pages);
 146
 147        /* Initialize the list of unusable pages */
 148        INIT_LIST_HEAD(&image->unuseable_pages);
 149
 150        /* Read in the segments */
 151        image->nr_segments = nr_segments;
 152        segment_bytes = nr_segments * sizeof(*segments);
 153        result = copy_from_user(image->segment, segments, segment_bytes);
 154        if (result) {
 155                result = -EFAULT;
 156                goto out;
 157        }
 158
 159        /*
 160         * Verify we have good destination addresses.  The caller is
 161         * responsible for making certain we don't attempt to load
 162         * the new image into invalid or reserved areas of RAM.  This
 163         * just verifies it is an address we can use.
 164         *
 165         * Since the kernel does everything in page size chunks ensure
 166         * the destination addresses are page aligned.  Too many
 167         * special cases crop up when we don't do this.  The most
 168         * insidious is getting overlapping destination addresses
 169         * simply because addresses are changed to page size
 170         * granularity.
 171         */
 172        result = -EADDRNOTAVAIL;
 173        for (i = 0; i < nr_segments; i++) {
 174                unsigned long mstart, mend;
 175
 176                mstart = image->segment[i].mem;
 177                mend   = mstart + image->segment[i].memsz;
 178                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 179                        goto out;
 180                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 181                        goto out;
 182        }
 183
 184        /* Verify our destination addresses do not overlap.
 185         * If we allowed overlapping destination addresses
 186         * through, very weird things can happen with no
 187         * easy explanation as one segment stomps on another.
 188         */
 189        result = -EINVAL;
 190        for (i = 0; i < nr_segments; i++) {
 191                unsigned long mstart, mend;
 192                unsigned long j;
 193
 194                mstart = image->segment[i].mem;
 195                mend   = mstart + image->segment[i].memsz;
 196                for (j = 0; j < i; j++) {
 197                        unsigned long pstart, pend;
 198                        pstart = image->segment[j].mem;
 199                        pend   = pstart + image->segment[j].memsz;
 200                        /* Do the segments overlap ? */
 201                        if ((mend > pstart) && (mstart < pend))
 202                                goto out;
 203                }
 204        }
 205
 206        /* Ensure our buffer sizes are strictly less than
 207         * our memory sizes.  This should always be the case,
 208         * and it is easier to check up front than to be surprised
 209         * later on.
 210         */
 211        result = -EINVAL;
 212        for (i = 0; i < nr_segments; i++) {
 213                if (image->segment[i].bufsz > image->segment[i].memsz)
 214                        goto out;
 215        }
 216
 217        result = 0;
 218out:
 219        if (result == 0)
 220                *rimage = image;
 221        else
 222                kfree(image);
 223
 224        return result;
 225
 226}
 227
 228static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 229                                unsigned long nr_segments,
 230                                struct kexec_segment __user *segments)
 231{
 232        int result;
 233        struct kimage *image;
 234
 235        /* Allocate and initialize a controlling structure */
 236        image = NULL;
 237        result = do_kimage_alloc(&image, entry, nr_segments, segments);
 238        if (result)
 239                goto out;
 240
 241        *rimage = image;
 242
 243        /*
 244         * Find a location for the control code buffer, and add it to
 245         * the vector of segments so that its pages will also be
 246         * counted as destination pages.
 247         */
 248        result = -ENOMEM;
 249        image->control_code_page = kimage_alloc_control_pages(image,
 250                                           get_order(KEXEC_CONTROL_PAGE_SIZE));
 251        if (!image->control_code_page) {
 252                printk(KERN_ERR "Could not allocate control_code_buffer\n");
 253                goto out;
 254        }
 255
 256        image->swap_page = kimage_alloc_control_pages(image, 0);
 257        if (!image->swap_page) {
 258                printk(KERN_ERR "Could not allocate swap buffer\n");
 259                goto out;
 260        }
 261
 262        result = 0;
 263 out:
 264        if (result == 0)
 265                *rimage = image;
 266        else
 267                kfree(image);
 268
 269        return result;
 270}
 271
 272static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 273                                unsigned long nr_segments,
 274                                struct kexec_segment __user *segments)
 275{
 276        int result;
 277        struct kimage *image;
 278        unsigned long i;
 279
 280        image = NULL;
 281        /* Verify we have a valid entry point */
 282        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 283                result = -EADDRNOTAVAIL;
 284                goto out;
 285        }
 286
 287        /* Allocate and initialize a controlling structure */
 288        result = do_kimage_alloc(&image, entry, nr_segments, segments);
 289        if (result)
 290                goto out;
 291
 292        /* Enable the special crash kernel control page
 293         * allocation policy.
 294         */
 295        image->control_page = crashk_res.start;
 296        image->type = KEXEC_TYPE_CRASH;
 297
 298        /*
 299         * Verify we have good destination addresses.  Normally
 300         * the caller is responsible for making certain we don't
 301         * attempt to load the new image into invalid or reserved
 302         * areas of RAM.  But crash kernels are preloaded into a
 303         * reserved area of RAM.  We must ensure the addresses
 304         * are in the reserved area, otherwise preloading the
 305         * kernel could corrupt things.
 306         */
 307        result = -EADDRNOTAVAIL;
 308        for (i = 0; i < nr_segments; i++) {
 309                unsigned long mstart, mend;
 310
 311                mstart = image->segment[i].mem;
 312                mend = mstart + image->segment[i].memsz - 1;
 313                /* Ensure we are within the crash kernel limits */
 314                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 315                        goto out;
 316        }
 317
 318        /*
 319         * Find a location for the control code buffer, and add it to
 320         * the vector of segments so that its pages will also be
 321         * counted as destination pages.
 322         */
 323        result = -ENOMEM;
 324        image->control_code_page = kimage_alloc_control_pages(image,
 325                                           get_order(KEXEC_CONTROL_PAGE_SIZE));
 326        if (!image->control_code_page) {
 327                printk(KERN_ERR "Could not allocate control_code_buffer\n");
 328                goto out;
 329        }
 330
 331        result = 0;
 332out:
 333        if (result == 0)
 334                *rimage = image;
 335        else
 336                kfree(image);
 337
 338        return result;
 339}
 340
 341static int kimage_is_destination_range(struct kimage *image,
 342                                        unsigned long start,
 343                                        unsigned long end)
 344{
 345        unsigned long i;
 346
 347        for (i = 0; i < image->nr_segments; i++) {
 348                unsigned long mstart, mend;
 349
 350                mstart = image->segment[i].mem;
 351                mend = mstart + image->segment[i].memsz;
 352                if ((end > mstart) && (start < mend))
 353                        return 1;
 354        }
 355
 356        return 0;
 357}
 358
 359static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 360{
 361        struct page *pages;
 362
 363        pages = alloc_pages(gfp_mask, order);
 364        if (pages) {
 365                unsigned int count, i;
 366                pages->mapping = NULL;
 367                set_page_private(pages, order);
 368                count = 1 << order;
 369                for (i = 0; i < count; i++)
 370                        SetPageReserved(pages + i);
 371        }
 372
 373        return pages;
 374}
 375
 376static void kimage_free_pages(struct page *page)
 377{
 378        unsigned int order, count, i;
 379
 380        order = page_private(page);
 381        count = 1 << order;
 382        for (i = 0; i < count; i++)
 383                ClearPageReserved(page + i);
 384        __free_pages(page, order);
 385}
 386
 387static void kimage_free_page_list(struct list_head *list)
 388{
 389        struct list_head *pos, *next;
 390
 391        list_for_each_safe(pos, next, list) {
 392                struct page *page;
 393
 394                page = list_entry(pos, struct page, lru);
 395                list_del(&page->lru);
 396                kimage_free_pages(page);
 397        }
 398}
 399
 400static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 401                                                        unsigned int order)
 402{
 403        /* Control pages are special, they are the intermediaries
 404         * that are needed while we copy the rest of the pages
 405         * to their final resting place.  As such they must
 406         * not conflict with either the destination addresses
 407         * or memory the kernel is already using.
 408         *
 409         * The only case where we really need more than one of
 410         * these is for architectures where we cannot disable
 411         * the MMU and must instead generate an identity mapped
 412         * page table for all of the memory.
 413         *
 414         * At worst this runs in O(N) of the image size.
 415         */
 416        struct list_head extra_pages;
 417        struct page *pages;
 418        unsigned int count;
 419
 420        count = 1 << order;
 421        INIT_LIST_HEAD(&extra_pages);
 422
 423        /* Loop while I can allocate a page and the page allocated
 424         * is a destination page.
 425         */
 426        do {
 427                unsigned long pfn, epfn, addr, eaddr;
 428
 429                pages = kimage_alloc_pages(GFP_KERNEL, order);
 430                if (!pages)
 431                        break;
 432                pfn   = page_to_pfn(pages);
 433                epfn  = pfn + count;
 434                addr  = pfn << PAGE_SHIFT;
 435                eaddr = epfn << PAGE_SHIFT;
 436                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 437                              kimage_is_destination_range(image, addr, eaddr)) {
 438                        list_add(&pages->lru, &extra_pages);
 439                        pages = NULL;
 440                }
 441        } while (!pages);
 442
 443        if (pages) {
 444                /* Remember the allocated page... */
 445                list_add(&pages->lru, &image->control_pages);
 446
 447                /* Because the page is already in its destination
 448                 * location we will never allocate another page at
 449                 * that address.  Therefore kimage_alloc_pages
 450                 * will not return it (again) and we don't need
 451                 * to give it an entry in image->segment[].
 452                 */
 453        }
 454        /* Deal with the destination pages I have inadvertently allocated.
 455         *
 456         * Ideally I would convert multi-page allocations into single
 457         * page allocations, and add everything to image->dest_pages.
 458         *
 459         * For now it is simpler to just free the pages.
 460         */
 461        kimage_free_page_list(&extra_pages);
 462
 463        return pages;
 464}
 465
 466static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 467                                                      unsigned int order)
 468{
 469        /* Control pages are special, they are the intermediaries
 470         * that are needed while we copy the rest of the pages
 471         * to their final resting place.  As such they must
 472         * not conflict with either the destination addresses
 473         * or memory the kernel is already using.
 474         *
 475         * Control pages are also the only pages we must allocate
 476         * when loading a crash kernel.  All of the other pages
 477         * are specified by the segments and we just memcpy
 478         * into them directly.
 479         *
 480         * The only case where we really need more than one of
 481         * these is for architectures where we cannot disable
 482         * the MMU and must instead generate an identity mapped
 483         * page table for all of the memory.
 484         *
 485         * Given the low demand this implements a very simple
 486         * allocator that finds the first hole of the appropriate
 487         * size in the reserved memory region, and allocates all
 488         * of the memory up to and including the hole.
 489         */
 490        unsigned long hole_start, hole_end, size;
 491        struct page *pages;
 492
 493        pages = NULL;
 494        size = (1 << order) << PAGE_SHIFT;
 495        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 496        hole_end   = hole_start + size - 1;
 497        while (hole_end <= crashk_res.end) {
 498                unsigned long i;
 499
 500                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 501                        break;
 502                if (hole_end > crashk_res.end)
 503                        break;
 504                /* See if I overlap any of the segments */
 505                for (i = 0; i < image->nr_segments; i++) {
 506                        unsigned long mstart, mend;
 507
 508                        mstart = image->segment[i].mem;
 509                        mend   = mstart + image->segment[i].memsz - 1;
 510                        if ((hole_end >= mstart) && (hole_start <= mend)) {
 511                                /* Advance the hole to the end of the segment */
 512                                hole_start = (mend + (size - 1)) & ~(size - 1);
 513                                hole_end   = hole_start + size - 1;
 514                                break;
 515                        }
 516                }
 517                /* If I don't overlap any segments I have found my hole! */
 518                if (i == image->nr_segments) {
 519                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 520                        break;
 521                }
 522        }
 523        if (pages)
 524                image->control_page = hole_end;
 525
 526        return pages;
 527}
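/*
 * Worked example for the hole arithmetic above (illustration only,
 * assuming a 4K PAGE_SIZE): with order = 1 the hole size is 0x2000, so
 * for image->control_page == 0x01001234:
 *
 *	hole_start = (0x01001234 + 0x1fff) & ~0x1fff = 0x01002000
 *	hole_end   = hole_start + 0x2000 - 1         = 0x01003fff
 *
 * i.e. the mask trick relies on the hole size being a power of two.
 */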
 528
 529
 530struct page *kimage_alloc_control_pages(struct kimage *image,
 531                                         unsigned int order)
 532{
 533        struct page *pages = NULL;
 534
 535        switch (image->type) {
 536        case KEXEC_TYPE_DEFAULT:
 537                pages = kimage_alloc_normal_control_pages(image, order);
 538                break;
 539        case KEXEC_TYPE_CRASH:
 540                pages = kimage_alloc_crash_control_pages(image, order);
 541                break;
 542        }
 543
 544        return pages;
 545}
 546
 547static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 548{
 549        if (*image->entry != 0)
 550                image->entry++;
 551
 552        if (image->entry == image->last_entry) {
 553                kimage_entry_t *ind_page;
 554                struct page *page;
 555
 556                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 557                if (!page)
 558                        return -ENOMEM;
 559
 560                ind_page = page_address(page);
 561                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 562                image->entry = ind_page;
 563                image->last_entry = ind_page +
 564                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 565        }
 566        *image->entry = entry;
 567        image->entry++;
 568        *image->entry = 0;
 569
 570        return 0;
 571}
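/*
 * Illustration only: after a segment has been loaded, the list rooted at
 * image->head might look like
 *
 *	head:        first_page|IND_INDIRECTION
 *	first_page:  dest|IND_DESTINATION, src|IND_SOURCE, src|IND_SOURCE,
 *	             ..., next_page|IND_INDIRECTION
 *	next_page:   ..., IND_DONE
 *
 * where each indirection entry chains to the next page of entries and the
 * final IND_DONE entry is written by kimage_terminate() below.
 */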
 572
 573static int kimage_set_destination(struct kimage *image,
 574                                   unsigned long destination)
 575{
 576        int result;
 577
 578        destination &= PAGE_MASK;
 579        result = kimage_add_entry(image, destination | IND_DESTINATION);
 580        if (result == 0)
 581                image->destination = destination;
 582
 583        return result;
 584}
 585
 586
 587static int kimage_add_page(struct kimage *image, unsigned long page)
 588{
 589        int result;
 590
 591        page &= PAGE_MASK;
 592        result = kimage_add_entry(image, page | IND_SOURCE);
 593        if (result == 0)
 594                image->destination += PAGE_SIZE;
 595
 596        return result;
 597}
 598
 599
 600static void kimage_free_extra_pages(struct kimage *image)
 601{
 602        /* Walk through and free any extra destination pages I may have */
 603        kimage_free_page_list(&image->dest_pages);
 604
 605        /* Walk through and free any unusable pages I have cached */
 606        kimage_free_page_list(&image->unuseable_pages);
 607}
 608
 609static void kimage_terminate(struct kimage *image)
 610{
 611        if (*image->entry != 0)
 612                image->entry++;
 613
 614        *image->entry = IND_DONE;
 615}
 616
 617#define for_each_kimage_entry(image, ptr, entry) \
 618        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 619                ptr = (entry & IND_INDIRECTION)? \
 620                        phys_to_virt((entry & PAGE_MASK)): ptr +1)
 621
 622static void kimage_free_entry(kimage_entry_t entry)
 623{
 624        struct page *page;
 625
 626        page = pfn_to_page(entry >> PAGE_SHIFT);
 627        kimage_free_pages(page);
 628}
 629
 630static void kimage_free(struct kimage *image)
 631{
 632        kimage_entry_t *ptr, entry;
 633        kimage_entry_t ind = 0;
 634
 635        if (!image)
 636                return;
 637
 638        kimage_free_extra_pages(image);
 639        for_each_kimage_entry(image, ptr, entry) {
 640                if (entry & IND_INDIRECTION) {
 641                        /* Free the previous indirection page */
 642                        if (ind & IND_INDIRECTION)
 643                                kimage_free_entry(ind);
 644                        /* Save this indirection page until we are
 645                         * done with it.
 646                         */
 647                        ind = entry;
 648                }
 649                else if (entry & IND_SOURCE)
 650                        kimage_free_entry(entry);
 651        }
 652        /* Free the final indirection page */
 653        if (ind & IND_INDIRECTION)
 654                kimage_free_entry(ind);
 655
 656        /* Handle any machine specific cleanup */
 657        machine_kexec_cleanup(image);
 658
 659        /* Free the kexec control pages... */
 660        kimage_free_page_list(&image->control_pages);
 661        kfree(image);
 662}
 663
 664static kimage_entry_t *kimage_dst_used(struct kimage *image,
 665                                        unsigned long page)
 666{
 667        kimage_entry_t *ptr, entry;
 668        unsigned long destination = 0;
 669
 670        for_each_kimage_entry(image, ptr, entry) {
 671                if (entry & IND_DESTINATION)
 672                        destination = entry & PAGE_MASK;
 673                else if (entry & IND_SOURCE) {
 674                        if (page == destination)
 675                                return ptr;
 676                        destination += PAGE_SIZE;
 677                }
 678        }
 679
 680        return NULL;
 681}
 682
 683static struct page *kimage_alloc_page(struct kimage *image,
 684                                        gfp_t gfp_mask,
 685                                        unsigned long destination)
 686{
 687        /*
 688         * Here we implement safeguards to ensure that a source page
 689         * is not copied to its destination page before the data on
 690         * the destination page is no longer useful.
 691         *
 692         * To do this we maintain the invariant that a source page is
 693         * either its own destination page, or it is not a
 694         * destination page at all.
 695         *
 696         * That is slightly stronger than required, but the proof
 697         * that no problems will occur is trivial, and the
 698         * implementation is simple to verify.
 699         *
 700         * When allocating all pages normally this algorithm will run
 701         * in O(N) time, but in the worst case it will run in O(N^2)
 702         * time.   If the runtime is a problem the data structures can
 703         * be fixed.
 704         */
 705        struct page *page;
 706        unsigned long addr;
 707
 708        /*
 709         * Walk through the list of destination pages, and see if I
 710         * have a match.
 711         */
 712        list_for_each_entry(page, &image->dest_pages, lru) {
 713                addr = page_to_pfn(page) << PAGE_SHIFT;
 714                if (addr == destination) {
 715                        list_del(&page->lru);
 716                        return page;
 717                }
 718        }
 719        page = NULL;
 720        while (1) {
 721                kimage_entry_t *old;
 722
 723                /* Allocate a page, if we run out of memory give up */
 724                page = kimage_alloc_pages(gfp_mask, 0);
 725                if (!page)
 726                        return NULL;
 727                /* If the page cannot be used, file it away */
 728                if (page_to_pfn(page) >
 729                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 730                        list_add(&page->lru, &image->unuseable_pages);
 731                        continue;
 732                }
 733                addr = page_to_pfn(page) << PAGE_SHIFT;
 734
 735                /* If it is the destination page we want, use it */
 736                if (addr == destination)
 737                        break;
 738
 739                /* If the page is not a destination page use it */
 740                if (!kimage_is_destination_range(image, addr,
 741                                                  addr + PAGE_SIZE))
 742                        break;
 743
 744                /*
 745                 * I know that the page is someone's destination page.
 746                 * See if there is already a source page for this
 747                 * destination page.  And if so swap the source pages.
 748                 */
 749                old = kimage_dst_used(image, addr);
 750                if (old) {
 751                        /* If so move it */
 752                        unsigned long old_addr;
 753                        struct page *old_page;
 754
 755                        old_addr = *old & PAGE_MASK;
 756                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 757                        copy_highpage(page, old_page);
 758                        *old = addr | (*old & ~PAGE_MASK);
 759
 760                        /* The old page I have found cannot be a
 761                         * destination page, so return it if its
 762                         * gfp_flags honor the ones passed in.
 763                         */
 764                        if (!(gfp_mask & __GFP_HIGHMEM) &&
 765                            PageHighMem(old_page)) {
 766                                kimage_free_pages(old_page);
 767                                continue;
 768                        }
 769                        addr = old_addr;
 770                        page = old_page;
 771                        break;
 772                }
 773                else {
 774                        /* Place the page on the destination list; I
 775                         * will use it later.
 776                         */
 777                        list_add(&page->lru, &image->dest_pages);
 778                }
 779        }
 780
 781        return page;
 782}
 783
 784static int kimage_load_normal_segment(struct kimage *image,
 785                                         struct kexec_segment *segment)
 786{
 787        unsigned long maddr;
 788        unsigned long ubytes, mbytes;
 789        int result;
 790        unsigned char __user *buf;
 791
 792        result = 0;
 793        buf = segment->buf;
 794        ubytes = segment->bufsz;
 795        mbytes = segment->memsz;
 796        maddr = segment->mem;
 797
 798        result = kimage_set_destination(image, maddr);
 799        if (result < 0)
 800                goto out;
 801
 802        while (mbytes) {
 803                struct page *page;
 804                char *ptr;
 805                size_t uchunk, mchunk;
 806
 807                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 808                if (!page) {
 809                        result  = -ENOMEM;
 810                        goto out;
 811                }
 812                result = kimage_add_page(image, page_to_pfn(page)
 813                                                                << PAGE_SHIFT);
 814                if (result < 0)
 815                        goto out;
 816
 817                ptr = kmap(page);
 818                /* Start with a clear page */
 819                clear_page(ptr);
 820                ptr += maddr & ~PAGE_MASK;
 821                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 822                if (mchunk > mbytes)
 823                        mchunk = mbytes;
 824
 825                uchunk = mchunk;
 826                if (uchunk > ubytes)
 827                        uchunk = ubytes;
 828
 829                result = copy_from_user(ptr, buf, uchunk);
 830                kunmap(page);
 831                if (result) {
 832                        result = -EFAULT;
 833                        goto out;
 834                }
 835                ubytes -= uchunk;
 836                maddr  += mchunk;
 837                buf    += mchunk;
 838                mbytes -= mchunk;
 839        }
 840out:
 841        return result;
 842}
 843
 844static int kimage_load_crash_segment(struct kimage *image,
 845                                        struct kexec_segment *segment)
 846{
 847        /* For crash dump kernels we simply copy the data from
 848         * user space to its destination.
 849         * We do things a page at a time for the sake of kmap.
 850         */
 851        unsigned long maddr;
 852        unsigned long ubytes, mbytes;
 853        int result;
 854        unsigned char __user *buf;
 855
 856        result = 0;
 857        buf = segment->buf;
 858        ubytes = segment->bufsz;
 859        mbytes = segment->memsz;
 860        maddr = segment->mem;
 861        while (mbytes) {
 862                struct page *page;
 863                char *ptr;
 864                size_t uchunk, mchunk;
 865
 866                page = pfn_to_page(maddr >> PAGE_SHIFT);
 867                if (!page) {
 868                        result  = -ENOMEM;
 869                        goto out;
 870                }
 871                ptr = kmap(page);
 872                ptr += maddr & ~PAGE_MASK;
 873                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 874                if (mchunk > mbytes)
 875                        mchunk = mbytes;
 876
 877                uchunk = mchunk;
 878                if (uchunk > ubytes) {
 879                        uchunk = ubytes;
 880                        /* Zero the trailing part of the page */
 881                        memset(ptr + uchunk, 0, mchunk - uchunk);
 882                }
 883                result = copy_from_user(ptr, buf, uchunk);
 884                kexec_flush_icache_page(page);
 885                kunmap(page);
 886                if (result) {
 887                        result = -EFAULT;
 888                        goto out;
 889                }
 890                ubytes -= uchunk;
 891                maddr  += mchunk;
 892                buf    += mchunk;
 893                mbytes -= mchunk;
 894        }
 895out:
 896        return result;
 897}
 898
 899static int kimage_load_segment(struct kimage *image,
 900                                struct kexec_segment *segment)
 901{
 902        int result = -ENOMEM;
 903
 904        switch (image->type) {
 905        case KEXEC_TYPE_DEFAULT:
 906                result = kimage_load_normal_segment(image, segment);
 907                break;
 908        case KEXEC_TYPE_CRASH:
 909                result = kimage_load_crash_segment(image, segment);
 910                break;
 911        }
 912
 913        return result;
 914}
 915
 916/*
 917 * Exec Kernel system call: for obvious reasons only root may call it.
 918 *
 919 * This call breaks up into three pieces.
 920 * - A generic part which loads the new kernel from the current
 921 *   address space, and very carefully places the data in the
 922 *   allocated pages.
 923 *
 924 * - A generic part that interacts with the kernel and tells all of
 925 *   the devices to shut down, preventing ongoing DMAs and placing
 926 *   the devices in a consistent state so a later kernel can
 927 *   reinitialize them.
 928 *
 929 * - A machine specific part that includes the syscall number
 930 *   and then copies the image to its final destination, and
 931 *   jumps into the image at the entry point.
 932 *
 933 * kexec does not sync or unmount filesystems, so if you need
 934 * that to happen you need to do it yourself.
 935 */
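/*
 * Illustration only (user-space view, not part of this file): a loader
 * such as kexec-tools fills an array of struct kexec_segment and invokes
 * the raw syscall, roughly:
 *
 *	struct kexec_segment seg = {
 *		.buf   = image_buf,                     /* caller memory */
 *		.bufsz = image_len,
 *		.mem   = 0x100000,                      /* page aligned  */
 *		.memsz = (image_len + 4095) & ~4095UL,
 *	};
 *	syscall(__NR_kexec_load, entry_addr, 1, &seg, KEXEC_ARCH_DEFAULT);
 *
 * The checks in sys_kexec_load() below reject unaligned, overlapping or
 * out-of-range segments.
 */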
 936struct kimage *kexec_image;
 937struct kimage *kexec_crash_image;
 938
 939static DEFINE_MUTEX(kexec_mutex);
 940
 941SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 942                struct kexec_segment __user *, segments, unsigned long, flags)
 943{
 944        struct kimage **dest_image, *image;
 945        int result;
 946
 947        /* We only trust the superuser with rebooting the system. */
 948        if (!capable(CAP_SYS_BOOT))
 949                return -EPERM;
 950
 951        /*
 952         * Verify we have a legal set of flags.
 953         * This leaves us room for future extensions.
 954         */
 955        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 956                return -EINVAL;
 957
 958        /* Verify we are on the appropriate architecture */
 959        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 960                ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 961                return -EINVAL;
 962
 963        /* Put an artificial cap on the number
 964         * of segments passed to kexec_load.
 965         */
 966        if (nr_segments > KEXEC_SEGMENT_MAX)
 967                return -EINVAL;
 968
 969        image = NULL;
 970        result = 0;
 971
 972        /* Because we write directly to the reserved memory
 973         * region when loading crash kernels we need a mutex here to
 974         * prevent multiple crash kernels from attempting to load
 975         * simultaneously, and to prevent a crash kernel from loading
 976         * over the top of an in-use crash kernel.
 977         *
 978         * KISS: always take the mutex.
 979         */
 980        if (!mutex_trylock(&kexec_mutex))
 981                return -EBUSY;
 982
 983        dest_image = &kexec_image;
 984        if (flags & KEXEC_ON_CRASH)
 985                dest_image = &kexec_crash_image;
 986        if (nr_segments > 0) {
 987                unsigned long i;
 988
 989                /* Loading another kernel to reboot into */
 990                if ((flags & KEXEC_ON_CRASH) == 0)
 991                        result = kimage_normal_alloc(&image, entry,
 992                                                        nr_segments, segments);
 993                /* Loading another kernel to switch to if this one crashes */
 994                else if (flags & KEXEC_ON_CRASH) {
 995                        /* Free any current crash dump kernel before
 996                         * we corrupt it.
 997                         */
 998                        kimage_free(xchg(&kexec_crash_image, NULL));
 999                        result = kimage_crash_alloc(&image, entry,
1000                                                     nr_segments, segments);
1001                }
1002                if (result)
1003                        goto out;
1004
1005                if (flags & KEXEC_PRESERVE_CONTEXT)
1006                        image->preserve_context = 1;
1007                result = machine_kexec_prepare(image);
1008                if (result)
1009                        goto out;
1010
1011                for (i = 0; i < nr_segments; i++) {
1012                        result = kimage_load_segment(image, &image->segment[i]);
1013                        if (result)
1014                                goto out;
1015                }
1016                kimage_terminate(image);
1017        }
1018        /* Install the new kernel and uninstall the old */
1019        image = xchg(dest_image, image);
1020
1021out:
1022        mutex_unlock(&kexec_mutex);
1023        kimage_free(image);
1024
1025        return result;
1026}
1027
1028#ifdef CONFIG_COMPAT
1029asmlinkage long compat_sys_kexec_load(unsigned long entry,
1030                                unsigned long nr_segments,
1031                                struct compat_kexec_segment __user *segments,
1032                                unsigned long flags)
1033{
1034        struct compat_kexec_segment in;
1035        struct kexec_segment out, __user *ksegments;
1036        unsigned long i, result;
1037
1038        /* Don't allow clients that don't understand the native
1039         * architecture to do anything.
1040         */
1041        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1042                return -EINVAL;
1043
1044        if (nr_segments > KEXEC_SEGMENT_MAX)
1045                return -EINVAL;
1046
1047        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1048        for (i=0; i < nr_segments; i++) {
1049                result = copy_from_user(&in, &segments[i], sizeof(in));
1050                if (result)
1051                        return -EFAULT;
1052
1053                out.buf   = compat_ptr(in.buf);
1054                out.bufsz = in.bufsz;
1055                out.mem   = in.mem;
1056                out.memsz = in.memsz;
1057
1058                result = copy_to_user(&ksegments[i], &out, sizeof(out));
1059                if (result)
1060                        return -EFAULT;
1061        }
1062
1063        return sys_kexec_load(entry, nr_segments, ksegments, flags);
1064}
1065#endif
1066
1067void crash_kexec(struct pt_regs *regs)
1068{
1069        /* Take the kexec_mutex here to prevent sys_kexec_load
1070         * running on one cpu from replacing the crash kernel
1071         * we are using after a panic on a different cpu.
1072         *
1073         * If the crash kernel was not located in a fixed area
1074         * of memory the xchg(&kexec_crash_image) would be
1075         * sufficient.  But since I reuse the memory...
1076         */
1077        if (mutex_trylock(&kexec_mutex)) {
1078                if (kexec_crash_image) {
1079                        struct pt_regs fixed_regs;
1080
1081                        kmsg_dump(KMSG_DUMP_KEXEC);
1082
1083                        crash_setup_regs(&fixed_regs, regs);
1084                        crash_save_vmcoreinfo();
1085                        machine_crash_shutdown(&fixed_regs);
1086                        machine_kexec(kexec_crash_image);
1087                }
1088                mutex_unlock(&kexec_mutex);
1089        }
1090}
1091
1092size_t crash_get_memory_size(void)
1093{
1094        size_t size = 0;
1095        mutex_lock(&kexec_mutex);
1096        if (crashk_res.end != crashk_res.start)
1097                size = crashk_res.end - crashk_res.start + 1;
1098        mutex_unlock(&kexec_mutex);
1099        return size;
1100}
1101
1102static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1103{
1104        unsigned long addr;
1105
1106        for (addr = begin; addr < end; addr += PAGE_SIZE) {
1107                ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1108                init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1109                free_page((unsigned long)__va(addr));
1110                totalram_pages++;
1111        }
1112}
1113
1114int crash_shrink_memory(unsigned long new_size)
1115{
1116        int ret = 0;
1117        unsigned long start, end;
1118
1119        mutex_lock(&kexec_mutex);
1120
1121        if (kexec_crash_image) {
1122                ret = -ENOENT;
1123                goto unlock;
1124        }
1125        start = crashk_res.start;
1126        end = crashk_res.end;
1127
1128        if (new_size >= end - start + 1) {
1129                ret = -EINVAL;
1130                if (new_size == end - start + 1)
1131                        ret = 0;
1132                goto unlock;
1133        }
1134
1135        start = roundup(start, PAGE_SIZE);
1136        end = roundup(start + new_size, PAGE_SIZE);
1137
1138        free_reserved_phys_range(end, crashk_res.end);
1139
1140        if ((start == end) && (crashk_res.parent != NULL))
1141                release_resource(&crashk_res);
1142        crashk_res.end = end - 1;
1143
1144unlock:
1145        mutex_unlock(&kexec_mutex);
1146        return ret;
1147}
1148
1149static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1150                            size_t data_len)
1151{
1152        struct elf_note note;
1153
1154        note.n_namesz = strlen(name) + 1;
1155        note.n_descsz = data_len;
1156        note.n_type   = type;
1157        memcpy(buf, &note, sizeof(note));
1158        buf += (sizeof(note) + 3)/4;
1159        memcpy(buf, name, note.n_namesz);
1160        buf += (note.n_namesz + 3)/4;
1161        memcpy(buf, data, note.n_descsz);
1162        buf += (note.n_descsz + 3)/4;
1163
1164        return buf;
1165}
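/*
 * Size sketch (illustration only, not compiled): one note written by
 * append_elf_note() occupies the 12-byte header plus the name and data,
 * each padded up to a 4-byte boundary:
 */
#if 0
static size_t example_elf_note_bytes(const char *name, size_t data_len)
{
        return sizeof(struct elf_note) +
               ALIGN(strlen(name) + 1, 4) +
               ALIGN(data_len, 4);
}
#endif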
1166
1167static void final_note(u32 *buf)
1168{
1169        struct elf_note note;
1170
1171        note.n_namesz = 0;
1172        note.n_descsz = 0;
1173        note.n_type   = 0;
1174        memcpy(buf, &note, sizeof(note));
1175}
1176
1177void crash_save_cpu(struct pt_regs *regs, int cpu)
1178{
1179        struct elf_prstatus prstatus;
1180        u32 *buf;
1181
1182        if ((cpu < 0) || (cpu >= nr_cpu_ids))
1183                return;
1184
1185        /* Using ELF notes here is opportunistic.
1186         * I need a well-defined structure format
1187         * for the data I pass, and I need tags
1188         * on the data to indicate what information I have
1189         * squirrelled away.  ELF notes happen to provide
1190         * all of that, so there is no need to invent something new.
1191         */
1192        buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1193        if (!buf)
1194                return;
1195        memset(&prstatus, 0, sizeof(prstatus));
1196        prstatus.pr_pid = current->pid;
1197        elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1198        buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1199                              &prstatus, sizeof(prstatus));
1200        final_note(buf);
1201}
1202
1203static int __init crash_notes_memory_init(void)
1204{
1205        /* Allocate memory for saving cpu registers. */
1206        crash_notes = alloc_percpu(note_buf_t);
1207        if (!crash_notes) {
1208                printk("Kexec: Memory allocation for saving cpu register"
1209                " states failed\n");
1210                return -ENOMEM;
1211        }
1212        return 0;
1213}
1214module_init(crash_notes_memory_init)
1215
1216
1217/*
1218 * parsing the "crashkernel" commandline
1219 *
1220 * this code is intended to be called from architecture specific code
1221 */
1222
1223
1224/*
1225 * This function parses command lines in the format
1226 *
1227 *   crashkernel=ramsize-range:size[,...][@offset]
1228 *
1229 * The function returns 0 on success and -EINVAL on failure.
1230 */
1231static int __init parse_crashkernel_mem(char                    *cmdline,
1232                                        unsigned long long      system_ram,
1233                                        unsigned long long      *crash_size,
1234                                        unsigned long long      *crash_base)
1235{
1236        char *cur = cmdline, *tmp;
1237
1238        /* for each entry of the comma-separated list */
1239        do {
1240                unsigned long long start, end = ULLONG_MAX, size;
1241
1242                /* get the start of the range */
1243                start = memparse(cur, &tmp);
1244                if (cur == tmp) {
1245                        pr_warning("crashkernel: Memory value expected\n");
1246                        return -EINVAL;
1247                }
1248                cur = tmp;
1249                if (*cur != '-') {
1250                        pr_warning("crashkernel: '-' expected\n");
1251                        return -EINVAL;
1252                }
1253                cur++;
1254
1255                /* if no ':' is here, then we read the end */
1256                if (*cur != ':') {
1257                        end = memparse(cur, &tmp);
1258                        if (cur == tmp) {
1259                                pr_warning("crashkernel: Memory "
1260                                                "value expected\n");
1261                                return -EINVAL;
1262                        }
1263                        cur = tmp;
1264                        if (end <= start) {
1265                                pr_warning("crashkernel: end <= start\n");
1266                                return -EINVAL;
1267                        }
1268                }
1269
1270                if (*cur != ':') {
1271                        pr_warning("crashkernel: ':' expected\n");
1272                        return -EINVAL;
1273                }
1274                cur++;
1275
1276                size = memparse(cur, &tmp);
1277                if (cur == tmp) {
1278                        pr_warning("Memory value expected\n");
1279                        return -EINVAL;
1280                }
1281                cur = tmp;
1282                if (size >= system_ram) {
1283                        pr_warning("crashkernel: invalid size\n");
1284                        return -EINVAL;
1285                }
1286
1287                /* match ? */
1288                if (system_ram >= start && system_ram < end) {
1289                        *crash_size = size;
1290                        break;
1291                }
1292        } while (*cur++ == ',');
1293
1294        if (*crash_size > 0) {
1295                while (*cur && *cur != ' ' && *cur != '@')
1296                        cur++;
1297                if (*cur == '@') {
1298                        cur++;
1299                        *crash_base = memparse(cur, &tmp);
1300                        if (cur == tmp) {
1301                                pr_warning("Memory value expected "
1302                                                "after '@'\n");
1303                                return -EINVAL;
1304                        }
1305                }
1306        }
1307
1308        return 0;
1309}
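/*
 * Worked example (illustration only): for
 *
 *	crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * a machine with 1G of RAM matches the 512M-2G range, so *crash_size
 * becomes 64M; with 4G of RAM the open-ended "2G-" range matches and
 * *crash_size becomes 128M.  In either case the trailing @16M sets
 * *crash_base to 16M.
 */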
1310
1311/*
1312 * This function parses "simple" (old) crashkernel command lines like
1313 *
1314 *      crashkernel=size[@offset]
1315 *
1316 * It returns 0 on success and -EINVAL on failure.
1317 */
1318static int __init parse_crashkernel_simple(char                 *cmdline,
1319                                           unsigned long long   *crash_size,
1320                                           unsigned long long   *crash_base)
1321{
1322        char *cur = cmdline;
1323
1324        *crash_size = memparse(cmdline, &cur);
1325        if (cmdline == cur) {
1326                pr_warning("crashkernel: memory value expected\n");
1327                return -EINVAL;
1328        }
1329
1330        if (*cur == '@')
1331                *crash_base = memparse(cur+1, &cur);
1332
1333        return 0;
1334}
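/*
 * Example (illustration only): "crashkernel=128M@16M" yields
 * *crash_size == 128M and *crash_base == 16M; without the "@16M"
 * suffix, *crash_base is left at 0.
 */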
1335
1336/*
1337 * This function is the entry point for command line parsing and should be
1338 * called from the arch-specific code.
1339 */
1340int __init parse_crashkernel(char                *cmdline,
1341                             unsigned long long system_ram,
1342                             unsigned long long *crash_size,
1343                             unsigned long long *crash_base)
1344{
1345        char    *p = cmdline, *ck_cmdline = NULL;
1346        char    *first_colon, *first_space;
1347
1348        BUG_ON(!crash_size || !crash_base);
1349        *crash_size = 0;
1350        *crash_base = 0;
1351
1352        /* find crashkernel and use the last one if there is more than one */
1353        p = strstr(p, "crashkernel=");
1354        while (p) {
1355                ck_cmdline = p;
1356                p = strstr(p+1, "crashkernel=");
1357        }
1358
1359        if (!ck_cmdline)
1360                return -EINVAL;
1361
1362        ck_cmdline += 12; /* strlen("crashkernel=") */
1363
1364        /*
1365         * if the commandline contains a ':', then that's the extended
1366         * syntax -- if not, it must be the classic syntax
1367         */
1368        first_colon = strchr(ck_cmdline, ':');
1369        first_space = strchr(ck_cmdline, ' ');
1370        if (first_colon && (!first_space || first_colon < first_space))
1371                return parse_crashkernel_mem(ck_cmdline, system_ram,
1372                                crash_size, crash_base);
1373        else
1374                return parse_crashkernel_simple(ck_cmdline, crash_size,
1375                                crash_base);
1376
1377        return 0;
1378}
1379
1380
1381
1382void crash_save_vmcoreinfo(void)
1383{
1384        u32 *buf;
1385
1386        if (!vmcoreinfo_size)
1387                return;
1388
1389        vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1390
1391        buf = (u32 *)vmcoreinfo_note;
1392
1393        buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1394                              vmcoreinfo_size);
1395
1396        final_note(buf);
1397}
1398
1399void vmcoreinfo_append_str(const char *fmt, ...)
1400{
1401        va_list args;
1402        char buf[0x50];
1403        int r;
1404
1405        va_start(args, fmt);
1406        r = vsnprintf(buf, sizeof(buf), fmt, args);
1407        va_end(args);
1408
1409        if (r + vmcoreinfo_size > vmcoreinfo_max_size)
1410                r = vmcoreinfo_max_size - vmcoreinfo_size;
1411
1412        memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1413
1414        vmcoreinfo_size += r;
1415}
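/*
 * Illustration only: the VMCOREINFO_* helpers in <linux/kexec.h> are thin
 * wrappers around vmcoreinfo_append_str(), so vmcoreinfo_data ends up as
 * newline-separated "KEY=value" text, e.g. (values are made up):
 *
 *	OSRELEASE=2.6.37
 *	PAGESIZE=4096
 *	SYMBOL(init_uts_ns)=ffffffff81a1b2c0
 *	OFFSET(page.flags)=0
 *
 * which tools such as makedumpfile read back out of the ELF note.
 */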
1416
1417/*
1418 * provide an empty default implementation here -- architecture
1419 * code may override this
1420 */
1421void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
1422{}
1423
1424unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1425{
1426        return __pa((unsigned long)(char *)&vmcoreinfo_note);
1427}
1428
1429static int __init crash_save_vmcoreinfo_init(void)
1430{
1431        VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1432        VMCOREINFO_PAGESIZE(PAGE_SIZE);
1433
1434        VMCOREINFO_SYMBOL(init_uts_ns);
1435        VMCOREINFO_SYMBOL(node_online_map);
1436        VMCOREINFO_SYMBOL(swapper_pg_dir);
1437        VMCOREINFO_SYMBOL(_stext);
1438        VMCOREINFO_SYMBOL(vmlist);
1439
1440#ifndef CONFIG_NEED_MULTIPLE_NODES
1441        VMCOREINFO_SYMBOL(mem_map);
1442        VMCOREINFO_SYMBOL(contig_page_data);
1443#endif
1444#ifdef CONFIG_SPARSEMEM
1445        VMCOREINFO_SYMBOL(mem_section);
1446        VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1447        VMCOREINFO_STRUCT_SIZE(mem_section);
1448        VMCOREINFO_OFFSET(mem_section, section_mem_map);
1449#endif
1450        VMCOREINFO_STRUCT_SIZE(page);
1451        VMCOREINFO_STRUCT_SIZE(pglist_data);
1452        VMCOREINFO_STRUCT_SIZE(zone);
1453        VMCOREINFO_STRUCT_SIZE(free_area);
1454        VMCOREINFO_STRUCT_SIZE(list_head);
1455        VMCOREINFO_SIZE(nodemask_t);
1456        VMCOREINFO_OFFSET(page, flags);
1457        VMCOREINFO_OFFSET(page, _count);
1458        VMCOREINFO_OFFSET(page, mapping);
1459        VMCOREINFO_OFFSET(page, lru);
1460        VMCOREINFO_OFFSET(pglist_data, node_zones);
1461        VMCOREINFO_OFFSET(pglist_data, nr_zones);
1462#ifdef CONFIG_FLAT_NODE_MEM_MAP
1463        VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1464#endif
1465        VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1466        VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1467        VMCOREINFO_OFFSET(pglist_data, node_id);
1468        VMCOREINFO_OFFSET(zone, free_area);
1469        VMCOREINFO_OFFSET(zone, vm_stat);
1470        VMCOREINFO_OFFSET(zone, spanned_pages);
1471        VMCOREINFO_OFFSET(free_area, free_list);
1472        VMCOREINFO_OFFSET(list_head, next);
1473        VMCOREINFO_OFFSET(list_head, prev);
1474        VMCOREINFO_OFFSET(vm_struct, addr);
1475        VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1476        log_buf_kexec_setup();
1477        VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1478        VMCOREINFO_NUMBER(NR_FREE_PAGES);
1479        VMCOREINFO_NUMBER(PG_lru);
1480        VMCOREINFO_NUMBER(PG_private);
1481        VMCOREINFO_NUMBER(PG_swapcache);
1482
1483        arch_crash_save_vmcoreinfo();
1484
1485        return 0;
1486}
1487
1488module_init(crash_save_vmcoreinfo_init)
1489
1490/*
1491 * Move into place and start executing a preloaded standalone
1492 * executable.  If nothing was preloaded return an error.
1493 */
1494int kernel_kexec(void)
1495{
1496        int error = 0;
1497
1498        if (!mutex_trylock(&kexec_mutex))
1499                return -EBUSY;
1500        if (!kexec_image) {
1501                error = -EINVAL;
1502                goto Unlock;
1503        }
1504
1505#ifdef CONFIG_KEXEC_JUMP
1506        if (kexec_image->preserve_context) {
1507                mutex_lock(&pm_mutex);
1508                pm_prepare_console();
1509                error = freeze_processes();
1510                if (error) {
1511                        error = -EBUSY;
1512                        goto Restore_console;
1513                }
1514                suspend_console();
1515                error = dpm_suspend_start(PMSG_FREEZE);
1516                if (error)
1517                        goto Resume_console;
1518                /* At this point, dpm_suspend_start() has been called,
1519                 * but *not* dpm_suspend_noirq(). We *must* call
1520                 * dpm_suspend_noirq() now.  Otherwise, drivers for
1521                 * some devices (e.g. interrupt controllers) become
1522                 * desynchronized with the actual state of the
1523                 * hardware at resume time, and evil weirdness ensues.
1524                 */
1525                error = dpm_suspend_noirq(PMSG_FREEZE);
1526                if (error)
1527                        goto Resume_devices;
1528                error = disable_nonboot_cpus();
1529                if (error)
1530                        goto Enable_cpus;
1531                local_irq_disable();
1532                /* Suspend system devices */
1533                error = sysdev_suspend(PMSG_FREEZE);
1534                if (error)
1535                        goto Enable_irqs;
1536        } else
1537#endif
1538        {
1539                kernel_restart_prepare(NULL);
1540                printk(KERN_EMERG "Starting new kernel\n");
1541                machine_shutdown();
1542        }
1543
1544        machine_kexec(kexec_image);
1545
1546#ifdef CONFIG_KEXEC_JUMP
1547        if (kexec_image->preserve_context) {
1548                sysdev_resume();
1549 Enable_irqs:
1550                local_irq_enable();
1551 Enable_cpus:
1552                enable_nonboot_cpus();
1553                dpm_resume_noirq(PMSG_RESTORE);
1554 Resume_devices:
1555                dpm_resume_end(PMSG_RESTORE);
1556 Resume_console:
1557                resume_console();
1558                thaw_processes();
1559 Restore_console:
1560                pm_restore_console();
1561                mutex_unlock(&pm_mutex);
1562        }
1563#endif
1564
1565 Unlock:
1566        mutex_unlock(&kexec_mutex);
1567        return error;
1568}
1569