linux/fs/proc/kcore.c
// SPDX-License-Identifier: GPL-2.0
/*
 *      fs/proc/kcore.c kernel ELF core dumper
 *
 *      Modelled on fs/exec.c:aout_core_dump()
 *      Jeremy Fitzhardinge <jeremy@sw.oz.au>
 *      ELF version written by David Howells <David.Howells@nexor.co.uk>
 *      Modified and incorporated into 2.3.x by Tigran Aivazian <tigran@veritas.com>
 *      Support to dump vmalloc'd areas (ELF only), Tigran Aivazian <tigran@veritas.com>
 *      Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
 */

#include <linux/crash_core.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/kcore.h>
#include <linux/user.h>
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
#include <linux/memory.h>
#include <linux/sched/task.h>
#include <asm/sections.h>
#include "internal.h"

#define CORE_STR "CORE"

#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif

static struct proc_dir_entry *proc_root_kcore;

#ifndef kc_vaddr_to_offset
#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
#endif
#ifndef kc_offset_to_vaddr
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif

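/*
 * List of memory regions exported through /proc/kcore, protected by
 * kclist_lock. kcore_need_update flags that the RAM entries are stale
 * (e.g. after memory hotplug) and must be rebuilt on the next open().
 */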
static LIST_HEAD(kclist_head);
static DECLARE_RWSEM(kclist_lock);
static int kcore_need_update = 1;

/*
 * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
 * Same as oldmem_pfn_is_ram in vmcore
 */
static int (*mem_pfn_is_ram)(unsigned long pfn);

int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
{
        if (mem_pfn_is_ram)
                return -EBUSY;
        mem_pfn_is_ram = fn;
        return 0;
}

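/* With no hook registered, conservatively treat every pfn as RAM. */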
static int pfn_is_ram(unsigned long pfn)
{
        if (mem_pfn_is_ram)
                return mem_pfn_is_ram(pfn);
        else
                return 1;
}

/* This doesn't grab kclist_lock, so it should only be used at init time. */
void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
                       int type)
{
        new->addr = (unsigned long)addr;
        new->size = size;
        new->type = type;

        list_add_tail(&new->list, &kclist_head);
}

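/*
 * Compute the total size of the virtual core file and, via the out
 * parameters, the sizes of its pieces. The file layout is:
 *
 *      ELF header | program headers | note segment | (page-aligned) data
 *
 * *data_offset is where the data starts; the total size is that offset
 * plus the largest kc_vaddr_to_offset() of any registered region.
 */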
static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
                             size_t *data_offset)
{
        size_t try, size;
        struct kcore_list *m;

        *nphdr = 1; /* PT_NOTE */
        size = 0;

        list_for_each_entry(m, &kclist_head, list) {
                try = kc_vaddr_to_offset((size_t)m->addr + m->size);
                if (try > size)
                        size = try;
                *nphdr = *nphdr + 1;
        }

        *phdrs_len = *nphdr * sizeof(struct elf_phdr);
        *notes_len = (4 * sizeof(struct elf_note) +
                      3 * ALIGN(sizeof(CORE_STR), 4) +
                      VMCOREINFO_NOTE_NAME_BYTES +
                      ALIGN(sizeof(struct elf_prstatus), 4) +
                      ALIGN(sizeof(struct elf_prpsinfo), 4) +
                      ALIGN(arch_task_struct_size, 4) +
                      ALIGN(vmcoreinfo_size, 4));
        *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
                                  *notes_len);
        return *data_offset + size;
}

#ifdef CONFIG_HIGHMEM
/*
 * With CONFIG_HIGHMEM, treat [0...max_low_pfn) as one continuous range of
 * memory: holes in low memory are small, and highmem itself is skipped
 * because part of it is _invisible_ from (not mapped into) the kernel.
 */
static int kcore_ram_list(struct list_head *head)
{
        struct kcore_list *ent;

        ent = kmalloc(sizeof(*ent), GFP_KERNEL);
        if (!ent)
                return -ENOMEM;
        ent->addr = (unsigned long)__va(0);
        ent->size = max_low_pfn << PAGE_SHIFT;
        ent->type = KCORE_RAM;
        list_add(&ent->list, head);
        return 0;
}

#else /* !CONFIG_HIGHMEM */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* calculate the vmemmap address range for a given RAM pfn range and register it */
static int
get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
        unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
        unsigned long nr_pages = ent->size >> PAGE_SHIFT;
        unsigned long start, end;
        struct kcore_list *vmm, *tmp;

        start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
        end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
        end = PAGE_ALIGN(end);
        /* overlap check (because we have to align pages) */
        list_for_each_entry(tmp, head, list) {
                if (tmp->type != KCORE_VMEMMAP)
                        continue;
                if (start < tmp->addr + tmp->size)
                        if (end > tmp->addr)
                                end = tmp->addr;
        }
        if (start < end) {
                vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
                if (!vmm)
                        return 0;
                vmm->addr = start;
                vmm->size = end - start;
                vmm->type = KCORE_VMEMMAP;
                list_add_tail(&vmm->list, head);
        }
        return 1;
}
#else
static int
get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
        return 1;
}
#endif

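/*
 * walk_system_ram_range() callback: wrap one contiguous RAM range in a
 * KCORE_RAM entry and register the vmemmap that backs it. A nonzero
 * return stops the walk, which kcore_ram_list() reports as an error.
 */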
static int
kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
{
        struct list_head *head = (struct list_head *)arg;
        struct kcore_list *ent;
        struct page *p;

        if (!pfn_valid(pfn))
                return 1;

        p = pfn_to_page(pfn);
        if (!memmap_valid_within(pfn, p, page_zone(p)))
                return 1;

        ent = kmalloc(sizeof(*ent), GFP_KERNEL);
        if (!ent)
                return -ENOMEM;
        ent->addr = (unsigned long)page_to_virt(p);
        ent->size = nr_pages << PAGE_SHIFT;

        if (!virt_addr_valid(ent->addr))
                goto free_out;

        /* cut off any not-mapped tail; ported from the ppc32 code */
        if (ULONG_MAX - ent->addr < ent->size)
                ent->size = ULONG_MAX - ent->addr;

        /*
         * We've already checked virt_addr_valid(), so we know ent->addr
         * is a valid pointer and can compare it against VMALLOC_START to
         * decide whether the region needs to be trimmed.
         */
        if (VMALLOC_START > ent->addr) {
                if (VMALLOC_START - ent->addr < ent->size)
                        ent->size = VMALLOC_START - ent->addr;
        }

        ent->type = KCORE_RAM;
        list_add_tail(&ent->list, head);

        if (!get_sparsemem_vmemmap_info(ent, head)) {
                list_del(&ent->list);
                goto free_out;
        }

        return 0;
free_out:
        kfree(ent);
        return 1;
}

static int kcore_ram_list(struct list_head *list)
{
        int nid, ret;
        unsigned long end_pfn;

        /* Not initialized yet; find the max pfn and update now. */
        end_pfn = 0;
        for_each_node_state(nid, N_MEMORY) {
                unsigned long node_end;
                node_end = node_end_pfn(nid);
                if (end_pfn < node_end)
                        end_pfn = node_end;
        }
        /* scan 0 to max_pfn */
        ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
        if (ret)
                return -ENOMEM;
        return 0;
}
#endif /* CONFIG_HIGHMEM */

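/*
 * Rebuild the KCORE_RAM/KCORE_VMEMMAP entries. A fresh list is built
 * first, then swapped in under the write lock so readers never see a
 * half-updated list; the stale entries are freed after the lock drops.
 */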
static int kcore_update_ram(void)
{
        LIST_HEAD(list);
        LIST_HEAD(garbage);
        int nphdr;
        size_t phdrs_len, notes_len, data_offset;
        struct kcore_list *tmp, *pos;
        int ret = 0;

        down_write(&kclist_lock);
        if (!xchg(&kcore_need_update, 0))
                goto out;

        ret = kcore_ram_list(&list);
        if (ret) {
                /* Couldn't get the RAM list, try again next time. */
                WRITE_ONCE(kcore_need_update, 1);
                list_splice_tail(&list, &garbage);
                goto out;
        }

        list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
                if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
                        list_move(&pos->list, &garbage);
        }
        list_splice_tail(&list, &kclist_head);

        proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
                                               &data_offset);

out:
        up_write(&kclist_lock);
        list_for_each_entry_safe(pos, tmp, &garbage, list) {
                list_del(&pos->list);
                kfree(pos);
        }
        return ret;
}

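/*
 * Emit one ELF note at offset *i in the notes buffer: the elf_note
 * header, then the NUL-terminated name, then the descriptor, with name
 * and descriptor each padded to the 4-byte alignment the ELF spec
 * requires. *i is advanced past the note.
 */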
static void append_kcore_note(char *notes, size_t *i, const char *name,
                              unsigned int type, const void *desc,
                              size_t descsz)
{
        struct elf_note *note = (struct elf_note *)&notes[*i];

        note->n_namesz = strlen(name) + 1;
        note->n_descsz = descsz;
        note->n_type = type;
        *i += sizeof(*note);
        memcpy(&notes[*i], name, note->n_namesz);
        *i = ALIGN(*i + note->n_namesz, 4);
        memcpy(&notes[*i], desc, descsz);
        *i = ALIGN(*i + descsz, 4);
}

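/*
 * read_kcore() serves the virtual file sequentially by offset: first the
 * ELF header, then the program headers, then the note segment, and from
 * data_offset onward the memory contents of whichever kcore_list entry
 * covers the translated virtual address. Offsets that match no entry,
 * or that fall on non-RAM pages, read back as zeroes.
 */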
static ssize_t
read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
        char *buf = file->private_data;
        size_t phdrs_offset, notes_offset, data_offset;
        size_t phdrs_len, notes_len;
        struct kcore_list *m;
        size_t tsz;
        int nphdr;
        unsigned long start;
        size_t orig_buflen = buflen;
        int ret = 0;

        down_read(&kclist_lock);

        get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
        phdrs_offset = sizeof(struct elfhdr);
        notes_offset = phdrs_offset + phdrs_len;

        /* ELF file header. */
        if (buflen && *fpos < sizeof(struct elfhdr)) {
                struct elfhdr ehdr = {
                        .e_ident = {
                                [EI_MAG0] = ELFMAG0,
                                [EI_MAG1] = ELFMAG1,
                                [EI_MAG2] = ELFMAG2,
                                [EI_MAG3] = ELFMAG3,
                                [EI_CLASS] = ELF_CLASS,
                                [EI_DATA] = ELF_DATA,
                                [EI_VERSION] = EV_CURRENT,
                                [EI_OSABI] = ELF_OSABI,
                        },
                        .e_type = ET_CORE,
                        .e_machine = ELF_ARCH,
                        .e_version = EV_CURRENT,
                        .e_phoff = sizeof(struct elfhdr),
                        .e_flags = ELF_CORE_EFLAGS,
                        .e_ehsize = sizeof(struct elfhdr),
                        .e_phentsize = sizeof(struct elf_phdr),
                        .e_phnum = nphdr,
                };

                tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
                if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
                        ret = -EFAULT;
                        goto out;
                }

                buffer += tsz;
                buflen -= tsz;
                *fpos += tsz;
        }

        /* ELF program headers. */
        if (buflen && *fpos < phdrs_offset + phdrs_len) {
                struct elf_phdr *phdrs, *phdr;

                phdrs = kzalloc(phdrs_len, GFP_KERNEL);
                if (!phdrs) {
                        ret = -ENOMEM;
                        goto out;
                }

                phdrs[0].p_type = PT_NOTE;
                phdrs[0].p_offset = notes_offset;
                phdrs[0].p_filesz = notes_len;

                phdr = &phdrs[1];
                list_for_each_entry(m, &kclist_head, list) {
                        phdr->p_type = PT_LOAD;
                        phdr->p_flags = PF_R | PF_W | PF_X;
                        phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
                        if (m->type == KCORE_REMAP)
                                phdr->p_vaddr = (size_t)m->vaddr;
                        else
                                phdr->p_vaddr = (size_t)m->addr;
                        if (m->type == KCORE_RAM || m->type == KCORE_REMAP)
                                phdr->p_paddr = __pa(m->addr);
                        else if (m->type == KCORE_TEXT)
                                phdr->p_paddr = __pa_symbol(m->addr);
                        else
                                phdr->p_paddr = (elf_addr_t)-1;
                        phdr->p_filesz = phdr->p_memsz = m->size;
                        phdr->p_align = PAGE_SIZE;
                        phdr++;
                }

                tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
                if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
                                 tsz)) {
                        kfree(phdrs);
                        ret = -EFAULT;
                        goto out;
                }
                kfree(phdrs);

                buffer += tsz;
                buflen -= tsz;
                *fpos += tsz;
        }

        /* ELF note segment. */
        if (buflen && *fpos < notes_offset + notes_len) {
                struct elf_prstatus prstatus = {};
                struct elf_prpsinfo prpsinfo = {
                        .pr_sname = 'R',
                        .pr_fname = "vmlinux",
                };
                char *notes;
                size_t i = 0;

                strlcpy(prpsinfo.pr_psargs, saved_command_line,
                        sizeof(prpsinfo.pr_psargs));

                notes = kzalloc(notes_len, GFP_KERNEL);
                if (!notes) {
                        ret = -ENOMEM;
                        goto out;
                }

                append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
                                  sizeof(prstatus));
                append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
                                  sizeof(prpsinfo));
                append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
                                  arch_task_struct_size);
                /*
                 * vmcoreinfo_size is mostly constant after init time, but it
                 * can be changed by crash_save_vmcoreinfo(). Racing here with a
                 * panic on another CPU before the machine goes down is insanely
                 * unlikely, but it's better to not leave potential buffer
                 * overflows lying around, regardless.
                 */
                append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
                                  vmcoreinfo_data,
                                  min(vmcoreinfo_size, notes_len - i));

                tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
                if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
                        kfree(notes);
                        ret = -EFAULT;
                        goto out;
                }
                kfree(notes);

                buffer += tsz;
                buflen -= tsz;
                *fpos += tsz;
        }

        /*
         * Check to see if our file offset matches with any of
         * the addresses in the elf_phdr on our list.
         */
        start = kc_offset_to_vaddr(*fpos - data_offset);
        if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
                tsz = buflen;

        m = NULL;
        while (buflen) {
                /*
                 * If this is the first iteration or the address is not within
                 * the previous entry, search for a matching entry.
                 */
                if (!m || start < m->addr || start >= m->addr + m->size) {
                        list_for_each_entry(m, &kclist_head, list) {
                                if (start >= m->addr &&
                                    start < m->addr + m->size)
                                        break;
                        }
                }

                if (&m->list == &kclist_head) {
                        if (clear_user(buffer, tsz)) {
                                ret = -EFAULT;
                                goto out;
                        }
                        m = NULL;       /* skip the list anchor */
                } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
                        if (clear_user(buffer, tsz)) {
                                ret = -EFAULT;
                                goto out;
                        }
                } else if (m->type == KCORE_VMALLOC) {
                        vread(buf, (char *)start, tsz);
                        /*
                         * vread() zero-fills holes in buf, so we must copy
                         * the full buffer out even if nothing was read.
                         */
                        if (copy_to_user(buffer, buf, tsz)) {
                                ret = -EFAULT;
                                goto out;
                        }
                } else if (m->type == KCORE_USER) {
                        /* User page is handled prior to normal kernel page: */
                        if (copy_to_user(buffer, (char *)start, tsz)) {
                                ret = -EFAULT;
                                goto out;
                        }
                } else {
                        if (kern_addr_valid(start)) {
                                /*
                                 * Use the bounce buffer to bypass the
                                 * hardened-usercopy kernel-text checks.
                                 */
                                if (probe_kernel_read(buf, (void *) start, tsz)) {
                                        if (clear_user(buffer, tsz)) {
                                                ret = -EFAULT;
                                                goto out;
                                        }
                                } else {
                                        if (copy_to_user(buffer, buf, tsz)) {
                                                ret = -EFAULT;
                                                goto out;
                                        }
                                }
                        } else {
                                if (clear_user(buffer, tsz)) {
                                        ret = -EFAULT;
                                        goto out;
                                }
                        }
                }
                buflen -= tsz;
                *fpos += tsz;
                buffer += tsz;
                start += tsz;
                tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
        }

out:
        up_read(&kclist_lock);
        if (ret)
                return ret;
        return orig_buflen - buflen;
}

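/*
 * Opening /proc/kcore requires CAP_SYS_RAWIO. Each open allocates a
 * one-page bounce buffer (freed in release_kcore()) and refreshes the
 * RAM list and the advertised inode size if hotplug made them stale.
 */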
static int open_kcore(struct inode *inode, struct file *filp)
{
        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;

        filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!filp->private_data)
                return -ENOMEM;

        if (kcore_need_update)
                kcore_update_ram();
        if (i_size_read(inode) != proc_root_kcore->size) {
                inode_lock(inode);
                i_size_write(inode, proc_root_kcore->size);
                inode_unlock(inode);
        }
        return 0;
}

static int release_kcore(struct inode *inode, struct file *file)
{
        kfree(file->private_data);
        return 0;
}

static const struct file_operations proc_kcore_operations = {
        .read           = read_kcore,
        .open           = open_kcore,
        .release        = release_kcore,
        .llseek         = default_llseek,
};

/* just remember that we have to update kcore */
static int __meminit kcore_callback(struct notifier_block *self,
                                    unsigned long action, void *arg)
{
        switch (action) {
        case MEM_ONLINE:
        case MEM_OFFLINE:
                kcore_need_update = 1;
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block kcore_callback_nb __meminitdata = {
        .notifier_call = kcore_callback,
        .priority = 0,
};

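/*
 * Entries below are registered once at init: the vmalloc range, an
 * optional dedicated kernel-text mapping, and an optional modules range.
 */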
static struct kcore_list kcore_vmalloc;

#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
static struct kcore_list kcore_text;
/*
 * If defined, the architecture maps kernel text through a dedicated
 * segment instead of the direct-map area, so we need to create a
 * special TEXT entry for it.
 */
static void __init proc_kcore_text_init(void)
{
        kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
}
#else
static void __init proc_kcore_text_init(void)
{
}
#endif

#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
/*
 * The module area [MODULES_VADDR, MODULES_END) does not intersect the
 * vmalloc area; when the two are distinct, export it separately.
 */
static struct kcore_list kcore_modules;
static void __init add_modules_range(void)
{
        if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
                kclist_add(&kcore_modules, (void *)MODULES_VADDR,
                        MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
        }
}
#else
static void __init add_modules_range(void)
{
}
#endif

static int __init proc_kcore_init(void)
{
        proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
                                      &proc_kcore_operations);
        if (!proc_root_kcore) {
                pr_err("couldn't create /proc/kcore\n");
                return 0; /* Always returns 0. */
        }
        /* Store text area if it's special */
        proc_kcore_text_init();
        /* Store vmalloc area */
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
        add_modules_range();
        /* Store direct-map area from physical memory map */
        kcore_update_ram();
        register_hotmemory_notifier(&kcore_callback_nb);

        return 0;
}
fs_initcall(proc_kcore_init);
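
/*
 * Usage sketch (not part of this file): /proc/kcore can be inspected with
 * any ELF-core-aware debugger, reading live kernel memory through the
 * PT_LOAD segments generated above. For example, assuming a vmlinux with
 * debug info is available at the path below:
 *
 *      # gdb /usr/lib/debug/boot/vmlinux-$(uname -r) /proc/kcore
 *      (gdb) p jiffies
 */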