qemu/util/oslib-posix.c
<<
>>
Prefs
   1/*
   2 * os-posix-lib.c
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2010 Red Hat, Inc.
   6 *
   7 * QEMU library functions on POSIX which are shared between QEMU and
   8 * the QEMU tools.
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include <termios.h>
  31
  32#include <glib/gprintf.h>
  33
  34#include "sysemu/sysemu.h"
  35#include "trace.h"
  36#include "qapi/error.h"
  37#include "qemu/sockets.h"
  38#include <libgen.h>
  39#include <sys/signal.h>
  40#include "qemu/cutils.h"
  41
  42#ifdef CONFIG_LINUX
  43#include <sys/syscall.h>
  44#endif
  45
  46#ifdef __FreeBSD__
  47#include <sys/sysctl.h>
  48#include <sys/user.h>
  49#include <libutil.h>
  50#endif
  51
  52#ifdef __NetBSD__
  53#include <sys/sysctl.h>
  54#endif
  55
  56#include "qemu/mmap-alloc.h"
  57
  58#ifdef CONFIG_DEBUG_STACK_USAGE
  59#include "qemu/error-report.h"
  60#endif
  61
  62#define MAX_MEM_PREALLOC_THREAD_COUNT 16
  63
  64struct MemsetThread {
  65    char *addr;
  66    size_t numpages;
  67    size_t hpagesize;
  68    QemuThread pgthread;
  69    sigjmp_buf env;
  70};
  71typedef struct MemsetThread MemsetThread;
  72
  73static MemsetThread *memset_thread;
  74static int memset_num_threads;
  75static bool memset_thread_failed;
  76
  77int qemu_get_thread_id(void)
  78{
  79#if defined(__linux__)
  80    return syscall(SYS_gettid);
  81#else
  82    return getpid();
  83#endif
  84}
  85
  86int qemu_daemon(int nochdir, int noclose)
  87{
  88    return daemon(nochdir, noclose);
  89}
  90
  91void *qemu_oom_check(void *ptr)
  92{
  93    if (ptr == NULL) {
  94        fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
  95        abort();
  96    }
  97    return ptr;
  98}
  99
 100void *qemu_try_memalign(size_t alignment, size_t size)
 101{
 102    void *ptr;
 103
 104    if (alignment < sizeof(void*)) {
 105        alignment = sizeof(void*);
 106    }
 107
 108#if defined(CONFIG_POSIX_MEMALIGN)
 109    int ret;
 110    ret = posix_memalign(&ptr, alignment, size);
 111    if (ret != 0) {
 112        errno = ret;
 113        ptr = NULL;
 114    }
 115#elif defined(CONFIG_BSD)
 116    ptr = valloc(size);
 117#else
 118    ptr = memalign(alignment, size);
 119#endif
 120    trace_qemu_memalign(alignment, size, ptr);
 121    return ptr;
 122}
 123
 124void *qemu_memalign(size_t alignment, size_t size)
 125{
 126    return qemu_oom_check(qemu_try_memalign(alignment, size));
 127}
 128
 129/* alloc shared memory pages */
 130void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
 131{
 132    size_t align = QEMU_VMALLOC_ALIGN;
 133    void *ptr = qemu_ram_mmap(-1, size, align, shared);
 134
 135    if (ptr == MAP_FAILED) {
 136        return NULL;
 137    }
 138
 139    if (alignment) {
 140        *alignment = align;
 141    }
 142
 143    trace_qemu_anon_ram_alloc(size, ptr);
 144    return ptr;
 145}
 146
 147void qemu_vfree(void *ptr)
 148{
 149    trace_qemu_vfree(ptr);
 150    free(ptr);
 151}
 152
 153void qemu_anon_ram_free(void *ptr, size_t size)
 154{
 155    trace_qemu_anon_ram_free(ptr, size);
 156    qemu_ram_munmap(ptr, size);
 157}
 158
 159void qemu_set_block(int fd)
 160{
 161    int f;
 162    f = fcntl(fd, F_GETFL);
 163    fcntl(fd, F_SETFL, f & ~O_NONBLOCK);
 164}
 165
 166void qemu_set_nonblock(int fd)
 167{
 168    int f;
 169    f = fcntl(fd, F_GETFL);
 170    fcntl(fd, F_SETFL, f | O_NONBLOCK);
 171}
 172
 173int socket_set_fast_reuse(int fd)
 174{
 175    int val = 1, ret;
 176
 177    ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
 178                     (const char *)&val, sizeof(val));
 179
 180    assert(ret == 0);
 181
 182    return ret;
 183}
 184
 185void qemu_set_cloexec(int fd)
 186{
 187    int f;
 188    f = fcntl(fd, F_GETFD);
 189    assert(f != -1);
 190    f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
 191    assert(f != -1);
 192}
 193
 194/*
 195 * Creates a pipe with FD_CLOEXEC set on both file descriptors
 196 */
 197int qemu_pipe(int pipefd[2])
 198{
 199    int ret;
 200
 201#ifdef CONFIG_PIPE2
 202    ret = pipe2(pipefd, O_CLOEXEC);
 203    if (ret != -1 || errno != ENOSYS) {
 204        return ret;
 205    }
 206#endif
 207    ret = pipe(pipefd);
 208    if (ret == 0) {
 209        qemu_set_cloexec(pipefd[0]);
 210        qemu_set_cloexec(pipefd[1]);
 211    }
 212
 213    return ret;
 214}
 215
 216char *
 217qemu_get_local_state_pathname(const char *relative_pathname)
 218{
 219    return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR,
 220                           relative_pathname);
 221}
 222
 223void qemu_set_tty_echo(int fd, bool echo)
 224{
 225    struct termios tty;
 226
 227    tcgetattr(fd, &tty);
 228
 229    if (echo) {
 230        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
 231    } else {
 232        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
 233    }
 234
 235    tcsetattr(fd, TCSANOW, &tty);
 236}
 237
 238static char exec_dir[PATH_MAX];
 239
 240void qemu_init_exec_dir(const char *argv0)
 241{
 242    char *dir;
 243    char *p = NULL;
 244    char buf[PATH_MAX];
 245
 246    assert(!exec_dir[0]);
 247
 248#if defined(__linux__)
 249    {
 250        int len;
 251        len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
 252        if (len > 0) {
 253            buf[len] = 0;
 254            p = buf;
 255        }
 256    }
 257#elif defined(__FreeBSD__) \
 258      || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
 259    {
 260#if defined(__FreeBSD__)
 261        static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
 262#else
 263        static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
 264#endif
 265        size_t len = sizeof(buf) - 1;
 266
 267        *buf = '\0';
 268        if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
 269            *buf) {
 270            buf[sizeof(buf) - 1] = '\0';
 271            p = buf;
 272        }
 273    }
 274#endif
 275    /* If we don't have any way of figuring out the actual executable
 276       location then try argv[0].  */
 277    if (!p) {
 278        if (!argv0) {
 279            return;
 280        }
 281        p = realpath(argv0, buf);
 282        if (!p) {
 283            return;
 284        }
 285    }
 286    dir = g_path_get_dirname(p);
 287
 288    pstrcpy(exec_dir, sizeof(exec_dir), dir);
 289
 290    g_free(dir);
 291}
 292
 293char *qemu_get_exec_dir(void)
 294{
 295    return g_strdup(exec_dir);
 296}
 297
 298static void sigbus_handler(int signal)
 299{
 300    int i;
 301    if (memset_thread) {
 302        for (i = 0; i < memset_num_threads; i++) {
 303            if (qemu_thread_is_self(&memset_thread[i].pgthread)) {
 304                siglongjmp(memset_thread[i].env, 1);
 305            }
 306        }
 307    }
 308}
 309
 310static void *do_touch_pages(void *arg)
 311{
 312    MemsetThread *memset_args = (MemsetThread *)arg;
 313    sigset_t set, oldset;
 314
 315    /* unblock SIGBUS */
 316    sigemptyset(&set);
 317    sigaddset(&set, SIGBUS);
 318    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
 319
 320    if (sigsetjmp(memset_args->env, 1)) {
 321        memset_thread_failed = true;
 322    } else {
 323        char *addr = memset_args->addr;
 324        size_t numpages = memset_args->numpages;
 325        size_t hpagesize = memset_args->hpagesize;
 326        size_t i;
 327        for (i = 0; i < numpages; i++) {
 328            /*
 329             * Read & write back the same value, so we don't
 330             * corrupt existing user/app data that might be
 331             * stored.
 332             *
 333             * 'volatile' to stop compiler optimizing this away
 334             * to a no-op
 335             *
 336             * TODO: get a better solution from kernel so we
 337             * don't need to write at all so we don't cause
 338             * wear on the storage backing the region...
 339             */
 340            *(volatile char *)addr = *addr;
 341            addr += hpagesize;
 342        }
 343    }
 344    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
 345    return NULL;
 346}
 347
 348static inline int get_memset_num_threads(int smp_cpus)
 349{
 350    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
 351    int ret = 1;
 352
 353    if (host_procs > 0) {
 354        ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
 355    }
 356    /* In case sysconf() fails, we fall back to single threaded */
 357    return ret;
 358}
 359
 360static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 361                            int smp_cpus)
 362{
 363    size_t numpages_per_thread;
 364    size_t size_per_thread;
 365    char *addr = area;
 366    int i = 0;
 367
 368    memset_thread_failed = false;
 369    memset_num_threads = get_memset_num_threads(smp_cpus);
 370    memset_thread = g_new0(MemsetThread, memset_num_threads);
 371    numpages_per_thread = (numpages / memset_num_threads);
 372    size_per_thread = (hpagesize * numpages_per_thread);
 373    for (i = 0; i < memset_num_threads; i++) {
 374        memset_thread[i].addr = addr;
 375        memset_thread[i].numpages = (i == (memset_num_threads - 1)) ?
 376                                    numpages : numpages_per_thread;
 377        memset_thread[i].hpagesize = hpagesize;
 378        qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
 379                           do_touch_pages, &memset_thread[i],
 380                           QEMU_THREAD_JOINABLE);
 381        addr += size_per_thread;
 382        numpages -= numpages_per_thread;
 383    }
 384    for (i = 0; i < memset_num_threads; i++) {
 385        qemu_thread_join(&memset_thread[i].pgthread);
 386    }
 387    g_free(memset_thread);
 388    memset_thread = NULL;
 389
 390    return memset_thread_failed;
 391}
 392
 393void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
 394                     Error **errp)
 395{
 396    int ret;
 397    struct sigaction act, oldact;
 398    size_t hpagesize = qemu_fd_getpagesize(fd);
 399    size_t numpages = DIV_ROUND_UP(memory, hpagesize);
 400
 401    memset(&act, 0, sizeof(act));
 402    act.sa_handler = &sigbus_handler;
 403    act.sa_flags = 0;
 404
 405    ret = sigaction(SIGBUS, &act, &oldact);
 406    if (ret) {
 407        error_setg_errno(errp, errno,
 408            "os_mem_prealloc: failed to install signal handler");
 409        return;
 410    }
 411
 412    /* touch pages simultaneously */
 413    if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
 414        error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
 415            "pages available to allocate guest RAM");
 416    }
 417
 418    ret = sigaction(SIGBUS, &oldact, NULL);
 419    if (ret) {
 420        /* Terminate QEMU since it can't recover from error */
 421        perror("os_mem_prealloc: failed to reinstall signal handler");
 422        exit(1);
 423    }
 424}
 425
 426
 427char *qemu_get_pid_name(pid_t pid)
 428{
 429    char *name = NULL;
 430
 431#if defined(__FreeBSD__)
 432    /* BSDs don't have /proc, but they provide a nice substitute */
 433    struct kinfo_proc *proc = kinfo_getproc(pid);
 434
 435    if (proc) {
 436        name = g_strdup(proc->ki_comm);
 437        free(proc);
 438    }
 439#else
 440    /* Assume a system with reasonable procfs */
 441    char *pid_path;
 442    size_t len;
 443
 444    pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
 445    g_file_get_contents(pid_path, &name, &len, NULL);
 446    g_free(pid_path);
 447#endif
 448
 449    return name;
 450}
 451
 452
 453pid_t qemu_fork(Error **errp)
 454{
 455    sigset_t oldmask, newmask;
 456    struct sigaction sig_action;
 457    int saved_errno;
 458    pid_t pid;
 459
 460    /*
 461     * Need to block signals now, so that child process can safely
 462     * kill off caller's signal handlers without a race.
 463     */
 464    sigfillset(&newmask);
 465    if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
 466        error_setg_errno(errp, errno,
 467                         "cannot block signals");
 468        return -1;
 469    }
 470
 471    pid = fork();
 472    saved_errno = errno;
 473
 474    if (pid < 0) {
 475        /* attempt to restore signal mask, but ignore failure, to
 476         * avoid obscuring the fork failure */
 477        (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 478        error_setg_errno(errp, saved_errno,
 479                         "cannot fork child process");
 480        errno = saved_errno;
 481        return -1;
 482    } else if (pid) {
 483        /* parent process */
 484
 485        /* Restore our original signal mask now that the child is
 486         * safely running. Only documented failures are EFAULT (not
 487         * possible, since we are using just-grabbed mask) or EINVAL
 488         * (not possible, since we are using correct arguments).  */
 489        (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 490    } else {
 491        /* child process */
 492        size_t i;
 493
 494        /* Clear out all signal handlers from parent so nothing
 495         * unexpected can happen in our child once we unblock
 496         * signals */
 497        sig_action.sa_handler = SIG_DFL;
 498        sig_action.sa_flags = 0;
 499        sigemptyset(&sig_action.sa_mask);
 500
 501        for (i = 1; i < NSIG; i++) {
 502            /* Only possible errors are EFAULT or EINVAL The former
 503             * won't happen, the latter we expect, so no need to check
 504             * return value */
 505            (void)sigaction(i, &sig_action, NULL);
 506        }
 507
 508        /* Unmask all signals in child, since we've no idea what the
 509         * caller's done with their signal mask and don't want to
 510         * propagate that to children */
 511        sigemptyset(&newmask);
 512        if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
 513            Error *local_err = NULL;
 514            error_setg_errno(&local_err, errno,
 515                             "cannot unblock signals");
 516            error_report_err(local_err);
 517            _exit(1);
 518        }
 519    }
 520    return pid;
 521}
 522
 523void *qemu_alloc_stack(size_t *sz)
 524{
 525    void *ptr, *guardpage;
 526#ifdef CONFIG_DEBUG_STACK_USAGE
 527    void *ptr2;
 528#endif
 529    size_t pagesz = getpagesize();
 530#ifdef _SC_THREAD_STACK_MIN
 531    /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
 532    long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
 533    *sz = MAX(MAX(min_stack_sz, 0), *sz);
 534#endif
 535    /* adjust stack size to a multiple of the page size */
 536    *sz = ROUND_UP(*sz, pagesz);
 537    /* allocate one extra page for the guard page */
 538    *sz += pagesz;
 539
 540    ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE,
 541               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 542    if (ptr == MAP_FAILED) {
 543        perror("failed to allocate memory for stack");
 544        abort();
 545    }
 546
 547#if defined(HOST_IA64)
 548    /* separate register stack */
 549    guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
 550#elif defined(HOST_HPPA)
 551    /* stack grows up */
 552    guardpage = ptr + *sz - pagesz;
 553#else
 554    /* stack grows down */
 555    guardpage = ptr;
 556#endif
 557    if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
 558        perror("failed to set up stack guard page");
 559        abort();
 560    }
 561
 562#ifdef CONFIG_DEBUG_STACK_USAGE
 563    for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
 564        *(uint32_t *)ptr2 = 0xdeadbeaf;
 565    }
 566#endif
 567
 568    return ptr;
 569}
 570
 571#ifdef CONFIG_DEBUG_STACK_USAGE
 572static __thread unsigned int max_stack_usage;
 573#endif
 574
 575void qemu_free_stack(void *stack, size_t sz)
 576{
 577#ifdef CONFIG_DEBUG_STACK_USAGE
 578    unsigned int usage;
 579    void *ptr;
 580
 581    for (ptr = stack + getpagesize(); ptr < stack + sz;
 582         ptr += sizeof(uint32_t)) {
 583        if (*(uint32_t *)ptr != 0xdeadbeaf) {
 584            break;
 585        }
 586    }
 587    usage = sz - (uintptr_t) (ptr - stack);
 588    if (usage > max_stack_usage) {
 589        error_report("thread %d max stack usage increased from %u to %u",
 590                     qemu_get_thread_id(), max_stack_usage, usage);
 591        max_stack_usage = usage;
 592    }
 593#endif
 594
 595    munmap(stack, sz);
 596}
 597
 598void sigaction_invoke(struct sigaction *action,
 599                      struct qemu_signalfd_siginfo *info)
 600{
 601    siginfo_t si = {};
 602    si.si_signo = info->ssi_signo;
 603    si.si_errno = info->ssi_errno;
 604    si.si_code = info->ssi_code;
 605
 606    /* Convert the minimal set of fields defined by POSIX.
 607     * Positive si_code values are reserved for kernel-generated
 608     * signals, where the valid siginfo fields are determined by
 609     * the signal number.  But according to POSIX, it is unspecified
 610     * whether SI_USER and SI_QUEUE have values less than or equal to
 611     * zero.
 612     */
 613    if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
 614        info->ssi_code <= 0) {
 615        /* SIGTERM, etc.  */
 616        si.si_pid = info->ssi_pid;
 617        si.si_uid = info->ssi_uid;
 618    } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
 619               info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
 620        si.si_addr = (void *)(uintptr_t)info->ssi_addr;
 621    } else if (info->ssi_signo == SIGCHLD) {
 622        si.si_pid = info->ssi_pid;
 623        si.si_status = info->ssi_status;
 624        si.si_uid = info->ssi_uid;
 625    }
 626    action->sa_sigaction(info->ssi_signo, &si, NULL);
 627}
 628