qemu/util/oslib-posix.c
<<
>>
Prefs
   1/*
   2 * os-posix-lib.c
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2010 Red Hat, Inc.
   6 *
   7 * QEMU library functions on POSIX which are shared between QEMU and
   8 * the QEMU tools.
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include <termios.h>
  31
  32#include <glib/gprintf.h>
  33
  34#include "sysemu/sysemu.h"
  35#include "trace.h"
  36#include "qapi/error.h"
  37#include "qemu/sockets.h"
  38#include <libgen.h>
  39#include <sys/signal.h>
  40#include "qemu/cutils.h"
  41
  42#ifdef CONFIG_LINUX
  43#include <sys/syscall.h>
  44#endif
  45
  46#ifdef __FreeBSD__
  47#include <sys/sysctl.h>
  48#include <sys/user.h>
  49#include <libutil.h>
  50#endif
  51
  52#include "qemu/mmap-alloc.h"
  53
  54#ifdef CONFIG_DEBUG_STACK_USAGE
  55#include "qemu/error-report.h"
  56#endif
  57
  58#define MAX_MEM_PREALLOC_THREAD_COUNT 16
  59
  60struct MemsetThread {
  61    char *addr;
  62    uint64_t numpages;
  63    uint64_t hpagesize;
  64    QemuThread pgthread;
  65    sigjmp_buf env;
  66};
  67typedef struct MemsetThread MemsetThread;
  68
  69static MemsetThread *memset_thread;
  70static int memset_num_threads;
  71static bool memset_thread_failed;
  72
  73int qemu_get_thread_id(void)
  74{
  75#if defined(__linux__)
  76    return syscall(SYS_gettid);
  77#else
  78    return getpid();
  79#endif
  80}
  81
  82int qemu_daemon(int nochdir, int noclose)
  83{
  84    return daemon(nochdir, noclose);
  85}
  86
  87void *qemu_oom_check(void *ptr)
  88{
  89    if (ptr == NULL) {
  90        fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
  91        abort();
  92    }
  93    return ptr;
  94}
  95
  96void *qemu_try_memalign(size_t alignment, size_t size)
  97{
  98    void *ptr;
  99
 100    if (alignment < sizeof(void*)) {
 101        alignment = sizeof(void*);
 102    }
 103
 104#if defined(_POSIX_C_SOURCE) && !defined(__sun__)
 105    int ret;
 106    ret = posix_memalign(&ptr, alignment, size);
 107    if (ret != 0) {
 108        errno = ret;
 109        ptr = NULL;
 110    }
 111#elif defined(CONFIG_BSD)
 112    ptr = valloc(size);
 113#else
 114    ptr = memalign(alignment, size);
 115#endif
 116    trace_qemu_memalign(alignment, size, ptr);
 117    return ptr;
 118}
 119
 120void *qemu_memalign(size_t alignment, size_t size)
 121{
 122    return qemu_oom_check(qemu_try_memalign(alignment, size));
 123}
 124
 125/* alloc shared memory pages */
 126void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment)
 127{
 128    size_t align = QEMU_VMALLOC_ALIGN;
 129    void *ptr = qemu_ram_mmap(-1, size, align, false);
 130
 131    if (ptr == MAP_FAILED) {
 132        return NULL;
 133    }
 134
 135    if (alignment) {
 136        *alignment = align;
 137    }
 138
 139    trace_qemu_anon_ram_alloc(size, ptr);
 140    return ptr;
 141}
 142
 143void qemu_vfree(void *ptr)
 144{
 145    trace_qemu_vfree(ptr);
 146    free(ptr);
 147}
 148
 149void qemu_anon_ram_free(void *ptr, size_t size)
 150{
 151    trace_qemu_anon_ram_free(ptr, size);
 152    qemu_ram_munmap(ptr, size);
 153}
 154
 155void qemu_set_block(int fd)
 156{
 157    int f;
 158    f = fcntl(fd, F_GETFL);
 159    fcntl(fd, F_SETFL, f & ~O_NONBLOCK);
 160}
 161
 162void qemu_set_nonblock(int fd)
 163{
 164    int f;
 165    f = fcntl(fd, F_GETFL);
 166    fcntl(fd, F_SETFL, f | O_NONBLOCK);
 167}
 168
 169int socket_set_fast_reuse(int fd)
 170{
 171    int val = 1, ret;
 172
 173    ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
 174                     (const char *)&val, sizeof(val));
 175
 176    assert(ret == 0);
 177
 178    return ret;
 179}
 180
 181void qemu_set_cloexec(int fd)
 182{
 183    int f;
 184    f = fcntl(fd, F_GETFD);
 185    assert(f != -1);
 186    f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
 187    assert(f != -1);
 188}
 189
 190/*
 191 * Creates a pipe with FD_CLOEXEC set on both file descriptors
 192 */
 193int qemu_pipe(int pipefd[2])
 194{
 195    int ret;
 196
 197#ifdef CONFIG_PIPE2
 198    ret = pipe2(pipefd, O_CLOEXEC);
 199    if (ret != -1 || errno != ENOSYS) {
 200        return ret;
 201    }
 202#endif
 203    ret = pipe(pipefd);
 204    if (ret == 0) {
 205        qemu_set_cloexec(pipefd[0]);
 206        qemu_set_cloexec(pipefd[1]);
 207    }
 208
 209    return ret;
 210}
 211
 212char *
 213qemu_get_local_state_pathname(const char *relative_pathname)
 214{
 215    return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR,
 216                           relative_pathname);
 217}
 218
 219void qemu_set_tty_echo(int fd, bool echo)
 220{
 221    struct termios tty;
 222
 223    tcgetattr(fd, &tty);
 224
 225    if (echo) {
 226        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
 227    } else {
 228        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
 229    }
 230
 231    tcsetattr(fd, TCSANOW, &tty);
 232}
 233
 234static char exec_dir[PATH_MAX];
 235
 236void qemu_init_exec_dir(const char *argv0)
 237{
 238    char *dir;
 239    char *p = NULL;
 240    char buf[PATH_MAX];
 241
 242    assert(!exec_dir[0]);
 243
 244#if defined(__linux__)
 245    {
 246        int len;
 247        len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
 248        if (len > 0) {
 249            buf[len] = 0;
 250            p = buf;
 251        }
 252    }
 253#elif defined(__FreeBSD__)
 254    {
 255        static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
 256        size_t len = sizeof(buf) - 1;
 257
 258        *buf = '\0';
 259        if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
 260            *buf) {
 261            buf[sizeof(buf) - 1] = '\0';
 262            p = buf;
 263        }
 264    }
 265#endif
 266    /* If we don't have any way of figuring out the actual executable
 267       location then try argv[0].  */
 268    if (!p) {
 269        if (!argv0) {
 270            return;
 271        }
 272        p = realpath(argv0, buf);
 273        if (!p) {
 274            return;
 275        }
 276    }
 277    dir = g_path_get_dirname(p);
 278
 279    pstrcpy(exec_dir, sizeof(exec_dir), dir);
 280
 281    g_free(dir);
 282}
 283
 284char *qemu_get_exec_dir(void)
 285{
 286    return g_strdup(exec_dir);
 287}
 288
 289static void sigbus_handler(int signal)
 290{
 291    int i;
 292    if (memset_thread) {
 293        for (i = 0; i < memset_num_threads; i++) {
 294            if (qemu_thread_is_self(&memset_thread[i].pgthread)) {
 295                siglongjmp(memset_thread[i].env, 1);
 296            }
 297        }
 298    }
 299}
 300
 301static void *do_touch_pages(void *arg)
 302{
 303    MemsetThread *memset_args = (MemsetThread *)arg;
 304    char *addr = memset_args->addr;
 305    uint64_t numpages = memset_args->numpages;
 306    uint64_t hpagesize = memset_args->hpagesize;
 307    sigset_t set, oldset;
 308    int i = 0;
 309
 310    /* unblock SIGBUS */
 311    sigemptyset(&set);
 312    sigaddset(&set, SIGBUS);
 313    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
 314
 315    if (sigsetjmp(memset_args->env, 1)) {
 316        memset_thread_failed = true;
 317    } else {
 318        for (i = 0; i < numpages; i++) {
 319            /*
 320             * Read & write back the same value, so we don't
 321             * corrupt existing user/app data that might be
 322             * stored.
 323             *
 324             * 'volatile' to stop compiler optimizing this away
 325             * to a no-op
 326             *
 327             * TODO: get a better solution from kernel so we
 328             * don't need to write at all so we don't cause
 329             * wear on the storage backing the region...
 330             */
 331            *(volatile char *)addr = *addr;
 332            addr += hpagesize;
 333        }
 334    }
 335    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
 336    return NULL;
 337}
 338
 339static inline int get_memset_num_threads(int smp_cpus)
 340{
 341    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
 342    int ret = 1;
 343
 344    if (host_procs > 0) {
 345        ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
 346    }
 347    /* In case sysconf() fails, we fall back to single threaded */
 348    return ret;
 349}
 350
 351static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 352                            int smp_cpus)
 353{
 354    uint64_t numpages_per_thread, size_per_thread;
 355    char *addr = area;
 356    int i = 0;
 357
 358    memset_thread_failed = false;
 359    memset_num_threads = get_memset_num_threads(smp_cpus);
 360    memset_thread = g_new0(MemsetThread, memset_num_threads);
 361    numpages_per_thread = (numpages / memset_num_threads);
 362    size_per_thread = (hpagesize * numpages_per_thread);
 363    for (i = 0; i < memset_num_threads; i++) {
 364        memset_thread[i].addr = addr;
 365        memset_thread[i].numpages = (i == (memset_num_threads - 1)) ?
 366                                    numpages : numpages_per_thread;
 367        memset_thread[i].hpagesize = hpagesize;
 368        qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
 369                           do_touch_pages, &memset_thread[i],
 370                           QEMU_THREAD_JOINABLE);
 371        addr += size_per_thread;
 372        numpages -= numpages_per_thread;
 373    }
 374    for (i = 0; i < memset_num_threads; i++) {
 375        qemu_thread_join(&memset_thread[i].pgthread);
 376    }
 377    g_free(memset_thread);
 378    memset_thread = NULL;
 379
 380    return memset_thread_failed;
 381}
 382
 383void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
 384                     Error **errp)
 385{
 386    int ret;
 387    struct sigaction act, oldact;
 388    size_t hpagesize = qemu_fd_getpagesize(fd);
 389    size_t numpages = DIV_ROUND_UP(memory, hpagesize);
 390
 391    memset(&act, 0, sizeof(act));
 392    act.sa_handler = &sigbus_handler;
 393    act.sa_flags = 0;
 394
 395    ret = sigaction(SIGBUS, &act, &oldact);
 396    if (ret) {
 397        error_setg_errno(errp, errno,
 398            "os_mem_prealloc: failed to install signal handler");
 399        return;
 400    }
 401
 402    /* touch pages simultaneously */
 403    if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
 404        error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
 405            "pages available to allocate guest RAM");
 406    }
 407
 408    ret = sigaction(SIGBUS, &oldact, NULL);
 409    if (ret) {
 410        /* Terminate QEMU since it can't recover from error */
 411        perror("os_mem_prealloc: failed to reinstall signal handler");
 412        exit(1);
 413    }
 414}
 415
 416
 417char *qemu_get_pid_name(pid_t pid)
 418{
 419    char *name = NULL;
 420
 421#if defined(__FreeBSD__)
 422    /* BSDs don't have /proc, but they provide a nice substitute */
 423    struct kinfo_proc *proc = kinfo_getproc(pid);
 424
 425    if (proc) {
 426        name = g_strdup(proc->ki_comm);
 427        free(proc);
 428    }
 429#else
 430    /* Assume a system with reasonable procfs */
 431    char *pid_path;
 432    size_t len;
 433
 434    pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
 435    g_file_get_contents(pid_path, &name, &len, NULL);
 436    g_free(pid_path);
 437#endif
 438
 439    return name;
 440}
 441
 442
 443pid_t qemu_fork(Error **errp)
 444{
 445    sigset_t oldmask, newmask;
 446    struct sigaction sig_action;
 447    int saved_errno;
 448    pid_t pid;
 449
 450    /*
 451     * Need to block signals now, so that child process can safely
 452     * kill off caller's signal handlers without a race.
 453     */
 454    sigfillset(&newmask);
 455    if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
 456        error_setg_errno(errp, errno,
 457                         "cannot block signals");
 458        return -1;
 459    }
 460
 461    pid = fork();
 462    saved_errno = errno;
 463
 464    if (pid < 0) {
 465        /* attempt to restore signal mask, but ignore failure, to
 466         * avoid obscuring the fork failure */
 467        (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 468        error_setg_errno(errp, saved_errno,
 469                         "cannot fork child process");
 470        errno = saved_errno;
 471        return -1;
 472    } else if (pid) {
 473        /* parent process */
 474
 475        /* Restore our original signal mask now that the child is
 476         * safely running. Only documented failures are EFAULT (not
 477         * possible, since we are using just-grabbed mask) or EINVAL
 478         * (not possible, since we are using correct arguments).  */
 479        (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 480    } else {
 481        /* child process */
 482        size_t i;
 483
 484        /* Clear out all signal handlers from parent so nothing
 485         * unexpected can happen in our child once we unblock
 486         * signals */
 487        sig_action.sa_handler = SIG_DFL;
 488        sig_action.sa_flags = 0;
 489        sigemptyset(&sig_action.sa_mask);
 490
 491        for (i = 1; i < NSIG; i++) {
 492            /* Only possible errors are EFAULT or EINVAL The former
 493             * won't happen, the latter we expect, so no need to check
 494             * return value */
 495            (void)sigaction(i, &sig_action, NULL);
 496        }
 497
 498        /* Unmask all signals in child, since we've no idea what the
 499         * caller's done with their signal mask and don't want to
 500         * propagate that to children */
 501        sigemptyset(&newmask);
 502        if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
 503            Error *local_err = NULL;
 504            error_setg_errno(&local_err, errno,
 505                             "cannot unblock signals");
 506            error_report_err(local_err);
 507            _exit(1);
 508        }
 509    }
 510    return pid;
 511}
 512
 513void *qemu_alloc_stack(size_t *sz)
 514{
 515    void *ptr, *guardpage;
 516#ifdef CONFIG_DEBUG_STACK_USAGE
 517    void *ptr2;
 518#endif
 519    size_t pagesz = getpagesize();
 520#ifdef _SC_THREAD_STACK_MIN
 521    /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
 522    long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
 523    *sz = MAX(MAX(min_stack_sz, 0), *sz);
 524#endif
 525    /* adjust stack size to a multiple of the page size */
 526    *sz = ROUND_UP(*sz, pagesz);
 527    /* allocate one extra page for the guard page */
 528    *sz += pagesz;
 529
 530    ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE,
 531               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 532    if (ptr == MAP_FAILED) {
 533        abort();
 534    }
 535
 536#if defined(HOST_IA64)
 537    /* separate register stack */
 538    guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
 539#elif defined(HOST_HPPA)
 540    /* stack grows up */
 541    guardpage = ptr + *sz - pagesz;
 542#else
 543    /* stack grows down */
 544    guardpage = ptr;
 545#endif
 546    if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
 547        abort();
 548    }
 549
 550#ifdef CONFIG_DEBUG_STACK_USAGE
 551    for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
 552        *(uint32_t *)ptr2 = 0xdeadbeaf;
 553    }
 554#endif
 555
 556    return ptr;
 557}
 558
 559#ifdef CONFIG_DEBUG_STACK_USAGE
 560static __thread unsigned int max_stack_usage;
 561#endif
 562
 563void qemu_free_stack(void *stack, size_t sz)
 564{
 565#ifdef CONFIG_DEBUG_STACK_USAGE
 566    unsigned int usage;
 567    void *ptr;
 568
 569    for (ptr = stack + getpagesize(); ptr < stack + sz;
 570         ptr += sizeof(uint32_t)) {
 571        if (*(uint32_t *)ptr != 0xdeadbeaf) {
 572            break;
 573        }
 574    }
 575    usage = sz - (uintptr_t) (ptr - stack);
 576    if (usage > max_stack_usage) {
 577        error_report("thread %d max stack usage increased from %u to %u",
 578                     qemu_get_thread_id(), max_stack_usage, usage);
 579        max_stack_usage = usage;
 580    }
 581#endif
 582
 583    munmap(stack, sz);
 584}
 585
 586void sigaction_invoke(struct sigaction *action,
 587                      struct qemu_signalfd_siginfo *info)
 588{
 589    siginfo_t si = {};
 590    si.si_signo = info->ssi_signo;
 591    si.si_errno = info->ssi_errno;
 592    si.si_code = info->ssi_code;
 593
 594    /* Convert the minimal set of fields defined by POSIX.
 595     * Positive si_code values are reserved for kernel-generated
 596     * signals, where the valid siginfo fields are determined by
 597     * the signal number.  But according to POSIX, it is unspecified
 598     * whether SI_USER and SI_QUEUE have values less than or equal to
 599     * zero.
 600     */
 601    if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
 602        info->ssi_code <= 0) {
 603        /* SIGTERM, etc.  */
 604        si.si_pid = info->ssi_pid;
 605        si.si_uid = info->ssi_uid;
 606    } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
 607               info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
 608        si.si_addr = (void *)(uintptr_t)info->ssi_addr;
 609    } else if (info->ssi_signo == SIGCHLD) {
 610        si.si_pid = info->ssi_pid;
 611        si.si_status = info->ssi_status;
 612        si.si_uid = info->ssi_uid;
 613    }
 614    action->sa_sigaction(info->ssi_signo, &si, NULL);
 615}
 616