qemu/util/oslib-posix.c
<<
>>
Prefs
   1/*
   2 * os-posix-lib.c
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2010 Red Hat, Inc.
   6 *
   7 * QEMU library functions on POSIX which are shared between QEMU and
   8 * the QEMU tools.
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include <termios.h>
  31
  32#include <glib/gprintf.h>
  33
  34#include "qemu-common.h"
  35#include "sysemu/sysemu.h"
  36#include "trace.h"
  37#include "qapi/error.h"
  38#include "qemu/error-report.h"
  39#include "qemu/madvise.h"
  40#include "qemu/sockets.h"
  41#include "qemu/thread.h"
  42#include <libgen.h>
  43#include "qemu/cutils.h"
  44#include "qemu/compiler.h"
  45#include "qemu/units.h"
  46
  47#ifdef CONFIG_LINUX
  48#include <sys/syscall.h>
  49#endif
  50
  51#ifdef __FreeBSD__
  52#include <sys/sysctl.h>
  53#include <sys/user.h>
  54#include <sys/thr.h>
  55#include <libutil.h>
  56#endif
  57
  58#ifdef __NetBSD__
  59#include <sys/sysctl.h>
  60#include <lwp.h>
  61#endif
  62
  63#ifdef __APPLE__
  64#include <mach-o/dyld.h>
  65#endif
  66
  67#ifdef __HAIKU__
  68#include <kernel/image.h>
  69#endif
  70
  71#include "qemu/mmap-alloc.h"
  72
  73#ifdef CONFIG_DEBUG_STACK_USAGE
  74#include "qemu/error-report.h"
  75#endif
  76
  77#define MAX_MEM_PREALLOC_THREAD_COUNT 16
  78
  79struct MemsetThread;
  80
  81typedef struct MemsetContext {
  82    bool all_threads_created;
  83    bool any_thread_failed;
  84    struct MemsetThread *threads;
  85    int num_threads;
  86} MemsetContext;
  87
  88struct MemsetThread {
  89    char *addr;
  90    size_t numpages;
  91    size_t hpagesize;
  92    QemuThread pgthread;
  93    sigjmp_buf env;
  94    MemsetContext *context;
  95};
  96typedef struct MemsetThread MemsetThread;
  97
  98/* used by sigbus_handler() */
  99static MemsetContext *sigbus_memset_context;
 100struct sigaction sigbus_oldact;
 101static QemuMutex sigbus_mutex;
 102
 103static QemuMutex page_mutex;
 104static QemuCond page_cond;
 105
 106int qemu_get_thread_id(void)
 107{
 108#if defined(__linux__)
 109    return syscall(SYS_gettid);
 110#elif defined(__FreeBSD__)
 111    /* thread id is up to INT_MAX */
 112    long tid;
 113    thr_self(&tid);
 114    return (int)tid;
 115#elif defined(__NetBSD__)
 116    return _lwp_self();
 117#elif defined(__OpenBSD__)
 118    return getthrid();
 119#else
 120    return getpid();
 121#endif
 122}
 123
 124int qemu_daemon(int nochdir, int noclose)
 125{
 126    return daemon(nochdir, noclose);
 127}
 128
 129bool qemu_write_pidfile(const char *path, Error **errp)
 130{
 131    int fd;
 132    char pidstr[32];
 133
 134    while (1) {
 135        struct stat a, b;
 136        struct flock lock = {
 137            .l_type = F_WRLCK,
 138            .l_whence = SEEK_SET,
 139            .l_len = 0,
 140        };
 141
 142        fd = qemu_open_old(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
 143        if (fd == -1) {
 144            error_setg_errno(errp, errno, "Cannot open pid file");
 145            return false;
 146        }
 147
 148        if (fstat(fd, &b) < 0) {
 149            error_setg_errno(errp, errno, "Cannot stat file");
 150            goto fail_close;
 151        }
 152
 153        if (fcntl(fd, F_SETLK, &lock)) {
 154            error_setg_errno(errp, errno, "Cannot lock pid file");
 155            goto fail_close;
 156        }
 157
 158        /*
 159         * Now make sure the path we locked is the same one that now
 160         * exists on the filesystem.
 161         */
 162        if (stat(path, &a) < 0) {
 163            /*
 164             * PID file disappeared, someone else must be racing with
 165             * us, so try again.
 166             */
 167            close(fd);
 168            continue;
 169        }
 170
 171        if (a.st_ino == b.st_ino) {
 172            break;
 173        }
 174
 175        /*
 176         * PID file was recreated, someone else must be racing with
 177         * us, so try again.
 178         */
 179        close(fd);
 180    }
 181
 182    if (ftruncate(fd, 0) < 0) {
 183        error_setg_errno(errp, errno, "Failed to truncate pid file");
 184        goto fail_unlink;
 185    }
 186
 187    snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
 188    if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
 189        error_setg(errp, "Failed to write pid file");
 190        goto fail_unlink;
 191    }
 192
 193    return true;
 194
 195fail_unlink:
 196    unlink(path);
 197fail_close:
 198    close(fd);
 199    return false;
 200}
 201
 202/* alloc shared memory pages */
 203void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
 204                          bool noreserve)
 205{
 206    const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
 207                                    (noreserve ? QEMU_MAP_NORESERVE : 0);
 208    size_t align = QEMU_VMALLOC_ALIGN;
 209    void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
 210
 211    if (ptr == MAP_FAILED) {
 212        return NULL;
 213    }
 214
 215    if (alignment) {
 216        *alignment = align;
 217    }
 218
 219    trace_qemu_anon_ram_alloc(size, ptr);
 220    return ptr;
 221}
 222
 223void qemu_anon_ram_free(void *ptr, size_t size)
 224{
 225    trace_qemu_anon_ram_free(ptr, size);
 226    qemu_ram_munmap(-1, ptr, size);
 227}
 228
 229void qemu_set_block(int fd)
 230{
 231    int f;
 232    f = fcntl(fd, F_GETFL);
 233    assert(f != -1);
 234    f = fcntl(fd, F_SETFL, f & ~O_NONBLOCK);
 235    assert(f != -1);
 236}
 237
 238int qemu_try_set_nonblock(int fd)
 239{
 240    int f;
 241    f = fcntl(fd, F_GETFL);
 242    if (f == -1) {
 243        return -errno;
 244    }
 245    if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) {
 246        return -errno;
 247    }
 248    return 0;
 249}
 250
 251void qemu_set_nonblock(int fd)
 252{
 253    int f;
 254    f = qemu_try_set_nonblock(fd);
 255    assert(f == 0);
 256}
 257
 258int socket_set_fast_reuse(int fd)
 259{
 260    int val = 1, ret;
 261
 262    ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
 263                     (const char *)&val, sizeof(val));
 264
 265    assert(ret == 0);
 266
 267    return ret;
 268}
 269
 270void qemu_set_cloexec(int fd)
 271{
 272    int f;
 273    f = fcntl(fd, F_GETFD);
 274    assert(f != -1);
 275    f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
 276    assert(f != -1);
 277}
 278
 279/*
 280 * Creates a pipe with FD_CLOEXEC set on both file descriptors
 281 */
 282int qemu_pipe(int pipefd[2])
 283{
 284    int ret;
 285
 286#ifdef CONFIG_PIPE2
 287    ret = pipe2(pipefd, O_CLOEXEC);
 288    if (ret != -1 || errno != ENOSYS) {
 289        return ret;
 290    }
 291#endif
 292    ret = pipe(pipefd);
 293    if (ret == 0) {
 294        qemu_set_cloexec(pipefd[0]);
 295        qemu_set_cloexec(pipefd[1]);
 296    }
 297
 298    return ret;
 299}
 300
 301char *
 302qemu_get_local_state_pathname(const char *relative_pathname)
 303{
 304    g_autofree char *dir = g_strdup_printf("%s/%s",
 305                                           CONFIG_QEMU_LOCALSTATEDIR,
 306                                           relative_pathname);
 307    return get_relocated_path(dir);
 308}
 309
 310void qemu_set_tty_echo(int fd, bool echo)
 311{
 312    struct termios tty;
 313
 314    tcgetattr(fd, &tty);
 315
 316    if (echo) {
 317        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
 318    } else {
 319        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
 320    }
 321
 322    tcsetattr(fd, TCSANOW, &tty);
 323}
 324
 325static const char *exec_dir;
 326
 327void qemu_init_exec_dir(const char *argv0)
 328{
 329    char *p = NULL;
 330    char buf[PATH_MAX];
 331
 332    if (exec_dir) {
 333        return;
 334    }
 335
 336#if defined(__linux__)
 337    {
 338        int len;
 339        len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
 340        if (len > 0) {
 341            buf[len] = 0;
 342            p = buf;
 343        }
 344    }
 345#elif defined(__FreeBSD__) \
 346      || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
 347    {
 348#if defined(__FreeBSD__)
 349        static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
 350#else
 351        static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
 352#endif
 353        size_t len = sizeof(buf) - 1;
 354
 355        *buf = '\0';
 356        if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
 357            *buf) {
 358            buf[sizeof(buf) - 1] = '\0';
 359            p = buf;
 360        }
 361    }
 362#elif defined(__APPLE__)
 363    {
 364        char fpath[PATH_MAX];
 365        uint32_t len = sizeof(fpath);
 366        if (_NSGetExecutablePath(fpath, &len) == 0) {
 367            p = realpath(fpath, buf);
 368            if (!p) {
 369                return;
 370            }
 371        }
 372    }
 373#elif defined(__HAIKU__)
 374    {
 375        image_info ii;
 376        int32_t c = 0;
 377
 378        *buf = '\0';
 379        while (get_next_image_info(0, &c, &ii) == B_OK) {
 380            if (ii.type == B_APP_IMAGE) {
 381                strncpy(buf, ii.name, sizeof(buf));
 382                buf[sizeof(buf) - 1] = 0;
 383                p = buf;
 384                break;
 385            }
 386        }
 387    }
 388#endif
 389    /* If we don't have any way of figuring out the actual executable
 390       location then try argv[0].  */
 391    if (!p && argv0) {
 392        p = realpath(argv0, buf);
 393    }
 394    if (p) {
 395        exec_dir = g_path_get_dirname(p);
 396    } else {
 397        exec_dir = CONFIG_BINDIR;
 398    }
 399}
 400
 401const char *qemu_get_exec_dir(void)
 402{
 403    return exec_dir;
 404}
 405
 406#ifdef CONFIG_LINUX
 407static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
 408#else /* CONFIG_LINUX */
 409static void sigbus_handler(int signal)
 410#endif /* CONFIG_LINUX */
 411{
 412    int i;
 413
 414    if (sigbus_memset_context) {
 415        for (i = 0; i < sigbus_memset_context->num_threads; i++) {
 416            MemsetThread *thread = &sigbus_memset_context->threads[i];
 417
 418            if (qemu_thread_is_self(&thread->pgthread)) {
 419                siglongjmp(thread->env, 1);
 420            }
 421        }
 422    }
 423
 424#ifdef CONFIG_LINUX
 425    /*
 426     * We assume that the MCE SIGBUS handler could have been registered. We
 427     * should never receive BUS_MCEERR_AO on any of our threads, but only on
 428     * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
 429     * receive BUS_MCEERR_AR triggered by action of other threads on one of
 430     * our threads. So, no need to check for unrelated SIGBUS when seeing one
 431     * for our threads.
 432     *
 433     * We will forward to the MCE handler, which will either handle the SIGBUS
 434     * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
 435     * default SIGBUS handler will crash the process, so we don't care.
 436     */
 437    if (sigbus_oldact.sa_flags & SA_SIGINFO) {
 438        sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
 439        return;
 440    }
 441#endif /* CONFIG_LINUX */
 442    warn_report("os_mem_prealloc: unrelated SIGBUS detected and ignored");
 443}
 444
 445static void *do_touch_pages(void *arg)
 446{
 447    MemsetThread *memset_args = (MemsetThread *)arg;
 448    sigset_t set, oldset;
 449    int ret = 0;
 450
 451    /*
 452     * On Linux, the page faults from the loop below can cause mmap_sem
 453     * contention with allocation of the thread stacks.  Do not start
 454     * clearing until all threads have been created.
 455     */
 456    qemu_mutex_lock(&page_mutex);
 457    while (!memset_args->context->all_threads_created) {
 458        qemu_cond_wait(&page_cond, &page_mutex);
 459    }
 460    qemu_mutex_unlock(&page_mutex);
 461
 462    /* unblock SIGBUS */
 463    sigemptyset(&set);
 464    sigaddset(&set, SIGBUS);
 465    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
 466
 467    if (sigsetjmp(memset_args->env, 1)) {
 468        ret = -EFAULT;
 469    } else {
 470        char *addr = memset_args->addr;
 471        size_t numpages = memset_args->numpages;
 472        size_t hpagesize = memset_args->hpagesize;
 473        size_t i;
 474        for (i = 0; i < numpages; i++) {
 475            /*
 476             * Read & write back the same value, so we don't
 477             * corrupt existing user/app data that might be
 478             * stored.
 479             *
 480             * 'volatile' to stop compiler optimizing this away
 481             * to a no-op
 482             */
 483            *(volatile char *)addr = *addr;
 484            addr += hpagesize;
 485        }
 486    }
 487    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
 488    return (void *)(uintptr_t)ret;
 489}
 490
 491static void *do_madv_populate_write_pages(void *arg)
 492{
 493    MemsetThread *memset_args = (MemsetThread *)arg;
 494    const size_t size = memset_args->numpages * memset_args->hpagesize;
 495    char * const addr = memset_args->addr;
 496    int ret = 0;
 497
 498    /* See do_touch_pages(). */
 499    qemu_mutex_lock(&page_mutex);
 500    while (!memset_args->context->all_threads_created) {
 501        qemu_cond_wait(&page_cond, &page_mutex);
 502    }
 503    qemu_mutex_unlock(&page_mutex);
 504
 505    if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
 506        ret = -errno;
 507    }
 508    return (void *)(uintptr_t)ret;
 509}
 510
 511static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
 512                                         int smp_cpus)
 513{
 514    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
 515    int ret = 1;
 516
 517    if (host_procs > 0) {
 518        ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
 519    }
 520
 521    /* Especially with gigantic pages, don't create more threads than pages. */
 522    ret = MIN(ret, numpages);
 523    /* Don't start threads to prealloc comparatively little memory. */
 524    ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));
 525
 526    /* In case sysconf() fails, we fall back to single threaded */
 527    return ret;
 528}
 529
 530static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 531                           int smp_cpus, bool use_madv_populate_write)
 532{
 533    static gsize initialized = 0;
 534    MemsetContext context = {
 535        .num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus),
 536    };
 537    size_t numpages_per_thread, leftover;
 538    void *(*touch_fn)(void *);
 539    int ret = 0, i = 0;
 540    char *addr = area;
 541
 542    if (g_once_init_enter(&initialized)) {
 543        qemu_mutex_init(&page_mutex);
 544        qemu_cond_init(&page_cond);
 545        g_once_init_leave(&initialized, 1);
 546    }
 547
 548    if (use_madv_populate_write) {
 549        /* Avoid creating a single thread for MADV_POPULATE_WRITE */
 550        if (context.num_threads == 1) {
 551            if (qemu_madvise(area, hpagesize * numpages,
 552                             QEMU_MADV_POPULATE_WRITE)) {
 553                return -errno;
 554            }
 555            return 0;
 556        }
 557        touch_fn = do_madv_populate_write_pages;
 558    } else {
 559        touch_fn = do_touch_pages;
 560    }
 561
 562    context.threads = g_new0(MemsetThread, context.num_threads);
 563    numpages_per_thread = numpages / context.num_threads;
 564    leftover = numpages % context.num_threads;
 565    for (i = 0; i < context.num_threads; i++) {
 566        context.threads[i].addr = addr;
 567        context.threads[i].numpages = numpages_per_thread + (i < leftover);
 568        context.threads[i].hpagesize = hpagesize;
 569        context.threads[i].context = &context;
 570        qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
 571                           touch_fn, &context.threads[i],
 572                           QEMU_THREAD_JOINABLE);
 573        addr += context.threads[i].numpages * hpagesize;
 574    }
 575
 576    if (!use_madv_populate_write) {
 577        sigbus_memset_context = &context;
 578    }
 579
 580    qemu_mutex_lock(&page_mutex);
 581    context.all_threads_created = true;
 582    qemu_cond_broadcast(&page_cond);
 583    qemu_mutex_unlock(&page_mutex);
 584
 585    for (i = 0; i < context.num_threads; i++) {
 586        int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
 587
 588        if (tmp) {
 589            ret = tmp;
 590        }
 591    }
 592
 593    if (!use_madv_populate_write) {
 594        sigbus_memset_context = NULL;
 595    }
 596    g_free(context.threads);
 597
 598    return ret;
 599}
 600
 601static bool madv_populate_write_possible(char *area, size_t pagesize)
 602{
 603    return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
 604           errno != EINVAL;
 605}
 606
 607void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
 608                     Error **errp)
 609{
 610    static gsize initialized;
 611    int ret;
 612    size_t hpagesize = qemu_fd_getpagesize(fd);
 613    size_t numpages = DIV_ROUND_UP(memory, hpagesize);
 614    bool use_madv_populate_write;
 615    struct sigaction act;
 616
 617    /*
 618     * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
 619     * some special mappings, such as mapping /dev/mem.
 620     */
 621    use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
 622
 623    if (!use_madv_populate_write) {
 624        if (g_once_init_enter(&initialized)) {
 625            qemu_mutex_init(&sigbus_mutex);
 626            g_once_init_leave(&initialized, 1);
 627        }
 628
 629        qemu_mutex_lock(&sigbus_mutex);
 630        memset(&act, 0, sizeof(act));
 631#ifdef CONFIG_LINUX
 632        act.sa_sigaction = &sigbus_handler;
 633        act.sa_flags = SA_SIGINFO;
 634#else /* CONFIG_LINUX */
 635        act.sa_handler = &sigbus_handler;
 636        act.sa_flags = 0;
 637#endif /* CONFIG_LINUX */
 638
 639        ret = sigaction(SIGBUS, &act, &sigbus_oldact);
 640        if (ret) {
 641            qemu_mutex_unlock(&sigbus_mutex);
 642            error_setg_errno(errp, errno,
 643                "os_mem_prealloc: failed to install signal handler");
 644            return;
 645        }
 646    }
 647
 648    /* touch pages simultaneously */
 649    ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
 650                          use_madv_populate_write);
 651    if (ret) {
 652        error_setg_errno(errp, -ret,
 653                         "os_mem_prealloc: preallocating memory failed");
 654    }
 655
 656    if (!use_madv_populate_write) {
 657        ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
 658        if (ret) {
 659            /* Terminate QEMU since it can't recover from error */
 660            perror("os_mem_prealloc: failed to reinstall signal handler");
 661            exit(1);
 662        }
 663        qemu_mutex_unlock(&sigbus_mutex);
 664    }
 665}
 666
 667char *qemu_get_pid_name(pid_t pid)
 668{
 669    char *name = NULL;
 670
 671#if defined(__FreeBSD__)
 672    /* BSDs don't have /proc, but they provide a nice substitute */
 673    struct kinfo_proc *proc = kinfo_getproc(pid);
 674
 675    if (proc) {
 676        name = g_strdup(proc->ki_comm);
 677        free(proc);
 678    }
 679#else
 680    /* Assume a system with reasonable procfs */
 681    char *pid_path;
 682    size_t len;
 683
 684    pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
 685    g_file_get_contents(pid_path, &name, &len, NULL);
 686    g_free(pid_path);
 687#endif
 688
 689    return name;
 690}
 691
 692
 693pid_t qemu_fork(Error **errp)
 694{
 695    sigset_t oldmask, newmask;
 696    struct sigaction sig_action;
 697    int saved_errno;
 698    pid_t pid;
 699
 700    /*
 701     * Need to block signals now, so that child process can safely
 702     * kill off caller's signal handlers without a race.
 703     */
 704    sigfillset(&newmask);
 705    if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
 706        error_setg_errno(errp, errno,
 707                         "cannot block signals");
 708        return -1;
 709    }
 710
 711    pid = fork();
 712    saved_errno = errno;
 713
 714    if (pid < 0) {
 715        /* attempt to restore signal mask, but ignore failure, to
 716         * avoid obscuring the fork failure */
 717        (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 718        error_setg_errno(errp, saved_errno,
 719                         "cannot fork child process");
 720        errno = saved_errno;
 721        return -1;
 722    } else if (pid) {
 723        /* parent process */
 724
 725        /* Restore our original signal mask now that the child is
 726         * safely running. Only documented failures are EFAULT (not
 727         * possible, since we are using just-grabbed mask) or EINVAL
 728         * (not possible, since we are using correct arguments).  */
 729        (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 730    } else {
 731        /* child process */
 732        size_t i;
 733
 734        /* Clear out all signal handlers from parent so nothing
 735         * unexpected can happen in our child once we unblock
 736         * signals */
 737        sig_action.sa_handler = SIG_DFL;
 738        sig_action.sa_flags = 0;
 739        sigemptyset(&sig_action.sa_mask);
 740
 741        for (i = 1; i < NSIG; i++) {
 742            /* Only possible errors are EFAULT or EINVAL The former
 743             * won't happen, the latter we expect, so no need to check
 744             * return value */
 745            (void)sigaction(i, &sig_action, NULL);
 746        }
 747
 748        /* Unmask all signals in child, since we've no idea what the
 749         * caller's done with their signal mask and don't want to
 750         * propagate that to children */
 751        sigemptyset(&newmask);
 752        if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
 753            Error *local_err = NULL;
 754            error_setg_errno(&local_err, errno,
 755                             "cannot unblock signals");
 756            error_report_err(local_err);
 757            _exit(1);
 758        }
 759    }
 760    return pid;
 761}
 762
 763void *qemu_alloc_stack(size_t *sz)
 764{
 765    void *ptr, *guardpage;
 766    int flags;
 767#ifdef CONFIG_DEBUG_STACK_USAGE
 768    void *ptr2;
 769#endif
 770    size_t pagesz = qemu_real_host_page_size;
 771#ifdef _SC_THREAD_STACK_MIN
 772    /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
 773    long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
 774    *sz = MAX(MAX(min_stack_sz, 0), *sz);
 775#endif
 776    /* adjust stack size to a multiple of the page size */
 777    *sz = ROUND_UP(*sz, pagesz);
 778    /* allocate one extra page for the guard page */
 779    *sz += pagesz;
 780
 781    flags = MAP_PRIVATE | MAP_ANONYMOUS;
 782#if defined(MAP_STACK) && defined(__OpenBSD__)
 783    /* Only enable MAP_STACK on OpenBSD. Other OS's such as
 784     * Linux/FreeBSD/NetBSD have a flag with the same name
 785     * but have differing functionality. OpenBSD will SEGV
 786     * if it spots execution with a stack pointer pointing
 787     * at memory that was not allocated with MAP_STACK.
 788     */
 789    flags |= MAP_STACK;
 790#endif
 791
 792    ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
 793    if (ptr == MAP_FAILED) {
 794        perror("failed to allocate memory for stack");
 795        abort();
 796    }
 797
 798#if defined(HOST_IA64)
 799    /* separate register stack */
 800    guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
 801#elif defined(HOST_HPPA)
 802    /* stack grows up */
 803    guardpage = ptr + *sz - pagesz;
 804#else
 805    /* stack grows down */
 806    guardpage = ptr;
 807#endif
 808    if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
 809        perror("failed to set up stack guard page");
 810        abort();
 811    }
 812
 813#ifdef CONFIG_DEBUG_STACK_USAGE
 814    for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
 815        *(uint32_t *)ptr2 = 0xdeadbeaf;
 816    }
 817#endif
 818
 819    return ptr;
 820}
 821
 822#ifdef CONFIG_DEBUG_STACK_USAGE
 823static __thread unsigned int max_stack_usage;
 824#endif
 825
 826void qemu_free_stack(void *stack, size_t sz)
 827{
 828#ifdef CONFIG_DEBUG_STACK_USAGE
 829    unsigned int usage;
 830    void *ptr;
 831
 832    for (ptr = stack + qemu_real_host_page_size; ptr < stack + sz;
 833         ptr += sizeof(uint32_t)) {
 834        if (*(uint32_t *)ptr != 0xdeadbeaf) {
 835            break;
 836        }
 837    }
 838    usage = sz - (uintptr_t) (ptr - stack);
 839    if (usage > max_stack_usage) {
 840        error_report("thread %d max stack usage increased from %u to %u",
 841                     qemu_get_thread_id(), max_stack_usage, usage);
 842        max_stack_usage = usage;
 843    }
 844#endif
 845
 846    munmap(stack, sz);
 847}
 848
 849/*
 850 * Disable CFI checks.
 851 * We are going to call a signal hander directly. Such handler may or may not
 852 * have been defined in our binary, so there's no guarantee that the pointer
 853 * used to set the handler is a cfi-valid pointer. Since the handlers are
 854 * stored in kernel memory, changing the handler to an attacker-defined
 855 * function requires being able to call a sigaction() syscall,
 856 * which is not as easy as overwriting a pointer in memory.
 857 */
 858QEMU_DISABLE_CFI
 859void sigaction_invoke(struct sigaction *action,
 860                      struct qemu_signalfd_siginfo *info)
 861{
 862    siginfo_t si = {};
 863    si.si_signo = info->ssi_signo;
 864    si.si_errno = info->ssi_errno;
 865    si.si_code = info->ssi_code;
 866
 867    /* Convert the minimal set of fields defined by POSIX.
 868     * Positive si_code values are reserved for kernel-generated
 869     * signals, where the valid siginfo fields are determined by
 870     * the signal number.  But according to POSIX, it is unspecified
 871     * whether SI_USER and SI_QUEUE have values less than or equal to
 872     * zero.
 873     */
 874    if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
 875        info->ssi_code <= 0) {
 876        /* SIGTERM, etc.  */
 877        si.si_pid = info->ssi_pid;
 878        si.si_uid = info->ssi_uid;
 879    } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
 880               info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
 881        si.si_addr = (void *)(uintptr_t)info->ssi_addr;
 882    } else if (info->ssi_signo == SIGCHLD) {
 883        si.si_pid = info->ssi_pid;
 884        si.si_status = info->ssi_status;
 885        si.si_uid = info->ssi_uid;
 886    }
 887    action->sa_sigaction(info->ssi_signo, &si, NULL);
 888}
 889
 890#ifndef HOST_NAME_MAX
 891# ifdef _POSIX_HOST_NAME_MAX
 892#  define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 893# else
 894#  define HOST_NAME_MAX 255
 895# endif
 896#endif
 897
 898char *qemu_get_host_name(Error **errp)
 899{
 900    long len = -1;
 901    g_autofree char *hostname = NULL;
 902
 903#ifdef _SC_HOST_NAME_MAX
 904    len = sysconf(_SC_HOST_NAME_MAX);
 905#endif /* _SC_HOST_NAME_MAX */
 906
 907    if (len < 0) {
 908        len = HOST_NAME_MAX;
 909    }
 910
 911    /* Unfortunately, gethostname() below does not guarantee a
 912     * NULL terminated string. Therefore, allocate one byte more
 913     * to be sure. */
 914    hostname = g_new0(char, len + 1);
 915
 916    if (gethostname(hostname, len) < 0) {
 917        error_setg_errno(errp, errno,
 918                         "cannot get hostname");
 919        return NULL;
 920    }
 921
 922    return g_steal_pointer(&hostname);
 923}
 924
 925size_t qemu_get_host_physmem(void)
 926{
 927#ifdef _SC_PHYS_PAGES
 928    long pages = sysconf(_SC_PHYS_PAGES);
 929    if (pages > 0) {
 930        if (pages > SIZE_MAX / qemu_real_host_page_size) {
 931            return SIZE_MAX;
 932        } else {
 933            return pages * qemu_real_host_page_size;
 934        }
 935    }
 936#endif
 937    return 0;
 938}
 939