qemu/util/userfaultfd.c
<<
>>
Prefs
   1/*
   2 * Linux UFFD-WP support
   3 *
   4 * Copyright Virtuozzo GmbH, 2020
   5 *
   6 * Authors:
   7 *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2 or
  10 * later.  See the COPYING file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "qemu/bitops.h"
  15#include "qemu/error-report.h"
  16#include "qemu/userfaultfd.h"
  17#include "trace.h"
  18#include <poll.h>
  19#include <sys/syscall.h>
  20#include <sys/ioctl.h>
  21#include <fcntl.h>
  22
  23typedef enum {
  24    UFFD_UNINITIALIZED = 0,
  25    UFFD_USE_DEV_PATH,
  26    UFFD_USE_SYSCALL,
  27} uffd_open_mode;
  28
  29int uffd_open(int flags)
  30{
  31#if defined(__NR_userfaultfd)
  32    static uffd_open_mode open_mode;
  33    static int uffd_dev;
  34
  35    /* Detect how to generate uffd desc when run the 1st time */
  36    if (open_mode == UFFD_UNINITIALIZED) {
  37        /*
  38         * Make /dev/userfaultfd the default approach because it has better
  39         * permission controls, meanwhile allows kernel faults without any
  40         * privilege requirement (e.g. SYS_CAP_PTRACE).
  41         */
  42        uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
  43        if (uffd_dev >= 0) {
  44            open_mode = UFFD_USE_DEV_PATH;
  45        } else {
  46            /* Fallback to the system call */
  47            open_mode = UFFD_USE_SYSCALL;
  48        }
  49        trace_uffd_detect_open_mode(open_mode);
  50    }
  51
  52    if (open_mode == UFFD_USE_DEV_PATH) {
  53        assert(uffd_dev >= 0);
  54        return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
  55    }
  56
  57    return syscall(__NR_userfaultfd, flags);
  58#else
  59    return -EINVAL;
  60#endif
  61}
  62
  63/**
  64 * uffd_query_features: query UFFD features
  65 *
  66 * Returns: 0 on success, negative value in case of an error
  67 *
  68 * @features: parameter to receive 'uffdio_api.features'
  69 */
  70int uffd_query_features(uint64_t *features)
  71{
  72    int uffd_fd;
  73    struct uffdio_api api_struct = { 0 };
  74    int ret = -1;
  75
  76    uffd_fd = uffd_open(O_CLOEXEC);
  77    if (uffd_fd < 0) {
  78        trace_uffd_query_features_nosys(errno);
  79        return -1;
  80    }
  81
  82    api_struct.api = UFFD_API;
  83    api_struct.features = 0;
  84
  85    if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
  86        trace_uffd_query_features_api_failed(errno);
  87        goto out;
  88    }
  89    *features = api_struct.features;
  90    ret = 0;
  91
  92out:
  93    close(uffd_fd);
  94    return ret;
  95}
  96
  97/**
  98 * uffd_create_fd: create UFFD file descriptor
  99 *
 100 * Returns non-negative file descriptor or negative value in case of an error
 101 *
 102 * @features: UFFD features to request
 103 * @non_blocking: create UFFD file descriptor for non-blocking operation
 104 */
 105int uffd_create_fd(uint64_t features, bool non_blocking)
 106{
 107    int uffd_fd;
 108    int flags;
 109    struct uffdio_api api_struct = { 0 };
 110    uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
 111
 112    flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
 113    uffd_fd = uffd_open(flags);
 114    if (uffd_fd < 0) {
 115        trace_uffd_create_fd_nosys(errno);
 116        return -1;
 117    }
 118
 119    api_struct.api = UFFD_API;
 120    api_struct.features = features;
 121    if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
 122        trace_uffd_create_fd_api_failed(errno);
 123        goto fail;
 124    }
 125    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
 126        trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
 127        goto fail;
 128    }
 129
 130    return uffd_fd;
 131
 132fail:
 133    close(uffd_fd);
 134    return -1;
 135}
 136
 137/**
 138 * uffd_close_fd: close UFFD file descriptor
 139 *
 140 * @uffd_fd: UFFD file descriptor
 141 */
 142void uffd_close_fd(int uffd_fd)
 143{
 144    assert(uffd_fd >= 0);
 145    close(uffd_fd);
 146}
 147
 148/**
 149 * uffd_register_memory: register memory range via UFFD-IO
 150 *
 151 * Returns 0 in case of success, negative value in case of an error
 152 *
 153 * @uffd_fd: UFFD file descriptor
 154 * @addr: base address of memory range
 155 * @length: length of memory range
 156 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 157 * @ioctls: optional pointer to receive supported IOCTL mask
 158 */
 159int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
 160        uint64_t mode, uint64_t *ioctls)
 161{
 162    struct uffdio_register uffd_register;
 163
 164    uffd_register.range.start = (uintptr_t) addr;
 165    uffd_register.range.len = length;
 166    uffd_register.mode = mode;
 167
 168    if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
 169        trace_uffd_register_memory_failed(addr, length, mode, errno);
 170        return -1;
 171    }
 172    if (ioctls) {
 173        *ioctls = uffd_register.ioctls;
 174    }
 175
 176    return 0;
 177}
 178
 179/**
 180 * uffd_unregister_memory: un-register memory range with UFFD-IO
 181 *
 182 * Returns 0 in case of success, negative value in case of an error
 183 *
 184 * @uffd_fd: UFFD file descriptor
 185 * @addr: base address of memory range
 186 * @length: length of memory range
 187 */
 188int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
 189{
 190    struct uffdio_range uffd_range;
 191
 192    uffd_range.start = (uintptr_t) addr;
 193    uffd_range.len = length;
 194
 195    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
 196        trace_uffd_unregister_memory_failed(addr, length, errno);
 197        return -1;
 198    }
 199
 200    return 0;
 201}
 202
 203/**
 204 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 205 *
 206 * Returns 0 on success, negative value in case of error
 207 *
 208 * @uffd_fd: UFFD file descriptor
 209 * @addr: base address of memory range
 210 * @length: length of memory range
 211 * @wp: write-protect/unprotect
 212 * @dont_wake: do not wake threads waiting on wr-protected page
 213 */
 214int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
 215        bool wp, bool dont_wake)
 216{
 217    struct uffdio_writeprotect uffd_writeprotect;
 218
 219    uffd_writeprotect.range.start = (uintptr_t) addr;
 220    uffd_writeprotect.range.len = length;
 221    if (!wp && dont_wake) {
 222        /* DONTWAKE is meaningful only on protection release */
 223        uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
 224    } else {
 225        uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
 226    }
 227
 228    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
 229        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
 230                " mode=%" PRIx64 " errno=%i", addr, length,
 231                (uint64_t) uffd_writeprotect.mode, errno);
 232        return -1;
 233    }
 234
 235    return 0;
 236}
 237
 238/**
 239 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 240 *
 241 * Copy range of source pages to the destination to resolve
 242 * missing page fault somewhere in the destination range.
 243 *
 244 * Returns 0 on success, negative value in case of an error
 245 *
 246 * @uffd_fd: UFFD file descriptor
 247 * @dst_addr: destination base address
 248 * @src_addr: source base address
 249 * @length: length of the range to copy
 250 * @dont_wake: do not wake threads waiting on missing page
 251 */
 252int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
 253        uint64_t length, bool dont_wake)
 254{
 255    struct uffdio_copy uffd_copy;
 256
 257    uffd_copy.dst = (uintptr_t) dst_addr;
 258    uffd_copy.src = (uintptr_t) src_addr;
 259    uffd_copy.len = length;
 260    uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
 261
 262    if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
 263        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
 264                " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
 265                length, (uint64_t) uffd_copy.mode, errno);
 266        return -1;
 267    }
 268
 269    return 0;
 270}
 271
 272/**
 273 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 274 *
 275 * Fill range pages with zeroes to resolve missing page fault within the range.
 276 *
 277 * Returns 0 on success, negative value in case of an error
 278 *
 279 * @uffd_fd: UFFD file descriptor
 280 * @addr: base address
 281 * @length: length of the range to fill with zeroes
 282 * @dont_wake: do not wake threads waiting on missing page
 283 */
 284int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
 285{
 286    struct uffdio_zeropage uffd_zeropage;
 287
 288    uffd_zeropage.range.start = (uintptr_t) addr;
 289    uffd_zeropage.range.len = length;
 290    uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
 291
 292    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
 293        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
 294                " mode=%" PRIx64 " errno=%i", addr, length,
 295                (uint64_t) uffd_zeropage.mode, errno);
 296        return -1;
 297    }
 298
 299    return 0;
 300}
 301
 302/**
 303 * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution
 304 *
 305 * Wake up threads waiting on any page/pages from the designated range.
 306 * The main use case is when during some period, page faults are resolved
 307 * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
 308 * for the whole memory range are satisfied in a single call to uffd_wakeup().
 309 *
 310 * Returns 0 on success, negative value in case of an error
 311 *
 312 * @uffd_fd: UFFD file descriptor
 313 * @addr: base address
 314 * @length: length of the range
 315 */
 316int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
 317{
 318    struct uffdio_range uffd_range;
 319
 320    uffd_range.start = (uintptr_t) addr;
 321    uffd_range.len = length;
 322
 323    if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
 324        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
 325                addr, length, errno);
 326        return -1;
 327    }
 328
 329    return 0;
 330}
 331
 332/**
 333 * uffd_read_events: read pending UFFD events
 334 *
 335 * Returns number of fetched messages, 0 if non is available or
 336 * negative value in case of an error
 337 *
 338 * @uffd_fd: UFFD file descriptor
 339 * @msgs: pointer to message buffer
 340 * @count: number of messages that can fit in the buffer
 341 */
 342int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
 343{
 344    ssize_t res;
 345    do {
 346        res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
 347    } while (res < 0 && errno == EINTR);
 348
 349    if ((res < 0 && errno == EAGAIN)) {
 350        return 0;
 351    }
 352    if (res < 0) {
 353        error_report("uffd_read_events() failed: errno=%i", errno);
 354        return -1;
 355    }
 356
 357    return (int) (res / sizeof(struct uffd_msg));
 358}
 359
 360/**
 361 * uffd_poll_events: poll UFFD file descriptor for read
 362 *
 363 * Returns true if events are available for read, false otherwise
 364 *
 365 * @uffd_fd: UFFD file descriptor
 366 * @tmo: timeout value
 367 */
 368bool uffd_poll_events(int uffd_fd, int tmo)
 369{
 370    int res;
 371    struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
 372
 373    do {
 374        res = poll(&poll_fd, 1, tmo);
 375    } while (res < 0 && errno == EINTR);
 376
 377    if (res == 0) {
 378        return false;
 379    }
 380    if (res < 0) {
 381        error_report("uffd_poll_events() failed: errno=%i", errno);
 382        return false;
 383    }
 384
 385    return (poll_fd.revents & POLLIN) != 0;
 386}
 387