linux/tools/testing/selftests/vm/userfaultfd.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Stress userfaultfd syscall.
   4 *
   5 *  Copyright (C) 2015  Red Hat, Inc.
   6 *
   7 * This test allocates two virtual areas and bounces the physical
   8 * memory across the two virtual areas (from area_src to area_dst)
   9 * using userfaultfd.
  10 *
  11 * There are three threads running per CPU:
  12 *
  13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
  14 *    page of the area_dst (while the physical page may still be in
  15 *    area_src), and increments a per-page counter in the same page,
  16 *    and checks its value against a verification region.
  17 *
  18 * 2) another per-CPU thread handles the userfaults generated by
  19 *    thread 1 above. userfaultfd blocking reads or poll() modes are
  20 *    exercised interleaved.
  21 *
  22 * 3) one last per-CPU thread transfers the memory in the background
  23 *    at maximum bandwidth (if not already transferred by thread
  24 *    2). Each cpu thread takes care of transferring a portion of the
  25 *    area.
  26 *
  27 * When all threads of type 3 completed the transfer, one bounce is
  28 * complete. area_src and area_dst are then swapped. All threads are
  29 * respawned and so the bounce is immediately restarted in the
  30 * opposite direction.
  31 *
  32 * per-CPU threads 1 by triggering userfaults inside
  33 * pthread_mutex_lock will also verify the atomicity of the memory
  34 * transfer (UFFDIO_COPY).
  35 */
  36
  37#define _GNU_SOURCE
  38#include <stdio.h>
  39#include <errno.h>
  40#include <unistd.h>
  41#include <stdlib.h>
  42#include <sys/types.h>
  43#include <sys/stat.h>
  44#include <fcntl.h>
  45#include <time.h>
  46#include <signal.h>
  47#include <poll.h>
  48#include <string.h>
  49#include <sys/mman.h>
  50#include <sys/syscall.h>
  51#include <sys/ioctl.h>
  52#include <sys/wait.h>
  53#include <pthread.h>
  54#include <linux/userfaultfd.h>
  55#include <setjmp.h>
  56#include <stdbool.h>
  57#include <assert.h>
  58#include <inttypes.h>
  59#include <stdint.h>
  60
  61#include "../kselftest.h"
  62
  63#ifdef __NR_userfaultfd
  64
  65static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
  66
  67#define BOUNCE_RANDOM           (1<<0)
  68#define BOUNCE_RACINGFAULTS     (1<<1)
  69#define BOUNCE_VERIFY           (1<<2)
  70#define BOUNCE_POLL             (1<<3)
  71static int bounces;
  72
  73#define TEST_ANON       1
  74#define TEST_HUGETLB    2
  75#define TEST_SHMEM      3
  76static int test_type;
  77
  78/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
  79#define ALARM_INTERVAL_SECS 10
  80static volatile bool test_uffdio_copy_eexist = true;
  81static volatile bool test_uffdio_zeropage_eexist = true;
  82/* Whether to test uffd write-protection */
  83static bool test_uffdio_wp = false;
  84/* Whether to test uffd minor faults */
  85static bool test_uffdio_minor = false;
  86
  87static bool map_shared;
  88static int shm_fd;
  89static int huge_fd;
  90static char *huge_fd_off0;
  91static unsigned long long *count_verify;
  92static int uffd = -1;
  93static int uffd_flags, finished, *pipefd;
  94static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
  95static char *zeropage;
  96pthread_attr_t attr;
  97
/*
 * Userfaultfd test statistics.
 *
 * One entry is used per fault-handling thread; the counters are bumped
 * by uffd_handle_page_fault() and summed by uffd_stats_report().
 */
struct uffd_stats {
	/* Index of this entry; uffd_stats_reset() sets it to the array slot. */
	int cpu;
	/* Missing-page faults for which the page copy succeeded. */
	unsigned long missing_faults;
	/* Faults that carried UFFD_PAGEFAULT_FLAG_WP. */
	unsigned long wp_faults;
	/* Faults that carried UFFD_PAGEFAULT_FLAG_MINOR. */
	unsigned long minor_faults;
};
 105
/*
 * pthread_mutex_t starts at page offset 0: this yields a pointer to the
 * mutex stored at the beginning of page ___nr of ___area.
 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 *
 * The address just past the mutex is rounded up to the next multiple of
 * sizeof(unsigned long long) by adding (size - 1) and masking the low
 * bits off.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  -  1)))
 120
 121const char *examples =
 122    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
 123    "./userfaultfd anon 100 99999\n\n"
 124    "# Run share memory test on 1GiB region with 99 bounces:\n"
 125    "./userfaultfd shmem 1000 99\n\n"
 126    "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
 127    "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
 128    "# Run the same hugetlb test but using shmem:\n"
 129    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
 130    "# 10MiB-~6GiB 999 bounces anonymous test, "
 131    "continue forever unless an error triggers\n"
 132    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
 133
 134static void usage(void)
 135{
 136        fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
 137                "[hugetlbfs_file]\n\n");
 138        fprintf(stderr, "Supported <test type>: anon, hugetlb, "
 139                "hugetlb_shared, shmem\n\n");
 140        fprintf(stderr, "Examples:\n\n");
 141        fprintf(stderr, "%s", examples);
 142        exit(1);
 143}
 144
 145#define _err(fmt, ...)                                          \
 146        do {                                                    \
 147                int ret = errno;                                \
 148                fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);  \
 149                fprintf(stderr, " (errno=%d, line=%d)\n",       \
 150                        ret, __LINE__);                         \
 151        } while (0)
 152
 153#define err(fmt, ...)                           \
 154        do {                                    \
 155                _err(fmt, ##__VA_ARGS__);       \
 156                exit(1);                        \
 157        } while (0)
 158
 159static void uffd_stats_reset(struct uffd_stats *uffd_stats,
 160                             unsigned long n_cpus)
 161{
 162        int i;
 163
 164        for (i = 0; i < n_cpus; i++) {
 165                uffd_stats[i].cpu = i;
 166                uffd_stats[i].missing_faults = 0;
 167                uffd_stats[i].wp_faults = 0;
 168                uffd_stats[i].minor_faults = 0;
 169        }
 170}
 171
 172static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
 173{
 174        int i;
 175        unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
 176
 177        for (i = 0; i < n_cpus; i++) {
 178                miss_total += stats[i].missing_faults;
 179                wp_total += stats[i].wp_faults;
 180                minor_total += stats[i].minor_faults;
 181        }
 182
 183        printf("userfaults: ");
 184        if (miss_total) {
 185                printf("%llu missing (", miss_total);
 186                for (i = 0; i < n_cpus; i++)
 187                        printf("%lu+", stats[i].missing_faults);
 188                printf("\b) ");
 189        }
 190        if (wp_total) {
 191                printf("%llu wp (", wp_total);
 192                for (i = 0; i < n_cpus; i++)
 193                        printf("%lu+", stats[i].wp_faults);
 194                printf("\b) ");
 195        }
 196        if (minor_total) {
 197                printf("%llu minor (", minor_total);
 198                for (i = 0; i < n_cpus; i++)
 199                        printf("%lu+", stats[i].minor_faults);
 200                printf("\b)");
 201        }
 202        printf("\n");
 203}
 204
 205static void anon_release_pages(char *rel_area)
 206{
 207        if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
 208                err("madvise(MADV_DONTNEED) failed");
 209}
 210
 211static void anon_allocate_area(void **alloc_area)
 212{
 213        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 214                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 215        if (*alloc_area == MAP_FAILED)
 216                err("mmap of anonymous memory failed");
 217}
 218
 219static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 220{
 221}
 222
 223static void hugetlb_release_pages(char *rel_area)
 224{
 225        if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 226                      rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
 227                      nr_pages * page_size))
 228                err("fallocate() failed");
 229}
 230
 231static void hugetlb_allocate_area(void **alloc_area)
 232{
 233        void *area_alias = NULL;
 234        char **alloc_area_alias;
 235
 236        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 237                           (map_shared ? MAP_SHARED : MAP_PRIVATE) |
 238                           MAP_HUGETLB,
 239                           huge_fd, *alloc_area == area_src ? 0 :
 240                           nr_pages * page_size);
 241        if (*alloc_area == MAP_FAILED)
 242                err("mmap of hugetlbfs file failed");
 243
 244        if (map_shared) {
 245                area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 246                                  MAP_SHARED | MAP_HUGETLB,
 247                                  huge_fd, *alloc_area == area_src ? 0 :
 248                                  nr_pages * page_size);
 249                if (area_alias == MAP_FAILED)
 250                        err("mmap of hugetlb file alias failed");
 251        }
 252
 253        if (*alloc_area == area_src) {
 254                huge_fd_off0 = *alloc_area;
 255                alloc_area_alias = &area_src_alias;
 256        } else {
 257                alloc_area_alias = &area_dst_alias;
 258        }
 259        if (area_alias)
 260                *alloc_area_alias = area_alias;
 261}
 262
 263static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 264{
 265        if (!map_shared)
 266                return;
 267        /*
 268         * We can't zap just the pagetable with hugetlbfs because
 269         * MADV_DONTEED won't work. So exercise -EEXIST on a alias
 270         * mapping where the pagetables are not established initially,
 271         * this way we'll exercise the -EEXEC at the fs level.
 272         */
 273        *start = (unsigned long) area_dst_alias + offset;
 274}
 275
 276static void shmem_release_pages(char *rel_area)
 277{
 278        if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
 279                err("madvise(MADV_REMOVE) failed");
 280}
 281
 282static void shmem_allocate_area(void **alloc_area)
 283{
 284        void *area_alias = NULL;
 285        bool is_src = alloc_area == (void **)&area_src;
 286        unsigned long offset = is_src ? 0 : nr_pages * page_size;
 287
 288        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 289                           MAP_SHARED, shm_fd, offset);
 290        if (*alloc_area == MAP_FAILED)
 291                err("mmap of memfd failed");
 292
 293        area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 294                          MAP_SHARED, shm_fd, offset);
 295        if (area_alias == MAP_FAILED)
 296                err("mmap of memfd alias failed");
 297
 298        if (is_src)
 299                area_src_alias = area_alias;
 300        else
 301                area_dst_alias = area_alias;
 302}
 303
 304static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 305{
 306        *start = (unsigned long)area_dst_alias + offset;
 307}
 308
/*
 * Per-memory-type operations (anonymous, shmem, hugetlb) used by the
 * test through the uffd_test_ops pointer.
 */
struct uffd_test_ops {
	/*
	 * Bitmask with one bit per _UFFDIO_* ioctl number.
	 * NOTE(review): presumably compared against the ioctls reported at
	 * registration time; the consumer is not visible in this part of
	 * the file.
	 */
	unsigned long expected_ioctls;
	/* Map a test area and store its address in *alloc_area. */
	void (*allocate_area)(void **alloc_area);
	/* Drop the pages backing the nr_pages * page_size area at rel_area. */
	void (*release_pages)(char *rel_area);
	/*
	 * Optionally redirect *start to the alias mapping of area_dst at
	 * the given offset; the anonymous variant leaves *start unchanged.
	 */
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
 315
/* Expected-ioctl bitmask for shmem areas: wake, copy and zeropage. */
#define SHMEM_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE))

/* Expected-ioctl bitmask for anonymous areas: the shmem set plus writeprotect. */
#define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE) | \
					 (1 << _UFFDIO_WRITEPROTECT))

/* Operations for private anonymous memory; no alias mapping is used. */
static struct uffd_test_ops anon_uffd_test_ops = {
	.expected_ioctls = ANON_EXPECTED_IOCTLS,
	.allocate_area	= anon_allocate_area,
	.release_pages	= anon_release_pages,
	.alias_mapping = noop_alias_mapping,
};

/* Operations for shared memory backed by shm_fd. */
static struct uffd_test_ops shmem_uffd_test_ops = {
	.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
	.allocate_area	= shmem_allocate_area,
	.release_pages	= shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
};

/*
 * Operations for hugetlbfs-backed memory.  The expected mask is the
 * basic range ioctl set with the _UFFDIO_CONTINUE bit cleared.
 */
static struct uffd_test_ops hugetlb_uffd_test_ops = {
	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
	.allocate_area	= hugetlb_allocate_area,
	.release_pages	= hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
};

/*
 * Operations table in use for the current run.
 * NOTE(review): it is assigned outside the part of the file shown here.
 */
static struct uffd_test_ops *uffd_test_ops;
 347
 348static void userfaultfd_open(uint64_t *features)
 349{
 350        struct uffdio_api uffdio_api;
 351
 352        uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 353        if (uffd < 0)
 354                err("userfaultfd syscall not available in this kernel");
 355        uffd_flags = fcntl(uffd, F_GETFD, NULL);
 356
 357        uffdio_api.api = UFFD_API;
 358        uffdio_api.features = *features;
 359        if (ioctl(uffd, UFFDIO_API, &uffdio_api))
 360                err("UFFDIO_API failed.\nPlease make sure to "
 361                    "run with either root or ptrace capability.");
 362        if (uffdio_api.api != UFFD_API)
 363                err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
 364
 365        *features = uffdio_api.features;
 366}
 367
 368static inline void munmap_area(void **area)
 369{
 370        if (*area)
 371                if (munmap(*area, nr_pages * page_size))
 372                        err("munmap");
 373
 374        *area = NULL;
 375}
 376
 377static void uffd_test_ctx_clear(void)
 378{
 379        size_t i;
 380
 381        if (pipefd) {
 382                for (i = 0; i < nr_cpus * 2; ++i) {
 383                        if (close(pipefd[i]))
 384                                err("close pipefd");
 385                }
 386                free(pipefd);
 387                pipefd = NULL;
 388        }
 389
 390        if (count_verify) {
 391                free(count_verify);
 392                count_verify = NULL;
 393        }
 394
 395        if (uffd != -1) {
 396                if (close(uffd))
 397                        err("close uffd");
 398                uffd = -1;
 399        }
 400
 401        huge_fd_off0 = NULL;
 402        munmap_area((void **)&area_src);
 403        munmap_area((void **)&area_src_alias);
 404        munmap_area((void **)&area_dst);
 405        munmap_area((void **)&area_dst_alias);
 406}
 407
 408static void uffd_test_ctx_init_ext(uint64_t *features)
 409{
 410        unsigned long nr, cpu;
 411
 412        uffd_test_ctx_clear();
 413
 414        uffd_test_ops->allocate_area((void **)&area_src);
 415        uffd_test_ops->allocate_area((void **)&area_dst);
 416
 417        uffd_test_ops->release_pages(area_src);
 418        uffd_test_ops->release_pages(area_dst);
 419
 420        userfaultfd_open(features);
 421
 422        count_verify = malloc(nr_pages * sizeof(unsigned long long));
 423        if (!count_verify)
 424                err("count_verify");
 425
 426        for (nr = 0; nr < nr_pages; nr++) {
 427                *area_mutex(area_src, nr) =
 428                        (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
 429                count_verify[nr] = *area_count(area_src, nr) = 1;
 430                /*
 431                 * In the transition between 255 to 256, powerpc will
 432                 * read out of order in my_bcmp and see both bytes as
 433                 * zero, so leave a placeholder below always non-zero
 434                 * after the count, to avoid my_bcmp to trigger false
 435                 * positives.
 436                 */
 437                *(area_count(area_src, nr) + 1) = 1;
 438        }
 439
 440        pipefd = malloc(sizeof(int) * nr_cpus * 2);
 441        if (!pipefd)
 442                err("pipefd");
 443        for (cpu = 0; cpu < nr_cpus; cpu++)
 444                if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
 445                        err("pipe");
 446}
 447
 448static inline void uffd_test_ctx_init(uint64_t features)
 449{
 450        uffd_test_ctx_init_ext(&features);
 451}
 452
 453static int my_bcmp(char *str1, char *str2, size_t n)
 454{
 455        unsigned long i;
 456        for (i = 0; i < n; i++)
 457                if (str1[i] != str2[i])
 458                        return 1;
 459        return 0;
 460}
 461
 462static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
 463{
 464        struct uffdio_writeprotect prms;
 465
 466        /* Write protection page faults */
 467        prms.range.start = start;
 468        prms.range.len = len;
 469        /* Undo write-protect, do wakeup after that */
 470        prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
 471
 472        if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
 473                err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
 474}
 475
 476static void continue_range(int ufd, __u64 start, __u64 len)
 477{
 478        struct uffdio_continue req;
 479        int ret;
 480
 481        req.range.start = start;
 482        req.range.len = len;
 483        req.mode = 0;
 484
 485        if (ioctl(ufd, UFFDIO_CONTINUE, &req))
 486                err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
 487                    (uint64_t)start);
 488
 489        /*
 490         * Error handling within the kernel for continue is subtly different
 491         * from copy or zeropage, so it may be a source of bugs. Trigger an
 492         * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
 493         */
 494        req.mapped = 0;
 495        ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
 496        if (ret >= 0 || req.mapped != -EEXIST)
 497                err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
 498                    ret, (int64_t) req.mapped);
 499}
 500
 501static void *locking_thread(void *arg)
 502{
 503        unsigned long cpu = (unsigned long) arg;
 504        struct random_data rand;
 505        unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
 506        int32_t rand_nr;
 507        unsigned long long count;
 508        char randstate[64];
 509        unsigned int seed;
 510
 511        if (bounces & BOUNCE_RANDOM) {
 512                seed = (unsigned int) time(NULL) - bounces;
 513                if (!(bounces & BOUNCE_RACINGFAULTS))
 514                        seed += cpu;
 515                bzero(&rand, sizeof(rand));
 516                bzero(&randstate, sizeof(randstate));
 517                if (initstate_r(seed, randstate, sizeof(randstate), &rand))
 518                        err("initstate_r failed");
 519        } else {
 520                page_nr = -bounces;
 521                if (!(bounces & BOUNCE_RACINGFAULTS))
 522                        page_nr += cpu * nr_pages_per_cpu;
 523        }
 524
 525        while (!finished) {
 526                if (bounces & BOUNCE_RANDOM) {
 527                        if (random_r(&rand, &rand_nr))
 528                                err("random_r failed");
 529                        page_nr = rand_nr;
 530                        if (sizeof(page_nr) > sizeof(rand_nr)) {
 531                                if (random_r(&rand, &rand_nr))
 532                                        err("random_r failed");
 533                                page_nr |= (((unsigned long) rand_nr) << 16) <<
 534                                           16;
 535                        }
 536                } else
 537                        page_nr += 1;
 538                page_nr %= nr_pages;
 539                pthread_mutex_lock(area_mutex(area_dst, page_nr));
 540                count = *area_count(area_dst, page_nr);
 541                if (count != count_verify[page_nr])
 542                        err("page_nr %lu memory corruption %llu %llu",
 543                            page_nr, count, count_verify[page_nr]);
 544                count++;
 545                *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
 546                pthread_mutex_unlock(area_mutex(area_dst, page_nr));
 547        }
 548
 549        return NULL;
 550}
 551
 552static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
 553                            unsigned long offset)
 554{
 555        uffd_test_ops->alias_mapping(&uffdio_copy->dst,
 556                                     uffdio_copy->len,
 557                                     offset);
 558        if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
 559                /* real retval in ufdio_copy.copy */
 560                if (uffdio_copy->copy != -EEXIST)
 561                        err("UFFDIO_COPY retry error: %"PRId64,
 562                            (int64_t)uffdio_copy->copy);
 563        } else {
 564                err("UFFDIO_COPY retry unexpected: %"PRId64,
 565                    (int64_t)uffdio_copy->copy);
 566        }
 567}
 568
 569static int __copy_page(int ufd, unsigned long offset, bool retry)
 570{
 571        struct uffdio_copy uffdio_copy;
 572
 573        if (offset >= nr_pages * page_size)
 574                err("unexpected offset %lu\n", offset);
 575        uffdio_copy.dst = (unsigned long) area_dst + offset;
 576        uffdio_copy.src = (unsigned long) area_src + offset;
 577        uffdio_copy.len = page_size;
 578        if (test_uffdio_wp)
 579                uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
 580        else
 581                uffdio_copy.mode = 0;
 582        uffdio_copy.copy = 0;
 583        if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
 584                /* real retval in ufdio_copy.copy */
 585                if (uffdio_copy.copy != -EEXIST)
 586                        err("UFFDIO_COPY error: %"PRId64,
 587                            (int64_t)uffdio_copy.copy);
 588        } else if (uffdio_copy.copy != page_size) {
 589                err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
 590        } else {
 591                if (test_uffdio_copy_eexist && retry) {
 592                        test_uffdio_copy_eexist = false;
 593                        retry_copy_page(ufd, &uffdio_copy, offset);
 594                }
 595                return 1;
 596        }
 597        return 0;
 598}
 599
 600static int copy_page_retry(int ufd, unsigned long offset)
 601{
 602        return __copy_page(ufd, offset, true);
 603}
 604
 605static int copy_page(int ufd, unsigned long offset)
 606{
 607        return __copy_page(ufd, offset, false);
 608}
 609
 610static int uffd_read_msg(int ufd, struct uffd_msg *msg)
 611{
 612        int ret = read(uffd, msg, sizeof(*msg));
 613
 614        if (ret != sizeof(*msg)) {
 615                if (ret < 0) {
 616                        if (errno == EAGAIN)
 617                                return 1;
 618                        err("blocking read error");
 619                } else {
 620                        err("short read");
 621                }
 622        }
 623
 624        return 0;
 625}
 626
 627static void uffd_handle_page_fault(struct uffd_msg *msg,
 628                                   struct uffd_stats *stats)
 629{
 630        unsigned long offset;
 631
 632        if (msg->event != UFFD_EVENT_PAGEFAULT)
 633                err("unexpected msg event %u", msg->event);
 634
 635        if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
 636                /* Write protect page faults */
 637                wp_range(uffd, msg->arg.pagefault.address, page_size, false);
 638                stats->wp_faults++;
 639        } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
 640                uint8_t *area;
 641                int b;
 642
 643                /*
 644                 * Minor page faults
 645                 *
 646                 * To prove we can modify the original range for testing
 647                 * purposes, we're going to bit flip this range before
 648                 * continuing.
 649                 *
 650                 * Note that this requires all minor page fault tests operate on
 651                 * area_dst (non-UFFD-registered) and area_dst_alias
 652                 * (UFFD-registered).
 653                 */
 654
 655                area = (uint8_t *)(area_dst +
 656                                   ((char *)msg->arg.pagefault.address -
 657                                    area_dst_alias));
 658                for (b = 0; b < page_size; ++b)
 659                        area[b] = ~area[b];
 660                continue_range(uffd, msg->arg.pagefault.address, page_size);
 661                stats->minor_faults++;
 662        } else {
 663                /* Missing page faults */
 664                if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
 665                        err("unexpected write fault");
 666
 667                offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
 668                offset &= ~(page_size-1);
 669
 670                if (copy_page(uffd, offset))
 671                        stats->missing_faults++;
 672        }
 673}
 674
 675static void *uffd_poll_thread(void *arg)
 676{
 677        struct uffd_stats *stats = (struct uffd_stats *)arg;
 678        unsigned long cpu = stats->cpu;
 679        struct pollfd pollfd[2];
 680        struct uffd_msg msg;
 681        struct uffdio_register uffd_reg;
 682        int ret;
 683        char tmp_chr;
 684
 685        pollfd[0].fd = uffd;
 686        pollfd[0].events = POLLIN;
 687        pollfd[1].fd = pipefd[cpu*2];
 688        pollfd[1].events = POLLIN;
 689
 690        for (;;) {
 691                ret = poll(pollfd, 2, -1);
 692                if (ret <= 0)
 693                        err("poll error: %d", ret);
 694                if (pollfd[1].revents & POLLIN) {
 695                        if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
 696                                err("read pipefd error");
 697                        break;
 698                }
 699                if (!(pollfd[0].revents & POLLIN))
 700                        err("pollfd[0].revents %d", pollfd[0].revents);
 701                if (uffd_read_msg(uffd, &msg))
 702                        continue;
 703                switch (msg.event) {
 704                default:
 705                        err("unexpected msg event %u\n", msg.event);
 706                        break;
 707                case UFFD_EVENT_PAGEFAULT:
 708                        uffd_handle_page_fault(&msg, stats);
 709                        break;
 710                case UFFD_EVENT_FORK:
 711                        close(uffd);
 712                        uffd = msg.arg.fork.ufd;
 713                        pollfd[0].fd = uffd;
 714                        break;
 715                case UFFD_EVENT_REMOVE:
 716                        uffd_reg.range.start = msg.arg.remove.start;
 717                        uffd_reg.range.len = msg.arg.remove.end -
 718                                msg.arg.remove.start;
 719                        if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
 720                                err("remove failure");
 721                        break;
 722                case UFFD_EVENT_REMAP:
 723                        area_dst = (char *)(unsigned long)msg.arg.remap.to;
 724                        break;
 725                }
 726        }
 727
 728        return NULL;
 729}
 730
 731pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
 732
 733static void *uffd_read_thread(void *arg)
 734{
 735        struct uffd_stats *stats = (struct uffd_stats *)arg;
 736        struct uffd_msg msg;
 737
 738        pthread_mutex_unlock(&uffd_read_mutex);
 739        /* from here cancellation is ok */
 740
 741        for (;;) {
 742                if (uffd_read_msg(uffd, &msg))
 743                        continue;
 744                uffd_handle_page_fault(&msg, stats);
 745        }
 746
 747        return NULL;
 748}
 749
 750static void *background_thread(void *arg)
 751{
 752        unsigned long cpu = (unsigned long) arg;
 753        unsigned long page_nr, start_nr, mid_nr, end_nr;
 754
 755        start_nr = cpu * nr_pages_per_cpu;
 756        end_nr = (cpu+1) * nr_pages_per_cpu;
 757        mid_nr = (start_nr + end_nr) / 2;
 758
 759        /* Copy the first half of the pages */
 760        for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
 761                copy_page_retry(uffd, page_nr * page_size);
 762
 763        /*
 764         * If we need to test uffd-wp, set it up now.  Then we'll have
 765         * at least the first half of the pages mapped already which
 766         * can be write-protected for testing
 767         */
 768        if (test_uffdio_wp)
 769                wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
 770                        nr_pages_per_cpu * page_size, true);
 771
 772        /*
 773         * Continue the 2nd half of the page copying, handling write
 774         * protection faults if any
 775         */
 776        for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
 777                copy_page_retry(uffd, page_nr * page_size);
 778
 779        return NULL;
 780}
 781
 782static int stress(struct uffd_stats *uffd_stats)
 783{
 784        unsigned long cpu;
 785        pthread_t locking_threads[nr_cpus];
 786        pthread_t uffd_threads[nr_cpus];
 787        pthread_t background_threads[nr_cpus];
 788
 789        finished = 0;
 790        for (cpu = 0; cpu < nr_cpus; cpu++) {
 791                if (pthread_create(&locking_threads[cpu], &attr,
 792                                   locking_thread, (void *)cpu))
 793                        return 1;
 794                if (bounces & BOUNCE_POLL) {
 795                        if (pthread_create(&uffd_threads[cpu], &attr,
 796                                           uffd_poll_thread,
 797                                           (void *)&uffd_stats[cpu]))
 798                                return 1;
 799                } else {
 800                        if (pthread_create(&uffd_threads[cpu], &attr,
 801                                           uffd_read_thread,
 802                                           (void *)&uffd_stats[cpu]))
 803                                return 1;
 804                        pthread_mutex_lock(&uffd_read_mutex);
 805                }
 806                if (pthread_create(&background_threads[cpu], &attr,
 807                                   background_thread, (void *)cpu))
 808                        return 1;
 809        }
 810        for (cpu = 0; cpu < nr_cpus; cpu++)
 811                if (pthread_join(background_threads[cpu], NULL))
 812                        return 1;
 813
 814        /*
 815         * Be strict and immediately zap area_src, the whole area has
 816         * been transferred already by the background treads. The
 817         * area_src could then be faulted in in a racy way by still
 818         * running uffdio_threads reading zeropages after we zapped
 819         * area_src (but they're guaranteed to get -EEXIST from
 820         * UFFDIO_COPY without writing zero pages into area_dst
 821         * because the background threads already completed).
 822         */
 823        uffd_test_ops->release_pages(area_src);
 824
 825        finished = 1;
 826        for (cpu = 0; cpu < nr_cpus; cpu++)
 827                if (pthread_join(locking_threads[cpu], NULL))
 828                        return 1;
 829
 830        for (cpu = 0; cpu < nr_cpus; cpu++) {
 831                char c;
 832                if (bounces & BOUNCE_POLL) {
 833                        if (write(pipefd[cpu*2+1], &c, 1) != 1)
 834                                err("pipefd write error");
 835                        if (pthread_join(uffd_threads[cpu],
 836                                         (void *)&uffd_stats[cpu]))
 837                                return 1;
 838                } else {
 839                        if (pthread_cancel(uffd_threads[cpu]))
 840                                return 1;
 841                        if (pthread_join(uffd_threads[cpu], NULL))
 842                                return 1;
 843                }
 844        }
 845
 846        return 0;
 847}
 848
 849sigjmp_buf jbuf, *sigbuf;
 850
 851static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
 852{
 853        if (sig == SIGBUS) {
 854                if (sigbuf)
 855                        siglongjmp(*sigbuf, 1);
 856                abort();
 857        }
 858}
 859
 860/*
 861 * For non-cooperative userfaultfd test we fork() a process that will
 862 * generate pagefaults, will mremap the area monitored by the
 863 * userfaultfd and at last this process will release the monitored
 864 * area.
 865 * For the anonymous and shared memory the area is divided into two
 866 * parts, the first part is accessed before mremap, and the second
 867 * part is accessed after mremap. Since hugetlbfs does not support
 868 * mremap, the entire monitored area is accessed in a single pass for
 869 * HUGETLB_TEST.
 870 * The release of the pages currently generates event for shmem and
 871 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
 872 * for hugetlb.
 873 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
 874 * monitored area, generate pagefaults and test that signal is delivered.
 875 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
 876 * test robustness use case - we release monitored area, fork a process
 877 * that will generate pagefaults and verify signal is generated.
 878 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
 879 * feature. Using monitor thread, verify no userfault events are generated.
 880 */
 881static int faulting_process(int signal_test)
 882{
 883        unsigned long nr;
 884        unsigned long long count;
 885        unsigned long split_nr_pages;
 886        unsigned long lastnr;
 887        struct sigaction act;
 888        unsigned long signalled = 0;
 889
 890        if (test_type != TEST_HUGETLB)
 891                split_nr_pages = (nr_pages + 1) / 2;
 892        else
 893                split_nr_pages = nr_pages;
 894
 895        if (signal_test) {
 896                sigbuf = &jbuf;
 897                memset(&act, 0, sizeof(act));
 898                act.sa_sigaction = sighndl;
 899                act.sa_flags = SA_SIGINFO;
 900                if (sigaction(SIGBUS, &act, 0))
 901                        err("sigaction");
 902                lastnr = (unsigned long)-1;
 903        }
 904
 905        for (nr = 0; nr < split_nr_pages; nr++) {
 906                int steps = 1;
 907                unsigned long offset = nr * page_size;
 908
 909                if (signal_test) {
 910                        if (sigsetjmp(*sigbuf, 1) != 0) {
 911                                if (steps == 1 && nr == lastnr)
 912                                        err("Signal repeated");
 913
 914                                lastnr = nr;
 915                                if (signal_test == 1) {
 916                                        if (steps == 1) {
 917                                                /* This is a MISSING request */
 918                                                steps++;
 919                                                if (copy_page(uffd, offset))
 920                                                        signalled++;
 921                                        } else {
 922                                                /* This is a WP request */
 923                                                assert(steps == 2);
 924                                                wp_range(uffd,
 925                                                         (__u64)area_dst +
 926                                                         offset,
 927                                                         page_size, false);
 928                                        }
 929                                } else {
 930                                        signalled++;
 931                                        continue;
 932                                }
 933                        }
 934                }
 935
 936                count = *area_count(area_dst, nr);
 937                if (count != count_verify[nr])
 938                        err("nr %lu memory corruption %llu %llu\n",
 939                            nr, count, count_verify[nr]);
 940                /*
 941                 * Trigger write protection if there is by writing
 942                 * the same value back.
 943                 */
 944                *area_count(area_dst, nr) = count;
 945        }
 946
 947        if (signal_test)
 948                return signalled != split_nr_pages;
 949
 950        if (test_type == TEST_HUGETLB)
 951                return 0;
 952
 953        area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
 954                          MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
 955        if (area_dst == MAP_FAILED)
 956                err("mremap");
 957        /* Reset area_src since we just clobbered it */
 958        area_src = NULL;
 959
 960        for (; nr < nr_pages; nr++) {
 961                count = *area_count(area_dst, nr);
 962                if (count != count_verify[nr]) {
 963                        err("nr %lu memory corruption %llu %llu\n",
 964                            nr, count, count_verify[nr]);
 965                }
 966                /*
 967                 * Trigger write protection if there is by writing
 968                 * the same value back.
 969                 */
 970                *area_count(area_dst, nr) = count;
 971        }
 972
 973        uffd_test_ops->release_pages(area_dst);
 974
 975        for (nr = 0; nr < nr_pages; nr++)
 976                if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
 977                        err("nr %lu is not zero", nr);
 978
 979        return 0;
 980}
 981
 982static void retry_uffdio_zeropage(int ufd,
 983                                  struct uffdio_zeropage *uffdio_zeropage,
 984                                  unsigned long offset)
 985{
 986        uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
 987                                     uffdio_zeropage->range.len,
 988                                     offset);
 989        if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
 990                if (uffdio_zeropage->zeropage != -EEXIST)
 991                        err("UFFDIO_ZEROPAGE error: %"PRId64,
 992                            (int64_t)uffdio_zeropage->zeropage);
 993        } else {
 994                err("UFFDIO_ZEROPAGE error: %"PRId64,
 995                    (int64_t)uffdio_zeropage->zeropage);
 996        }
 997}
 998
 999static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1000{
1001        struct uffdio_zeropage uffdio_zeropage;
1002        int ret;
1003        unsigned long has_zeropage;
1004        __s64 res;
1005
1006        has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
1007
1008        if (offset >= nr_pages * page_size)
1009                err("unexpected offset %lu", offset);
1010        uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1011        uffdio_zeropage.range.len = page_size;
1012        uffdio_zeropage.mode = 0;
1013        ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1014        res = uffdio_zeropage.zeropage;
1015        if (ret) {
1016                /* real retval in ufdio_zeropage.zeropage */
1017                if (has_zeropage)
1018                        err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
1019                else if (res != -EINVAL)
1020                        err("UFFDIO_ZEROPAGE not -EINVAL");
1021        } else if (has_zeropage) {
1022                if (res != page_size) {
1023                        err("UFFDIO_ZEROPAGE unexpected size");
1024                } else {
1025                        if (test_uffdio_zeropage_eexist && retry) {
1026                                test_uffdio_zeropage_eexist = false;
1027                                retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1028                                                      offset);
1029                        }
1030                        return 1;
1031                }
1032        } else
1033                err("UFFDIO_ZEROPAGE succeeded");
1034
1035        return 0;
1036}
1037
/*
 * Convenience wrapper around __uffdio_zeropage() that never requests
 * the -EEXIST retry check.  Returns whatever __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
        const bool retry_eexist = false;

        return __uffdio_zeropage(ufd, offset, retry_eexist);
}
1042
1043/* exercise UFFDIO_ZEROPAGE */
1044static int userfaultfd_zeropage_test(void)
1045{
1046        struct uffdio_register uffdio_register;
1047        unsigned long expected_ioctls;
1048
1049        printf("testing UFFDIO_ZEROPAGE: ");
1050        fflush(stdout);
1051
1052        uffd_test_ctx_init(0);
1053
1054        uffdio_register.range.start = (unsigned long) area_dst;
1055        uffdio_register.range.len = nr_pages * page_size;
1056        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1057        if (test_uffdio_wp)
1058                uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1059        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1060                err("register failure");
1061
1062        expected_ioctls = uffd_test_ops->expected_ioctls;
1063        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1064                err("unexpected missing ioctl for anon memory");
1065
1066        if (uffdio_zeropage(uffd, 0))
1067                if (my_bcmp(area_dst, zeropage, page_size))
1068                        err("zeropage is not zero");
1069
1070        printf("done.\n");
1071        return 0;
1072}
1073
1074static int userfaultfd_events_test(void)
1075{
1076        struct uffdio_register uffdio_register;
1077        unsigned long expected_ioctls;
1078        pthread_t uffd_mon;
1079        int err, features;
1080        pid_t pid;
1081        char c;
1082        struct uffd_stats stats = { 0 };
1083
1084        printf("testing events (fork, remap, remove): ");
1085        fflush(stdout);
1086
1087        features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1088                UFFD_FEATURE_EVENT_REMOVE;
1089        uffd_test_ctx_init(features);
1090
1091        fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1092
1093        uffdio_register.range.start = (unsigned long) area_dst;
1094        uffdio_register.range.len = nr_pages * page_size;
1095        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1096        if (test_uffdio_wp)
1097                uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1098        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1099                err("register failure");
1100
1101        expected_ioctls = uffd_test_ops->expected_ioctls;
1102        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1103                err("unexpected missing ioctl for anon memory");
1104
1105        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1106                err("uffd_poll_thread create");
1107
1108        pid = fork();
1109        if (pid < 0)
1110                err("fork");
1111
1112        if (!pid)
1113                exit(faulting_process(0));
1114
1115        waitpid(pid, &err, 0);
1116        if (err)
1117                err("faulting process failed");
1118        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1119                err("pipe write");
1120        if (pthread_join(uffd_mon, NULL))
1121                return 1;
1122
1123        uffd_stats_report(&stats, 1);
1124
1125        return stats.missing_faults != nr_pages;
1126}
1127
1128static int userfaultfd_sig_test(void)
1129{
1130        struct uffdio_register uffdio_register;
1131        unsigned long expected_ioctls;
1132        unsigned long userfaults;
1133        pthread_t uffd_mon;
1134        int err, features;
1135        pid_t pid;
1136        char c;
1137        struct uffd_stats stats = { 0 };
1138
1139        printf("testing signal delivery: ");
1140        fflush(stdout);
1141
1142        features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1143        uffd_test_ctx_init(features);
1144
1145        fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1146
1147        uffdio_register.range.start = (unsigned long) area_dst;
1148        uffdio_register.range.len = nr_pages * page_size;
1149        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1150        if (test_uffdio_wp)
1151                uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1152        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1153                err("register failure");
1154
1155        expected_ioctls = uffd_test_ops->expected_ioctls;
1156        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1157                err("unexpected missing ioctl for anon memory");
1158
1159        if (faulting_process(1))
1160                err("faulting process failed");
1161
1162        uffd_test_ops->release_pages(area_dst);
1163
1164        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1165                err("uffd_poll_thread create");
1166
1167        pid = fork();
1168        if (pid < 0)
1169                err("fork");
1170
1171        if (!pid)
1172                exit(faulting_process(2));
1173
1174        waitpid(pid, &err, 0);
1175        if (err)
1176                err("faulting process failed");
1177        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1178                err("pipe write");
1179        if (pthread_join(uffd_mon, (void **)&userfaults))
1180                return 1;
1181
1182        printf("done.\n");
1183        if (userfaults)
1184                err("Signal test failed, userfaults: %ld", userfaults);
1185
1186        return userfaults != 0;
1187}
1188
1189static int userfaultfd_minor_test(void)
1190{
1191        struct uffdio_register uffdio_register;
1192        unsigned long expected_ioctls;
1193        unsigned long p;
1194        pthread_t uffd_mon;
1195        uint8_t expected_byte;
1196        void *expected_page;
1197        char c;
1198        struct uffd_stats stats = { 0 };
1199        uint64_t req_features, features_out;
1200
1201        if (!test_uffdio_minor)
1202                return 0;
1203
1204        printf("testing minor faults: ");
1205        fflush(stdout);
1206
1207        if (test_type == TEST_HUGETLB)
1208                req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
1209        else if (test_type == TEST_SHMEM)
1210                req_features = UFFD_FEATURE_MINOR_SHMEM;
1211        else
1212                return 1;
1213
1214        features_out = req_features;
1215        uffd_test_ctx_init_ext(&features_out);
1216        /* If kernel reports required features aren't supported, skip test. */
1217        if ((features_out & req_features) != req_features) {
1218                printf("skipping test due to lack of feature support\n");
1219                fflush(stdout);
1220                return 0;
1221        }
1222
1223        uffdio_register.range.start = (unsigned long)area_dst_alias;
1224        uffdio_register.range.len = nr_pages * page_size;
1225        uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1226        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1227                err("register failure");
1228
1229        expected_ioctls = uffd_test_ops->expected_ioctls;
1230        expected_ioctls |= 1 << _UFFDIO_CONTINUE;
1231        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1232                err("unexpected missing ioctl(s)");
1233
1234        /*
1235         * After registering with UFFD, populate the non-UFFD-registered side of
1236         * the shared mapping. This should *not* trigger any UFFD minor faults.
1237         */
1238        for (p = 0; p < nr_pages; ++p) {
1239                memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1240                       page_size);
1241        }
1242
1243        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1244                err("uffd_poll_thread create");
1245
1246        /*
1247         * Read each of the pages back using the UFFD-registered mapping. We
1248         * expect that the first time we touch a page, it will result in a minor
1249         * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1250         * page's contents, and then issuing a CONTINUE ioctl.
1251         */
1252
1253        if (posix_memalign(&expected_page, page_size, page_size))
1254                err("out of memory");
1255
1256        for (p = 0; p < nr_pages; ++p) {
1257                expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1258                memset(expected_page, expected_byte, page_size);
1259                if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1260                            page_size))
1261                        err("unexpected page contents after minor fault");
1262        }
1263
1264        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1265                err("pipe write");
1266        if (pthread_join(uffd_mon, NULL))
1267                return 1;
1268
1269        uffd_stats_report(&stats, 1);
1270
1271        return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1272}
1273
/* Single-bit mask of type unsigned long long with bit 'nr' set. */
#define BIT_ULL(nr)             (1ULL << (nr))

/*
 * Flag bits of a 64-bit /proc/<pid>/pagemap entry.
 * NOTE(review): bit meanings follow the kernel pagemap documentation;
 * only PM_UFFD_WP and PM_MMAP_EXCLUSIVE are used in the code visible here.
 */
#define PM_SOFT_DIRTY           BIT_ULL(55)     /* pte is soft-dirty */
#define PM_MMAP_EXCLUSIVE       BIT_ULL(56)     /* page exclusively mapped */
#define PM_UFFD_WP              BIT_ULL(57)     /* uffd write-protected */
#define PM_FILE                 BIT_ULL(61)     /* file page or shared anon */
#define PM_SWAP                 BIT_ULL(62)     /* page is swapped */
#define PM_PRESENT              BIT_ULL(63)     /* page is present */
1281
/*
 * Open /proc/self/pagemap read-only.
 *
 * Returns the file descriptor; an open failure is reported through err().
 */
static int pagemap_open(void)
{
        int fd;

        fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd >= 0)
                return fd;

        err("open pagemap");
        return fd;
}
1291
/*
 * Read the pagemap entry that describes the page containing @vaddr.
 *
 * @fd:    descriptor of an open pagemap file
 * @vaddr: virtual address inside the page of interest
 *
 * pagemap holds one 64-bit entry per base page, so the entry index is
 * the address divided by the system base page size.  The size is
 * queried at run time rather than assuming 4 KiB pages.
 *
 * Returns the raw 64-bit entry; a short or failed read is reported
 * through err().
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
        const uint64_t base_page_size = (uint64_t)sysconf(_SC_PAGESIZE);
        const uint64_t index = (uint64_t)(uintptr_t)vaddr / base_page_size;
        uint64_t value;
        ssize_t ret;

        ret = pread(fd, &value, sizeof(value), index * sizeof(value));
        if (ret != (ssize_t)sizeof(value))
                err("pread() on pagemap failed");

        return value;
}
1304
/*
 * Check that the uffd-wp bit of pagemap entry 'value' matches 'wp'.
 * Kept as a macro so that __LINE__ works in err() for the caller.
 * Both arguments are parenthesised so that compound expressions are
 * evaluated as a whole; 'value' is expanded twice, so it must not have
 * side effects.
 */
#define  pagemap_check_wp(value, wp) do {                               \
                if (!!((value) & PM_UFFD_WP) != (wp))                   \
                        err("pagemap uffd-wp bit error: 0x%"PRIx64, (value)); \
        } while (0)
1310
1311static int pagemap_test_fork(bool present)
1312{
1313        pid_t child = fork();
1314        uint64_t value;
1315        int fd, result;
1316
1317        if (!child) {
1318                /* Open the pagemap fd of the child itself */
1319                fd = pagemap_open();
1320                value = pagemap_read_vaddr(fd, area_dst);
1321                /*
1322                 * After fork() uffd-wp bit should be gone as long as we're
1323                 * without UFFD_FEATURE_EVENT_FORK
1324                 */
1325                pagemap_check_wp(value, false);
1326                /* Succeed */
1327                exit(0);
1328        }
1329        waitpid(child, &result, 0);
1330        return result;
1331}
1332
1333static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1334{
1335        struct uffdio_register uffdio_register;
1336        int pagemap_fd;
1337        uint64_t value;
1338
1339        /* Pagemap tests uffd-wp only */
1340        if (!test_uffdio_wp)
1341                return;
1342
1343        /* Not enough memory to test this page size */
1344        if (test_pgsize > nr_pages * page_size)
1345                return;
1346
1347        printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1348        /* Flush so it doesn't flush twice in parent/child later */
1349        fflush(stdout);
1350
1351        uffd_test_ctx_init(0);
1352
1353        if (test_pgsize > page_size) {
1354                /* This is a thp test */
1355                if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1356                        err("madvise(MADV_HUGEPAGE) failed");
1357        } else if (test_pgsize == page_size) {
1358                /* This is normal page test; force no thp */
1359                if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1360                        err("madvise(MADV_NOHUGEPAGE) failed");
1361        }
1362
1363        uffdio_register.range.start = (unsigned long) area_dst;
1364        uffdio_register.range.len = nr_pages * page_size;
1365        uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1366        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1367                err("register failed");
1368
1369        pagemap_fd = pagemap_open();
1370
1371        /* Touch the page */
1372        *area_dst = 1;
1373        wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1374        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1375        pagemap_check_wp(value, true);
1376        /* Make sure uffd-wp bit dropped when fork */
1377        if (pagemap_test_fork(true))
1378                err("Detected stall uffd-wp bit in child");
1379
1380        /* Exclusive required or PAGEOUT won't work */
1381        if (!(value & PM_MMAP_EXCLUSIVE))
1382                err("multiple mapping detected: 0x%"PRIx64, value);
1383
1384        if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1385                err("madvise(MADV_PAGEOUT) failed");
1386
1387        /* Uffd-wp should persist even swapped out */
1388        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1389        pagemap_check_wp(value, true);
1390        /* Make sure uffd-wp bit dropped when fork */
1391        if (pagemap_test_fork(false))
1392                err("Detected stall uffd-wp bit in child");
1393
1394        /* Unprotect; this tests swap pte modifications */
1395        wp_range(uffd, (uint64_t)area_dst, page_size, false);
1396        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1397        pagemap_check_wp(value, false);
1398
1399        /* Fault in the page from disk */
1400        *area_dst = 2;
1401        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1402        pagemap_check_wp(value, false);
1403
1404        close(pagemap_fd);
1405        printf("done\n");
1406}
1407
1408static int userfaultfd_stress(void)
1409{
1410        void *area;
1411        char *tmp_area;
1412        unsigned long nr;
1413        struct uffdio_register uffdio_register;
1414        struct uffd_stats uffd_stats[nr_cpus];
1415
1416        uffd_test_ctx_init(0);
1417
1418        if (posix_memalign(&area, page_size, page_size))
1419                err("out of memory");
1420        zeropage = area;
1421        bzero(zeropage, page_size);
1422
1423        pthread_mutex_lock(&uffd_read_mutex);
1424
1425        pthread_attr_init(&attr);
1426        pthread_attr_setstacksize(&attr, 16*1024*1024);
1427
1428        while (bounces--) {
1429                unsigned long expected_ioctls;
1430
1431                printf("bounces: %d, mode:", bounces);
1432                if (bounces & BOUNCE_RANDOM)
1433                        printf(" rnd");
1434                if (bounces & BOUNCE_RACINGFAULTS)
1435                        printf(" racing");
1436                if (bounces & BOUNCE_VERIFY)
1437                        printf(" ver");
1438                if (bounces & BOUNCE_POLL)
1439                        printf(" poll");
1440                else
1441                        printf(" read");
1442                printf(", ");
1443                fflush(stdout);
1444
1445                if (bounces & BOUNCE_POLL)
1446                        fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1447                else
1448                        fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1449
1450                /* register */
1451                uffdio_register.range.start = (unsigned long) area_dst;
1452                uffdio_register.range.len = nr_pages * page_size;
1453                uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1454                if (test_uffdio_wp)
1455                        uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1456                if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1457                        err("register failure");
1458                expected_ioctls = uffd_test_ops->expected_ioctls;
1459                if ((uffdio_register.ioctls & expected_ioctls) !=
1460                    expected_ioctls)
1461                        err("unexpected missing ioctl for anon memory");
1462
1463                if (area_dst_alias) {
1464                        uffdio_register.range.start = (unsigned long)
1465                                area_dst_alias;
1466                        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1467                                err("register failure alias");
1468                }
1469
1470                /*
1471                 * The madvise done previously isn't enough: some
1472                 * uffd_thread could have read userfaults (one of
1473                 * those already resolved by the background thread)
1474                 * and it may be in the process of calling
1475                 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1476                 * area_src and it would map a zero page in it (of
1477                 * course such a UFFDIO_COPY is perfectly safe as it'd
1478                 * return -EEXIST). The problem comes at the next
1479                 * bounce though: that racing UFFDIO_COPY would
1480                 * generate zeropages in the area_src, so invalidating
1481                 * the previous MADV_DONTNEED. Without this additional
1482                 * MADV_DONTNEED those zeropages leftovers in the
1483                 * area_src would lead to -EEXIST failure during the
1484                 * next bounce, effectively leaving a zeropage in the
1485                 * area_dst.
1486                 *
1487                 * Try to comment this out madvise to see the memory
1488                 * corruption being caught pretty quick.
1489                 *
1490                 * khugepaged is also inhibited to collapse THP after
1491                 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1492                 * required to MADV_DONTNEED here.
1493                 */
1494                uffd_test_ops->release_pages(area_dst);
1495
1496                uffd_stats_reset(uffd_stats, nr_cpus);
1497
1498                /* bounce pass */
1499                if (stress(uffd_stats))
1500                        return 1;
1501
1502                /* Clear all the write protections if there is any */
1503                if (test_uffdio_wp)
1504                        wp_range(uffd, (unsigned long)area_dst,
1505                                 nr_pages * page_size, false);
1506
1507                /* unregister */
1508                if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1509                        err("unregister failure");
1510                if (area_dst_alias) {
1511                        uffdio_register.range.start = (unsigned long) area_dst;
1512                        if (ioctl(uffd, UFFDIO_UNREGISTER,
1513                                  &uffdio_register.range))
1514                                err("unregister failure alias");
1515                }
1516
1517                /* verification */
1518                if (bounces & BOUNCE_VERIFY)
1519                        for (nr = 0; nr < nr_pages; nr++)
1520                                if (*area_count(area_dst, nr) != count_verify[nr])
1521                                        err("error area_count %llu %llu %lu\n",
1522                                            *area_count(area_src, nr),
1523                                            count_verify[nr], nr);
1524
1525                /* prepare next bounce */
1526                tmp_area = area_src;
1527                area_src = area_dst;
1528                area_dst = tmp_area;
1529
1530                tmp_area = area_src_alias;
1531                area_src_alias = area_dst_alias;
1532                area_dst_alias = tmp_area;
1533
1534                uffd_stats_report(uffd_stats, nr_cpus);
1535        }
1536
1537        if (test_type == TEST_ANON) {
1538                /*
1539                 * shmem/hugetlb won't be able to run since they have different
1540                 * behavior on fork() (file-backed memory normally drops ptes
1541                 * directly when fork), meanwhile the pagemap test will verify
1542                 * pgtable entry of fork()ed child.
1543                 */
1544                userfaultfd_pagemap_test(page_size);
1545                /*
1546                 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1547                 * currently the only one that supports uffd-wp
1548                 */
1549                userfaultfd_pagemap_test(page_size * 512);
1550        }
1551
1552        return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1553                || userfaultfd_events_test() || userfaultfd_minor_test();
1554}
1555
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" line and return the value
 * converted from kB to bytes.  Returns 0 when the file cannot be opened
 * or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
        FILE *meminfo;
        char *buf = NULL;
        size_t cap = 0;
        unsigned long size_kb;
        unsigned long bytes = 0;

        meminfo = fopen("/proc/meminfo", "r");
        if (!meminfo)
                return 0;

        for (;;) {
                if (getline(&buf, &cap, meminfo) <= 0)
                        break;
                if (sscanf(buf, "Hugepagesize:       %lu kB", &size_kb) != 1)
                        continue;
                /* kB -> bytes */
                bytes = size_kb << 10;
                break;
        }

        free(buf);
        fclose(meminfo);
        return bytes;
}
1579
1580static void set_test_type(const char *type)
1581{
1582        if (!strcmp(type, "anon")) {
1583                test_type = TEST_ANON;
1584                uffd_test_ops = &anon_uffd_test_ops;
1585                /* Only enable write-protect test for anonymous test */
1586                test_uffdio_wp = true;
1587        } else if (!strcmp(type, "hugetlb")) {
1588                test_type = TEST_HUGETLB;
1589                uffd_test_ops = &hugetlb_uffd_test_ops;
1590        } else if (!strcmp(type, "hugetlb_shared")) {
1591                map_shared = true;
1592                test_type = TEST_HUGETLB;
1593                uffd_test_ops = &hugetlb_uffd_test_ops;
1594                /* Minor faults require shared hugetlb; only enable here. */
1595                test_uffdio_minor = true;
1596        } else if (!strcmp(type, "shmem")) {
1597                map_shared = true;
1598                test_type = TEST_SHMEM;
1599                uffd_test_ops = &shmem_uffd_test_ops;
1600                test_uffdio_minor = true;
1601        } else {
1602                err("Unknown test type: %s", type);
1603        }
1604
1605        if (test_type == TEST_HUGETLB)
1606                page_size = default_huge_page_size();
1607        else
1608                page_size = sysconf(_SC_PAGE_SIZE);
1609
1610        if (!page_size)
1611                err("Unable to determine page size");
1612        if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1613            > page_size)
1614                err("Impossible to run this test");
1615}
1616
1617static void sigalrm(int sig)
1618{
1619        if (sig != SIGALRM)
1620                abort();
1621        test_uffdio_copy_eexist = true;
1622        test_uffdio_zeropage_eexist = true;
1623        alarm(ALARM_INTERVAL_SECS);
1624}
1625
1626int main(int argc, char **argv)
1627{
1628        if (argc < 4)
1629                usage();
1630
1631        if (signal(SIGALRM, sigalrm) == SIG_ERR)
1632                err("failed to arm SIGALRM");
1633        alarm(ALARM_INTERVAL_SECS);
1634
1635        set_test_type(argv[1]);
1636
1637        nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1638        nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1639                nr_cpus;
1640        if (!nr_pages_per_cpu) {
1641                _err("invalid MiB");
1642                usage();
1643        }
1644
1645        bounces = atoi(argv[3]);
1646        if (bounces <= 0) {
1647                _err("invalid bounces");
1648                usage();
1649        }
1650        nr_pages = nr_pages_per_cpu * nr_cpus;
1651
1652        if (test_type == TEST_HUGETLB) {
1653                if (argc < 5)
1654                        usage();
1655                huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1656                if (huge_fd < 0)
1657                        err("Open of %s failed", argv[4]);
1658                if (ftruncate(huge_fd, 0))
1659                        err("ftruncate %s to size 0 failed", argv[4]);
1660        } else if (test_type == TEST_SHMEM) {
1661                shm_fd = memfd_create(argv[0], 0);
1662                if (shm_fd < 0)
1663                        err("memfd_create");
1664                if (ftruncate(shm_fd, nr_pages * page_size * 2))
1665                        err("ftruncate");
1666                if (fallocate(shm_fd,
1667                              FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
1668                              nr_pages * page_size * 2))
1669                        err("fallocate");
1670        }
1671        printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1672               nr_pages, nr_pages_per_cpu);
1673        return userfaultfd_stress();
1674}
1675
1676#else /* __NR_userfaultfd */
1677
1678#warning "missing __NR_userfaultfd definition"
1679
1680int main(void)
1681{
1682        printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1683        return KSFT_SKIP;
1684}
1685
1686#endif /* __NR_userfaultfd */
1687