linux/tools/testing/selftests/vm/userfaultfd.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Stress userfaultfd syscall.
   4 *
   5 *  Copyright (C) 2015  Red Hat, Inc.
   6 *
   7 * This test allocates two virtual areas and bounces the physical
   8 * memory across the two virtual areas (from area_src to area_dst)
   9 * using userfaultfd.
  10 *
  11 * There are three threads running per CPU:
  12 *
  13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
  14 *    page of the area_dst (while the physical page may still be in
  15 *    area_src), and increments a per-page counter in the same page,
  16 *    and checks its value against a verification region.
  17 *
  18 * 2) another per-CPU thread handles the userfaults generated by
  19 *    thread 1 above. userfaultfd blocking reads or poll() modes are
  20 *    exercised interleaved.
  21 *
  22 * 3) one last per-CPU thread transfers the memory in the background
  23 *    at maximum bandwidth (if not already transferred by thread
  24 *    2). Each cpu thread takes care of transferring a portion of the
  25 *    area.
  26 *
  27 * When all threads of type 3 completed the transfer, one bounce is
  28 * complete. area_src and area_dst are then swapped. All threads are
  29 * respawned and so the bounce is immediately restarted in the
  30 * opposite direction.
  31 *
  32 * per-CPU threads 1 by triggering userfaults inside
  33 * pthread_mutex_lock will also verify the atomicity of the memory
  34 * transfer (UFFDIO_COPY).
  35 */
  36
  37#define _GNU_SOURCE
  38#include <stdio.h>
  39#include <errno.h>
  40#include <unistd.h>
  41#include <stdlib.h>
  42#include <sys/types.h>
  43#include <sys/stat.h>
  44#include <fcntl.h>
  45#include <time.h>
  46#include <signal.h>
  47#include <poll.h>
  48#include <string.h>
  49#include <sys/mman.h>
  50#include <sys/syscall.h>
  51#include <sys/ioctl.h>
  52#include <sys/wait.h>
  53#include <pthread.h>
  54#include <linux/userfaultfd.h>
  55#include <setjmp.h>
  56#include <stdbool.h>
  57#include <assert.h>
  58#include <inttypes.h>
  59#include <stdint.h>
  60
  61#include "../kselftest.h"
  62
  63#ifdef __NR_userfaultfd
  64
  65static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
  66
  67#define BOUNCE_RANDOM           (1<<0)
  68#define BOUNCE_RACINGFAULTS     (1<<1)
  69#define BOUNCE_VERIFY           (1<<2)
  70#define BOUNCE_POLL             (1<<3)
  71static int bounces;
  72
  73#define TEST_ANON       1
  74#define TEST_HUGETLB    2
  75#define TEST_SHMEM      3
  76static int test_type;
  77
  78/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
  79#define ALARM_INTERVAL_SECS 10
  80static volatile bool test_uffdio_copy_eexist = true;
  81static volatile bool test_uffdio_zeropage_eexist = true;
  82/* Whether to test uffd write-protection */
  83static bool test_uffdio_wp = false;
  84/* Whether to test uffd minor faults */
  85static bool test_uffdio_minor = false;
  86
  87static bool map_shared;
  88static int shm_fd;
  89static int huge_fd;
  90static char *huge_fd_off0;
  91static unsigned long long *count_verify;
  92static int uffd = -1;
  93static int uffd_flags, finished, *pipefd;
  94static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
  95static char *zeropage;
  96pthread_attr_t attr;
  97
  98/* Userfaultfd test statistics */
  99struct uffd_stats {
 100        int cpu;
 101        unsigned long missing_faults;
 102        unsigned long wp_faults;
 103        unsigned long minor_faults;
 104};
 105
 106/* pthread_mutex_t starts at page offset 0 */
 107#define area_mutex(___area, ___nr)                                      \
 108        ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
 109/*
 110 * count is placed in the page after pthread_mutex_t naturally aligned
 111 * to avoid non alignment faults on non-x86 archs.
 112 */
 113#define area_count(___area, ___nr)                                      \
 114        ((volatile unsigned long long *) ((unsigned long)               \
 115                                 ((___area) + (___nr)*page_size +       \
 116                                  sizeof(pthread_mutex_t) +             \
 117                                  sizeof(unsigned long long) - 1) &     \
 118                                 ~(unsigned long)(sizeof(unsigned long long) \
 119                                                  -  1)))
 120
 121const char *examples =
 122    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
 123    "./userfaultfd anon 100 99999\n\n"
 124    "# Run share memory test on 1GiB region with 99 bounces:\n"
 125    "./userfaultfd shmem 1000 99\n\n"
 126    "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
 127    "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
 128    "# Run the same hugetlb test but using shmem:\n"
 129    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
 130    "# 10MiB-~6GiB 999 bounces anonymous test, "
 131    "continue forever unless an error triggers\n"
 132    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
 133
 134static void usage(void)
 135{
 136        fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
 137                "[hugetlbfs_file]\n\n");
 138        fprintf(stderr, "Supported <test type>: anon, hugetlb, "
 139                "hugetlb_shared, shmem\n\n");
 140        fprintf(stderr, "Examples:\n\n");
 141        fprintf(stderr, "%s", examples);
 142        exit(1);
 143}
 144
 145#define _err(fmt, ...)                                          \
 146        do {                                                    \
 147                int ret = errno;                                \
 148                fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);  \
 149                fprintf(stderr, " (errno=%d, line=%d)\n",       \
 150                        ret, __LINE__);                         \
 151        } while (0)
 152
 153#define err(fmt, ...)                           \
 154        do {                                    \
 155                _err(fmt, ##__VA_ARGS__);       \
 156                exit(1);                        \
 157        } while (0)
 158
 159static void uffd_stats_reset(struct uffd_stats *uffd_stats,
 160                             unsigned long n_cpus)
 161{
 162        int i;
 163
 164        for (i = 0; i < n_cpus; i++) {
 165                uffd_stats[i].cpu = i;
 166                uffd_stats[i].missing_faults = 0;
 167                uffd_stats[i].wp_faults = 0;
 168                uffd_stats[i].minor_faults = 0;
 169        }
 170}
 171
 172static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
 173{
 174        int i;
 175        unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
 176
 177        for (i = 0; i < n_cpus; i++) {
 178                miss_total += stats[i].missing_faults;
 179                wp_total += stats[i].wp_faults;
 180                minor_total += stats[i].minor_faults;
 181        }
 182
 183        printf("userfaults: ");
 184        if (miss_total) {
 185                printf("%llu missing (", miss_total);
 186                for (i = 0; i < n_cpus; i++)
 187                        printf("%lu+", stats[i].missing_faults);
 188                printf("\b) ");
 189        }
 190        if (wp_total) {
 191                printf("%llu wp (", wp_total);
 192                for (i = 0; i < n_cpus; i++)
 193                        printf("%lu+", stats[i].wp_faults);
 194                printf("\b) ");
 195        }
 196        if (minor_total) {
 197                printf("%llu minor (", minor_total);
 198                for (i = 0; i < n_cpus; i++)
 199                        printf("%lu+", stats[i].minor_faults);
 200                printf("\b)");
 201        }
 202        printf("\n");
 203}
 204
 205static void anon_release_pages(char *rel_area)
 206{
 207        if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
 208                err("madvise(MADV_DONTNEED) failed");
 209}
 210
 211static void anon_allocate_area(void **alloc_area)
 212{
 213        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 214                           MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 215        if (*alloc_area == MAP_FAILED)
 216                err("mmap of anonymous memory failed");
 217}
 218
 219static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 220{
 221}
 222
 223static void hugetlb_release_pages(char *rel_area)
 224{
 225        if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 226                      rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
 227                      nr_pages * page_size))
 228                err("fallocate() failed");
 229}
 230
 231static void hugetlb_allocate_area(void **alloc_area)
 232{
 233        void *area_alias = NULL;
 234        char **alloc_area_alias;
 235
 236        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 237                           (map_shared ? MAP_SHARED : MAP_PRIVATE) |
 238                           MAP_HUGETLB,
 239                           huge_fd, *alloc_area == area_src ? 0 :
 240                           nr_pages * page_size);
 241        if (*alloc_area == MAP_FAILED)
 242                err("mmap of hugetlbfs file failed");
 243
 244        if (map_shared) {
 245                area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 246                                  MAP_SHARED | MAP_HUGETLB,
 247                                  huge_fd, *alloc_area == area_src ? 0 :
 248                                  nr_pages * page_size);
 249                if (area_alias == MAP_FAILED)
 250                        err("mmap of hugetlb file alias failed");
 251        }
 252
 253        if (*alloc_area == area_src) {
 254                huge_fd_off0 = *alloc_area;
 255                alloc_area_alias = &area_src_alias;
 256        } else {
 257                alloc_area_alias = &area_dst_alias;
 258        }
 259        if (area_alias)
 260                *alloc_area_alias = area_alias;
 261}
 262
 263static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 264{
 265        if (!map_shared)
 266                return;
 267        /*
 268         * We can't zap just the pagetable with hugetlbfs because
 269         * MADV_DONTEED won't work. So exercise -EEXIST on a alias
 270         * mapping where the pagetables are not established initially,
 271         * this way we'll exercise the -EEXEC at the fs level.
 272         */
 273        *start = (unsigned long) area_dst_alias + offset;
 274}
 275
 276static void shmem_release_pages(char *rel_area)
 277{
 278        if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
 279                err("madvise(MADV_REMOVE) failed");
 280}
 281
 282static void shmem_allocate_area(void **alloc_area)
 283{
 284        void *area_alias = NULL;
 285        bool is_src = alloc_area == (void **)&area_src;
 286        unsigned long offset = is_src ? 0 : nr_pages * page_size;
 287
 288        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 289                           MAP_SHARED, shm_fd, offset);
 290        if (*alloc_area == MAP_FAILED)
 291                err("mmap of memfd failed");
 292
 293        area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 294                          MAP_SHARED, shm_fd, offset);
 295        if (area_alias == MAP_FAILED)
 296                err("mmap of memfd alias failed");
 297
 298        if (is_src)
 299                area_src_alias = area_alias;
 300        else
 301                area_dst_alias = area_alias;
 302}
 303
 304static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 305{
 306        *start = (unsigned long)area_dst_alias + offset;
 307}
 308
 309struct uffd_test_ops {
 310        unsigned long expected_ioctls;
 311        void (*allocate_area)(void **alloc_area);
 312        void (*release_pages)(char *rel_area);
 313        void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
 314};
 315
 316#define SHMEM_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
 317                                         (1 << _UFFDIO_COPY) | \
 318                                         (1 << _UFFDIO_ZEROPAGE))
 319
 320#define ANON_EXPECTED_IOCTLS            ((1 << _UFFDIO_WAKE) | \
 321                                         (1 << _UFFDIO_COPY) | \
 322                                         (1 << _UFFDIO_ZEROPAGE) | \
 323                                         (1 << _UFFDIO_WRITEPROTECT))
 324
 325static struct uffd_test_ops anon_uffd_test_ops = {
 326        .expected_ioctls = ANON_EXPECTED_IOCTLS,
 327        .allocate_area  = anon_allocate_area,
 328        .release_pages  = anon_release_pages,
 329        .alias_mapping = noop_alias_mapping,
 330};
 331
 332static struct uffd_test_ops shmem_uffd_test_ops = {
 333        .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
 334        .allocate_area  = shmem_allocate_area,
 335        .release_pages  = shmem_release_pages,
 336        .alias_mapping = shmem_alias_mapping,
 337};
 338
 339static struct uffd_test_ops hugetlb_uffd_test_ops = {
 340        .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
 341        .allocate_area  = hugetlb_allocate_area,
 342        .release_pages  = hugetlb_release_pages,
 343        .alias_mapping = hugetlb_alias_mapping,
 344};
 345
 346static struct uffd_test_ops *uffd_test_ops;
 347
 348static void userfaultfd_open(uint64_t *features)
 349{
 350        struct uffdio_api uffdio_api;
 351
 352        uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 353        if (uffd < 0)
 354                err("userfaultfd syscall not available in this kernel");
 355        uffd_flags = fcntl(uffd, F_GETFD, NULL);
 356
 357        uffdio_api.api = UFFD_API;
 358        uffdio_api.features = *features;
 359        if (ioctl(uffd, UFFDIO_API, &uffdio_api))
 360                err("UFFDIO_API failed.\nPlease make sure to "
 361                    "run with either root or ptrace capability.");
 362        if (uffdio_api.api != UFFD_API)
 363                err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
 364
 365        *features = uffdio_api.features;
 366}
 367
 368static inline void munmap_area(void **area)
 369{
 370        if (*area)
 371                if (munmap(*area, nr_pages * page_size))
 372                        err("munmap");
 373
 374        *area = NULL;
 375}
 376
 377static void uffd_test_ctx_clear(void)
 378{
 379        size_t i;
 380
 381        if (pipefd) {
 382                for (i = 0; i < nr_cpus * 2; ++i) {
 383                        if (close(pipefd[i]))
 384                                err("close pipefd");
 385                }
 386                free(pipefd);
 387                pipefd = NULL;
 388        }
 389
 390        if (count_verify) {
 391                free(count_verify);
 392                count_verify = NULL;
 393        }
 394
 395        if (uffd != -1) {
 396                if (close(uffd))
 397                        err("close uffd");
 398                uffd = -1;
 399        }
 400
 401        huge_fd_off0 = NULL;
 402        munmap_area((void **)&area_src);
 403        munmap_area((void **)&area_src_alias);
 404        munmap_area((void **)&area_dst);
 405        munmap_area((void **)&area_dst_alias);
 406}
 407
 408static void uffd_test_ctx_init_ext(uint64_t *features)
 409{
 410        unsigned long nr, cpu;
 411
 412        uffd_test_ctx_clear();
 413
 414        uffd_test_ops->allocate_area((void **)&area_src);
 415        uffd_test_ops->allocate_area((void **)&area_dst);
 416
 417        userfaultfd_open(features);
 418
 419        count_verify = malloc(nr_pages * sizeof(unsigned long long));
 420        if (!count_verify)
 421                err("count_verify");
 422
 423        for (nr = 0; nr < nr_pages; nr++) {
 424                *area_mutex(area_src, nr) =
 425                        (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
 426                count_verify[nr] = *area_count(area_src, nr) = 1;
 427                /*
 428                 * In the transition between 255 to 256, powerpc will
 429                 * read out of order in my_bcmp and see both bytes as
 430                 * zero, so leave a placeholder below always non-zero
 431                 * after the count, to avoid my_bcmp to trigger false
 432                 * positives.
 433                 */
 434                *(area_count(area_src, nr) + 1) = 1;
 435        }
 436
 437        /*
 438         * After initialization of area_src, we must explicitly release pages
 439         * for area_dst to make sure it's fully empty.  Otherwise we could have
 440         * some area_dst pages be errornously initialized with zero pages,
 441         * hence we could hit memory corruption later in the test.
 442         *
 443         * One example is when THP is globally enabled, above allocate_area()
 444         * calls could have the two areas merged into a single VMA (as they
 445         * will have the same VMA flags so they're mergeable).  When we
 446         * initialize the area_src above, it's possible that some part of
 447         * area_dst could have been faulted in via one huge THP that will be
 448         * shared between area_src and area_dst.  It could cause some of the
 449         * area_dst won't be trapped by missing userfaults.
 450         *
 451         * This release_pages() will guarantee even if that happened, we'll
 452         * proactively split the thp and drop any accidentally initialized
 453         * pages within area_dst.
 454         */
 455        uffd_test_ops->release_pages(area_dst);
 456
 457        pipefd = malloc(sizeof(int) * nr_cpus * 2);
 458        if (!pipefd)
 459                err("pipefd");
 460        for (cpu = 0; cpu < nr_cpus; cpu++)
 461                if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
 462                        err("pipe");
 463}
 464
 465static inline void uffd_test_ctx_init(uint64_t features)
 466{
 467        uffd_test_ctx_init_ext(&features);
 468}
 469
 470static int my_bcmp(char *str1, char *str2, size_t n)
 471{
 472        unsigned long i;
 473        for (i = 0; i < n; i++)
 474                if (str1[i] != str2[i])
 475                        return 1;
 476        return 0;
 477}
 478
 479static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
 480{
 481        struct uffdio_writeprotect prms;
 482
 483        /* Write protection page faults */
 484        prms.range.start = start;
 485        prms.range.len = len;
 486        /* Undo write-protect, do wakeup after that */
 487        prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
 488
 489        if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
 490                err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
 491}
 492
 493static void continue_range(int ufd, __u64 start, __u64 len)
 494{
 495        struct uffdio_continue req;
 496        int ret;
 497
 498        req.range.start = start;
 499        req.range.len = len;
 500        req.mode = 0;
 501
 502        if (ioctl(ufd, UFFDIO_CONTINUE, &req))
 503                err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
 504                    (uint64_t)start);
 505
 506        /*
 507         * Error handling within the kernel for continue is subtly different
 508         * from copy or zeropage, so it may be a source of bugs. Trigger an
 509         * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
 510         */
 511        req.mapped = 0;
 512        ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
 513        if (ret >= 0 || req.mapped != -EEXIST)
 514                err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
 515                    ret, (int64_t) req.mapped);
 516}
 517
 518static void *locking_thread(void *arg)
 519{
 520        unsigned long cpu = (unsigned long) arg;
 521        struct random_data rand;
 522        unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
 523        int32_t rand_nr;
 524        unsigned long long count;
 525        char randstate[64];
 526        unsigned int seed;
 527
 528        if (bounces & BOUNCE_RANDOM) {
 529                seed = (unsigned int) time(NULL) - bounces;
 530                if (!(bounces & BOUNCE_RACINGFAULTS))
 531                        seed += cpu;
 532                bzero(&rand, sizeof(rand));
 533                bzero(&randstate, sizeof(randstate));
 534                if (initstate_r(seed, randstate, sizeof(randstate), &rand))
 535                        err("initstate_r failed");
 536        } else {
 537                page_nr = -bounces;
 538                if (!(bounces & BOUNCE_RACINGFAULTS))
 539                        page_nr += cpu * nr_pages_per_cpu;
 540        }
 541
 542        while (!finished) {
 543                if (bounces & BOUNCE_RANDOM) {
 544                        if (random_r(&rand, &rand_nr))
 545                                err("random_r failed");
 546                        page_nr = rand_nr;
 547                        if (sizeof(page_nr) > sizeof(rand_nr)) {
 548                                if (random_r(&rand, &rand_nr))
 549                                        err("random_r failed");
 550                                page_nr |= (((unsigned long) rand_nr) << 16) <<
 551                                           16;
 552                        }
 553                } else
 554                        page_nr += 1;
 555                page_nr %= nr_pages;
 556                pthread_mutex_lock(area_mutex(area_dst, page_nr));
 557                count = *area_count(area_dst, page_nr);
 558                if (count != count_verify[page_nr])
 559                        err("page_nr %lu memory corruption %llu %llu",
 560                            page_nr, count, count_verify[page_nr]);
 561                count++;
 562                *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
 563                pthread_mutex_unlock(area_mutex(area_dst, page_nr));
 564        }
 565
 566        return NULL;
 567}
 568
 569static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
 570                            unsigned long offset)
 571{
 572        uffd_test_ops->alias_mapping(&uffdio_copy->dst,
 573                                     uffdio_copy->len,
 574                                     offset);
 575        if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
 576                /* real retval in ufdio_copy.copy */
 577                if (uffdio_copy->copy != -EEXIST)
 578                        err("UFFDIO_COPY retry error: %"PRId64,
 579                            (int64_t)uffdio_copy->copy);
 580        } else {
 581                err("UFFDIO_COPY retry unexpected: %"PRId64,
 582                    (int64_t)uffdio_copy->copy);
 583        }
 584}
 585
 586static void wake_range(int ufd, unsigned long addr, unsigned long len)
 587{
 588        struct uffdio_range uffdio_wake;
 589
 590        uffdio_wake.start = addr;
 591        uffdio_wake.len = len;
 592
 593        if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
 594                fprintf(stderr, "error waking %lu\n",
 595                        addr), exit(1);
 596}
 597
 598static int __copy_page(int ufd, unsigned long offset, bool retry)
 599{
 600        struct uffdio_copy uffdio_copy;
 601
 602        if (offset >= nr_pages * page_size)
 603                err("unexpected offset %lu\n", offset);
 604        uffdio_copy.dst = (unsigned long) area_dst + offset;
 605        uffdio_copy.src = (unsigned long) area_src + offset;
 606        uffdio_copy.len = page_size;
 607        if (test_uffdio_wp)
 608                uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
 609        else
 610                uffdio_copy.mode = 0;
 611        uffdio_copy.copy = 0;
 612        if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
 613                /* real retval in ufdio_copy.copy */
 614                if (uffdio_copy.copy != -EEXIST)
 615                        err("UFFDIO_COPY error: %"PRId64,
 616                            (int64_t)uffdio_copy.copy);
 617                wake_range(ufd, uffdio_copy.dst, page_size);
 618        } else if (uffdio_copy.copy != page_size) {
 619                err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
 620        } else {
 621                if (test_uffdio_copy_eexist && retry) {
 622                        test_uffdio_copy_eexist = false;
 623                        retry_copy_page(ufd, &uffdio_copy, offset);
 624                }
 625                return 1;
 626        }
 627        return 0;
 628}
 629
 630static int copy_page_retry(int ufd, unsigned long offset)
 631{
 632        return __copy_page(ufd, offset, true);
 633}
 634
 635static int copy_page(int ufd, unsigned long offset)
 636{
 637        return __copy_page(ufd, offset, false);
 638}
 639
 640static int uffd_read_msg(int ufd, struct uffd_msg *msg)
 641{
 642        int ret = read(uffd, msg, sizeof(*msg));
 643
 644        if (ret != sizeof(*msg)) {
 645                if (ret < 0) {
 646                        if (errno == EAGAIN)
 647                                return 1;
 648                        err("blocking read error");
 649                } else {
 650                        err("short read");
 651                }
 652        }
 653
 654        return 0;
 655}
 656
 657static void uffd_handle_page_fault(struct uffd_msg *msg,
 658                                   struct uffd_stats *stats)
 659{
 660        unsigned long offset;
 661
 662        if (msg->event != UFFD_EVENT_PAGEFAULT)
 663                err("unexpected msg event %u", msg->event);
 664
 665        if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
 666                /* Write protect page faults */
 667                wp_range(uffd, msg->arg.pagefault.address, page_size, false);
 668                stats->wp_faults++;
 669        } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
 670                uint8_t *area;
 671                int b;
 672
 673                /*
 674                 * Minor page faults
 675                 *
 676                 * To prove we can modify the original range for testing
 677                 * purposes, we're going to bit flip this range before
 678                 * continuing.
 679                 *
 680                 * Note that this requires all minor page fault tests operate on
 681                 * area_dst (non-UFFD-registered) and area_dst_alias
 682                 * (UFFD-registered).
 683                 */
 684
 685                area = (uint8_t *)(area_dst +
 686                                   ((char *)msg->arg.pagefault.address -
 687                                    area_dst_alias));
 688                for (b = 0; b < page_size; ++b)
 689                        area[b] = ~area[b];
 690                continue_range(uffd, msg->arg.pagefault.address, page_size);
 691                stats->minor_faults++;
 692        } else {
 693                /* Missing page faults */
 694                if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
 695                        err("unexpected write fault");
 696
 697                offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
 698                offset &= ~(page_size-1);
 699
 700                if (copy_page(uffd, offset))
 701                        stats->missing_faults++;
 702        }
 703}
 704
 705static void *uffd_poll_thread(void *arg)
 706{
 707        struct uffd_stats *stats = (struct uffd_stats *)arg;
 708        unsigned long cpu = stats->cpu;
 709        struct pollfd pollfd[2];
 710        struct uffd_msg msg;
 711        struct uffdio_register uffd_reg;
 712        int ret;
 713        char tmp_chr;
 714
 715        pollfd[0].fd = uffd;
 716        pollfd[0].events = POLLIN;
 717        pollfd[1].fd = pipefd[cpu*2];
 718        pollfd[1].events = POLLIN;
 719
 720        for (;;) {
 721                ret = poll(pollfd, 2, -1);
 722                if (ret <= 0)
 723                        err("poll error: %d", ret);
 724                if (pollfd[1].revents & POLLIN) {
 725                        if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
 726                                err("read pipefd error");
 727                        break;
 728                }
 729                if (!(pollfd[0].revents & POLLIN))
 730                        err("pollfd[0].revents %d", pollfd[0].revents);
 731                if (uffd_read_msg(uffd, &msg))
 732                        continue;
 733                switch (msg.event) {
 734                default:
 735                        err("unexpected msg event %u\n", msg.event);
 736                        break;
 737                case UFFD_EVENT_PAGEFAULT:
 738                        uffd_handle_page_fault(&msg, stats);
 739                        break;
 740                case UFFD_EVENT_FORK:
 741                        close(uffd);
 742                        uffd = msg.arg.fork.ufd;
 743                        pollfd[0].fd = uffd;
 744                        break;
 745                case UFFD_EVENT_REMOVE:
 746                        uffd_reg.range.start = msg.arg.remove.start;
 747                        uffd_reg.range.len = msg.arg.remove.end -
 748                                msg.arg.remove.start;
 749                        if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
 750                                err("remove failure");
 751                        break;
 752                case UFFD_EVENT_REMAP:
 753                        area_dst = (char *)(unsigned long)msg.arg.remap.to;
 754                        break;
 755                }
 756        }
 757
 758        return NULL;
 759}
 760
 761pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
 762
 763static void *uffd_read_thread(void *arg)
 764{
 765        struct uffd_stats *stats = (struct uffd_stats *)arg;
 766        struct uffd_msg msg;
 767
 768        pthread_mutex_unlock(&uffd_read_mutex);
 769        /* from here cancellation is ok */
 770
 771        for (;;) {
 772                if (uffd_read_msg(uffd, &msg))
 773                        continue;
 774                uffd_handle_page_fault(&msg, stats);
 775        }
 776
 777        return NULL;
 778}
 779
 780static void *background_thread(void *arg)
 781{
 782        unsigned long cpu = (unsigned long) arg;
 783        unsigned long page_nr, start_nr, mid_nr, end_nr;
 784
 785        start_nr = cpu * nr_pages_per_cpu;
 786        end_nr = (cpu+1) * nr_pages_per_cpu;
 787        mid_nr = (start_nr + end_nr) / 2;
 788
 789        /* Copy the first half of the pages */
 790        for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
 791                copy_page_retry(uffd, page_nr * page_size);
 792
 793        /*
 794         * If we need to test uffd-wp, set it up now.  Then we'll have
 795         * at least the first half of the pages mapped already which
 796         * can be write-protected for testing
 797         */
 798        if (test_uffdio_wp)
 799                wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
 800                        nr_pages_per_cpu * page_size, true);
 801
 802        /*
 803         * Continue the 2nd half of the page copying, handling write
 804         * protection faults if any
 805         */
 806        for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
 807                copy_page_retry(uffd, page_nr * page_size);
 808
 809        return NULL;
 810}
 811
 812static int stress(struct uffd_stats *uffd_stats)
 813{
 814        unsigned long cpu;
 815        pthread_t locking_threads[nr_cpus];
 816        pthread_t uffd_threads[nr_cpus];
 817        pthread_t background_threads[nr_cpus];
 818
 819        finished = 0;
 820        for (cpu = 0; cpu < nr_cpus; cpu++) {
 821                if (pthread_create(&locking_threads[cpu], &attr,
 822                                   locking_thread, (void *)cpu))
 823                        return 1;
 824                if (bounces & BOUNCE_POLL) {
 825                        if (pthread_create(&uffd_threads[cpu], &attr,
 826                                           uffd_poll_thread,
 827                                           (void *)&uffd_stats[cpu]))
 828                                return 1;
 829                } else {
 830                        if (pthread_create(&uffd_threads[cpu], &attr,
 831                                           uffd_read_thread,
 832                                           (void *)&uffd_stats[cpu]))
 833                                return 1;
 834                        pthread_mutex_lock(&uffd_read_mutex);
 835                }
 836                if (pthread_create(&background_threads[cpu], &attr,
 837                                   background_thread, (void *)cpu))
 838                        return 1;
 839        }
 840        for (cpu = 0; cpu < nr_cpus; cpu++)
 841                if (pthread_join(background_threads[cpu], NULL))
 842                        return 1;
 843
 844        /*
 845         * Be strict and immediately zap area_src, the whole area has
 846         * been transferred already by the background treads. The
 847         * area_src could then be faulted in in a racy way by still
 848         * running uffdio_threads reading zeropages after we zapped
 849         * area_src (but they're guaranteed to get -EEXIST from
 850         * UFFDIO_COPY without writing zero pages into area_dst
 851         * because the background threads already completed).
 852         */
 853        uffd_test_ops->release_pages(area_src);
 854
 855        finished = 1;
 856        for (cpu = 0; cpu < nr_cpus; cpu++)
 857                if (pthread_join(locking_threads[cpu], NULL))
 858                        return 1;
 859
 860        for (cpu = 0; cpu < nr_cpus; cpu++) {
 861                char c;
 862                if (bounces & BOUNCE_POLL) {
 863                        if (write(pipefd[cpu*2+1], &c, 1) != 1)
 864                                err("pipefd write error");
 865                        if (pthread_join(uffd_threads[cpu],
 866                                         (void *)&uffd_stats[cpu]))
 867                                return 1;
 868                } else {
 869                        if (pthread_cancel(uffd_threads[cpu]))
 870                                return 1;
 871                        if (pthread_join(uffd_threads[cpu], NULL))
 872                                return 1;
 873                }
 874        }
 875
 876        return 0;
 877}
 878
 /*
  * Jump state shared by faulting_process() and the SIGBUS handler.
  * faulting_process() points sigbuf at jbuf for the signal tests; while
  * sigbuf is NULL, sighndl() calls abort() on SIGBUS instead of jumping.
  */
 sigjmp_buf jbuf, *sigbuf;
 880
 881static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
 882{
 883        if (sig == SIGBUS) {
 884                if (sigbuf)
 885                        siglongjmp(*sigbuf, 1);
 886                abort();
 887        }
 888}
 889
 890/*
 891 * For non-cooperative userfaultfd test we fork() a process that will
 892 * generate pagefaults, will mremap the area monitored by the
 893 * userfaultfd and at last this process will release the monitored
 894 * area.
 895 * For the anonymous and shared memory the area is divided into two
 896 * parts, the first part is accessed before mremap, and the second
 897 * part is accessed after mremap. Since hugetlbfs does not support
 898 * mremap, the entire monitored area is accessed in a single pass for
 899 * HUGETLB_TEST.
 900 * The release of the pages currently generates event for shmem and
 901 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
 902 * for hugetlb.
 903 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
 904 * monitored area, generate pagefaults and test that signal is delivered.
 905 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
 906 * test robustness use case - we release monitored area, fork a process
 907 * that will generate pagefaults and verify signal is generated.
 908 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
 909 * feature. Using monitor thread, verify no userfault events are generated.
 910 */
 911static int faulting_process(int signal_test)
 912{
 913        unsigned long nr;
 914        unsigned long long count;
 915        unsigned long split_nr_pages;
 916        unsigned long lastnr;
 917        struct sigaction act;
 918        unsigned long signalled = 0;
 919
 920        if (test_type != TEST_HUGETLB)
 921                split_nr_pages = (nr_pages + 1) / 2;
 922        else
 923                split_nr_pages = nr_pages;
 924
 925        if (signal_test) {
 926                sigbuf = &jbuf;
 927                memset(&act, 0, sizeof(act));
 928                act.sa_sigaction = sighndl;
 929                act.sa_flags = SA_SIGINFO;
 930                if (sigaction(SIGBUS, &act, 0))
 931                        err("sigaction");
 932                lastnr = (unsigned long)-1;
 933        }
 934
 935        for (nr = 0; nr < split_nr_pages; nr++) {
 936                int steps = 1;
 937                unsigned long offset = nr * page_size;
 938
 939                if (signal_test) {
 940                        if (sigsetjmp(*sigbuf, 1) != 0) {
 941                                if (steps == 1 && nr == lastnr)
 942                                        err("Signal repeated");
 943
 944                                lastnr = nr;
 945                                if (signal_test == 1) {
 946                                        if (steps == 1) {
 947                                                /* This is a MISSING request */
 948                                                steps++;
 949                                                if (copy_page(uffd, offset))
 950                                                        signalled++;
 951                                        } else {
 952                                                /* This is a WP request */
 953                                                assert(steps == 2);
 954                                                wp_range(uffd,
 955                                                         (__u64)area_dst +
 956                                                         offset,
 957                                                         page_size, false);
 958                                        }
 959                                } else {
 960                                        signalled++;
 961                                        continue;
 962                                }
 963                        }
 964                }
 965
 966                count = *area_count(area_dst, nr);
 967                if (count != count_verify[nr])
 968                        err("nr %lu memory corruption %llu %llu\n",
 969                            nr, count, count_verify[nr]);
 970                /*
 971                 * Trigger write protection if there is by writing
 972                 * the same value back.
 973                 */
 974                *area_count(area_dst, nr) = count;
 975        }
 976
 977        if (signal_test)
 978                return signalled != split_nr_pages;
 979
 980        if (test_type == TEST_HUGETLB)
 981                return 0;
 982
 983        area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
 984                          MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
 985        if (area_dst == MAP_FAILED)
 986                err("mremap");
 987        /* Reset area_src since we just clobbered it */
 988        area_src = NULL;
 989
 990        for (; nr < nr_pages; nr++) {
 991                count = *area_count(area_dst, nr);
 992                if (count != count_verify[nr]) {
 993                        err("nr %lu memory corruption %llu %llu\n",
 994                            nr, count, count_verify[nr]);
 995                }
 996                /*
 997                 * Trigger write protection if there is by writing
 998                 * the same value back.
 999                 */
1000                *area_count(area_dst, nr) = count;
1001        }
1002
1003        uffd_test_ops->release_pages(area_dst);
1004
1005        for (nr = 0; nr < nr_pages; nr++)
1006                if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
1007                        err("nr %lu is not zero", nr);
1008
1009        return 0;
1010}
1011
1012static void retry_uffdio_zeropage(int ufd,
1013                                  struct uffdio_zeropage *uffdio_zeropage,
1014                                  unsigned long offset)
1015{
1016        uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1017                                     uffdio_zeropage->range.len,
1018                                     offset);
1019        if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1020                if (uffdio_zeropage->zeropage != -EEXIST)
1021                        err("UFFDIO_ZEROPAGE error: %"PRId64,
1022                            (int64_t)uffdio_zeropage->zeropage);
1023        } else {
1024                err("UFFDIO_ZEROPAGE error: %"PRId64,
1025                    (int64_t)uffdio_zeropage->zeropage);
1026        }
1027}
1028
1029static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1030{
1031        struct uffdio_zeropage uffdio_zeropage;
1032        int ret;
1033        unsigned long has_zeropage;
1034        __s64 res;
1035
1036        has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
1037
1038        if (offset >= nr_pages * page_size)
1039                err("unexpected offset %lu", offset);
1040        uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1041        uffdio_zeropage.range.len = page_size;
1042        uffdio_zeropage.mode = 0;
1043        ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1044        res = uffdio_zeropage.zeropage;
1045        if (ret) {
1046                /* real retval in ufdio_zeropage.zeropage */
1047                if (has_zeropage)
1048                        err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
1049                else if (res != -EINVAL)
1050                        err("UFFDIO_ZEROPAGE not -EINVAL");
1051        } else if (has_zeropage) {
1052                if (res != page_size) {
1053                        err("UFFDIO_ZEROPAGE unexpected size");
1054                } else {
1055                        if (test_uffdio_zeropage_eexist && retry) {
1056                                test_uffdio_zeropage_eexist = false;
1057                                retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1058                                                      offset);
1059                        }
1060                        return 1;
1061                }
1062        } else
1063                err("UFFDIO_ZEROPAGE succeeded");
1064
1065        return 0;
1066}
1067
/*
 * UFFDIO_ZEROPAGE on the page at @offset without the retry step.
 * Returns whatever __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
        const bool retry = false;

        return __uffdio_zeropage(ufd, offset, retry);
}
1072
1073/* exercise UFFDIO_ZEROPAGE */
1074static int userfaultfd_zeropage_test(void)
1075{
1076        struct uffdio_register uffdio_register;
1077        unsigned long expected_ioctls;
1078
1079        printf("testing UFFDIO_ZEROPAGE: ");
1080        fflush(stdout);
1081
1082        uffd_test_ctx_init(0);
1083
1084        uffdio_register.range.start = (unsigned long) area_dst;
1085        uffdio_register.range.len = nr_pages * page_size;
1086        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1087        if (test_uffdio_wp)
1088                uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1089        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1090                err("register failure");
1091
1092        expected_ioctls = uffd_test_ops->expected_ioctls;
1093        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1094                err("unexpected missing ioctl for anon memory");
1095
1096        if (uffdio_zeropage(uffd, 0))
1097                if (my_bcmp(area_dst, zeropage, page_size))
1098                        err("zeropage is not zero");
1099
1100        printf("done.\n");
1101        return 0;
1102}
1103
1104static int userfaultfd_events_test(void)
1105{
1106        struct uffdio_register uffdio_register;
1107        unsigned long expected_ioctls;
1108        pthread_t uffd_mon;
1109        int err, features;
1110        pid_t pid;
1111        char c;
1112        struct uffd_stats stats = { 0 };
1113
1114        printf("testing events (fork, remap, remove): ");
1115        fflush(stdout);
1116
1117        features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1118                UFFD_FEATURE_EVENT_REMOVE;
1119        uffd_test_ctx_init(features);
1120
1121        fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1122
1123        uffdio_register.range.start = (unsigned long) area_dst;
1124        uffdio_register.range.len = nr_pages * page_size;
1125        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1126        if (test_uffdio_wp)
1127                uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1128        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1129                err("register failure");
1130
1131        expected_ioctls = uffd_test_ops->expected_ioctls;
1132        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1133                err("unexpected missing ioctl for anon memory");
1134
1135        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1136                err("uffd_poll_thread create");
1137
1138        pid = fork();
1139        if (pid < 0)
1140                err("fork");
1141
1142        if (!pid)
1143                exit(faulting_process(0));
1144
1145        waitpid(pid, &err, 0);
1146        if (err)
1147                err("faulting process failed");
1148        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1149                err("pipe write");
1150        if (pthread_join(uffd_mon, NULL))
1151                return 1;
1152
1153        uffd_stats_report(&stats, 1);
1154
1155        return stats.missing_faults != nr_pages;
1156}
1157
1158static int userfaultfd_sig_test(void)
1159{
1160        struct uffdio_register uffdio_register;
1161        unsigned long expected_ioctls;
1162        unsigned long userfaults;
1163        pthread_t uffd_mon;
1164        int err, features;
1165        pid_t pid;
1166        char c;
1167        struct uffd_stats stats = { 0 };
1168
1169        printf("testing signal delivery: ");
1170        fflush(stdout);
1171
1172        features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1173        uffd_test_ctx_init(features);
1174
1175        fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1176
1177        uffdio_register.range.start = (unsigned long) area_dst;
1178        uffdio_register.range.len = nr_pages * page_size;
1179        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1180        if (test_uffdio_wp)
1181                uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1182        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1183                err("register failure");
1184
1185        expected_ioctls = uffd_test_ops->expected_ioctls;
1186        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1187                err("unexpected missing ioctl for anon memory");
1188
1189        if (faulting_process(1))
1190                err("faulting process failed");
1191
1192        uffd_test_ops->release_pages(area_dst);
1193
1194        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1195                err("uffd_poll_thread create");
1196
1197        pid = fork();
1198        if (pid < 0)
1199                err("fork");
1200
1201        if (!pid)
1202                exit(faulting_process(2));
1203
1204        waitpid(pid, &err, 0);
1205        if (err)
1206                err("faulting process failed");
1207        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1208                err("pipe write");
1209        if (pthread_join(uffd_mon, (void **)&userfaults))
1210                return 1;
1211
1212        printf("done.\n");
1213        if (userfaults)
1214                err("Signal test failed, userfaults: %ld", userfaults);
1215
1216        return userfaults != 0;
1217}
1218
1219static int userfaultfd_minor_test(void)
1220{
1221        struct uffdio_register uffdio_register;
1222        unsigned long expected_ioctls;
1223        unsigned long p;
1224        pthread_t uffd_mon;
1225        uint8_t expected_byte;
1226        void *expected_page;
1227        char c;
1228        struct uffd_stats stats = { 0 };
1229        uint64_t req_features, features_out;
1230
1231        if (!test_uffdio_minor)
1232                return 0;
1233
1234        printf("testing minor faults: ");
1235        fflush(stdout);
1236
1237        if (test_type == TEST_HUGETLB)
1238                req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
1239        else if (test_type == TEST_SHMEM)
1240                req_features = UFFD_FEATURE_MINOR_SHMEM;
1241        else
1242                return 1;
1243
1244        features_out = req_features;
1245        uffd_test_ctx_init_ext(&features_out);
1246        /* If kernel reports required features aren't supported, skip test. */
1247        if ((features_out & req_features) != req_features) {
1248                printf("skipping test due to lack of feature support\n");
1249                fflush(stdout);
1250                return 0;
1251        }
1252
1253        uffdio_register.range.start = (unsigned long)area_dst_alias;
1254        uffdio_register.range.len = nr_pages * page_size;
1255        uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1256        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1257                err("register failure");
1258
1259        expected_ioctls = uffd_test_ops->expected_ioctls;
1260        expected_ioctls |= 1 << _UFFDIO_CONTINUE;
1261        if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1262                err("unexpected missing ioctl(s)");
1263
1264        /*
1265         * After registering with UFFD, populate the non-UFFD-registered side of
1266         * the shared mapping. This should *not* trigger any UFFD minor faults.
1267         */
1268        for (p = 0; p < nr_pages; ++p) {
1269                memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1270                       page_size);
1271        }
1272
1273        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1274                err("uffd_poll_thread create");
1275
1276        /*
1277         * Read each of the pages back using the UFFD-registered mapping. We
1278         * expect that the first time we touch a page, it will result in a minor
1279         * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1280         * page's contents, and then issuing a CONTINUE ioctl.
1281         */
1282
1283        if (posix_memalign(&expected_page, page_size, page_size))
1284                err("out of memory");
1285
1286        for (p = 0; p < nr_pages; ++p) {
1287                expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1288                memset(expected_page, expected_byte, page_size);
1289                if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1290                            page_size))
1291                        err("unexpected page contents after minor fault");
1292        }
1293
1294        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1295                err("pipe write");
1296        if (pthread_join(uffd_mon, NULL))
1297                return 1;
1298
1299        uffd_stats_report(&stats, 1);
1300
1301        return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1302}
1303
/*
 * Flag bits tested in the 64-bit entries read from /proc/self/pagemap.
 * NOTE(review): the per-bit meanings follow the macro names and the
 * kernel's pagemap documentation; the code visible here only tests
 * PM_UFFD_WP and PM_MMAP_EXCLUSIVE.
 */
#define BIT_ULL(nr)                   (1ULL << (nr))
#define PM_SOFT_DIRTY                 BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
#define PM_UFFD_WP                    BIT_ULL(57)
#define PM_FILE                       BIT_ULL(61)
#define PM_SWAP                       BIT_ULL(62)
#define PM_PRESENT                    BIT_ULL(63)
1311
/*
 * Open /proc/self/pagemap read-only.
 * Returns the descriptor; a failed open() is reported through err().
 */
static int pagemap_open(void)
{
        int fd;

        fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd >= 0)
                return fd;

        err("open pagemap");
        return fd;
}
1321
/*
 * Read the pagemap entry that describes @vaddr.
 *
 * @fd:    descriptor for a pagemap file (see pagemap_open())
 * @vaddr: virtual address to look up
 *
 * Returns the raw 64-bit entry; a failed or short read is reported through
 * err().
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
        uint64_t value;
        ssize_t ret;
        off_t offset;
        /*
         * pagemap holds one 64-bit entry per base page.  Ask the system
         * for that size instead of assuming 4 KiB pages; the global
         * page_size cannot be used because set_test_type() sets it to
         * the huge page size for hugetlb tests.
         */
        long base_pgsize = sysconf(_SC_PAGE_SIZE);

        if (base_pgsize <= 0)
                err("sysconf(_SC_PAGE_SIZE) failed");

        offset = (off_t)(((uintptr_t)vaddr / (uintptr_t)base_pgsize) *
                         sizeof(uint64_t));
        ret = pread(fd, &value, sizeof(value), offset);
        if (ret != (ssize_t)sizeof(value))
                err("pread() on pagemap failed");

        return value;
}
1334
/*
 * Check that the uffd-wp bit of pagemap entry @value matches @wp.
 * This is a macro so that __LINE__ in err() refers to the caller.
 * Arguments are parenthesized so that any expression can be passed.
 */
#define  pagemap_check_wp(value, wp) do {                               \
                if (!!((value) & PM_UFFD_WP) != (wp))                   \
                        err("pagemap uffd-wp bit error: 0x%"PRIx64, (value)); \
        } while (0)
1340
1341static int pagemap_test_fork(bool present)
1342{
1343        pid_t child = fork();
1344        uint64_t value;
1345        int fd, result;
1346
1347        if (!child) {
1348                /* Open the pagemap fd of the child itself */
1349                fd = pagemap_open();
1350                value = pagemap_read_vaddr(fd, area_dst);
1351                /*
1352                 * After fork() uffd-wp bit should be gone as long as we're
1353                 * without UFFD_FEATURE_EVENT_FORK
1354                 */
1355                pagemap_check_wp(value, false);
1356                /* Succeed */
1357                exit(0);
1358        }
1359        waitpid(child, &result, 0);
1360        return result;
1361}
1362
1363static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1364{
1365        struct uffdio_register uffdio_register;
1366        int pagemap_fd;
1367        uint64_t value;
1368
1369        /* Pagemap tests uffd-wp only */
1370        if (!test_uffdio_wp)
1371                return;
1372
1373        /* Not enough memory to test this page size */
1374        if (test_pgsize > nr_pages * page_size)
1375                return;
1376
1377        printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1378        /* Flush so it doesn't flush twice in parent/child later */
1379        fflush(stdout);
1380
1381        uffd_test_ctx_init(0);
1382
1383        if (test_pgsize > page_size) {
1384                /* This is a thp test */
1385                if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1386                        err("madvise(MADV_HUGEPAGE) failed");
1387        } else if (test_pgsize == page_size) {
1388                /* This is normal page test; force no thp */
1389                if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1390                        err("madvise(MADV_NOHUGEPAGE) failed");
1391        }
1392
1393        uffdio_register.range.start = (unsigned long) area_dst;
1394        uffdio_register.range.len = nr_pages * page_size;
1395        uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1396        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1397                err("register failed");
1398
1399        pagemap_fd = pagemap_open();
1400
1401        /* Touch the page */
1402        *area_dst = 1;
1403        wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1404        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1405        pagemap_check_wp(value, true);
1406        /* Make sure uffd-wp bit dropped when fork */
1407        if (pagemap_test_fork(true))
1408                err("Detected stall uffd-wp bit in child");
1409
1410        /* Exclusive required or PAGEOUT won't work */
1411        if (!(value & PM_MMAP_EXCLUSIVE))
1412                err("multiple mapping detected: 0x%"PRIx64, value);
1413
1414        if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1415                err("madvise(MADV_PAGEOUT) failed");
1416
1417        /* Uffd-wp should persist even swapped out */
1418        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1419        pagemap_check_wp(value, true);
1420        /* Make sure uffd-wp bit dropped when fork */
1421        if (pagemap_test_fork(false))
1422                err("Detected stall uffd-wp bit in child");
1423
1424        /* Unprotect; this tests swap pte modifications */
1425        wp_range(uffd, (uint64_t)area_dst, page_size, false);
1426        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1427        pagemap_check_wp(value, false);
1428
1429        /* Fault in the page from disk */
1430        *area_dst = 2;
1431        value = pagemap_read_vaddr(pagemap_fd, area_dst);
1432        pagemap_check_wp(value, false);
1433
1434        close(pagemap_fd);
1435        printf("done\n");
1436}
1437
1438static int userfaultfd_stress(void)
1439{
1440        void *area;
1441        char *tmp_area;
1442        unsigned long nr;
1443        struct uffdio_register uffdio_register;
1444        struct uffd_stats uffd_stats[nr_cpus];
1445
1446        uffd_test_ctx_init(0);
1447
1448        if (posix_memalign(&area, page_size, page_size))
1449                err("out of memory");
1450        zeropage = area;
1451        bzero(zeropage, page_size);
1452
1453        pthread_mutex_lock(&uffd_read_mutex);
1454
1455        pthread_attr_init(&attr);
1456        pthread_attr_setstacksize(&attr, 16*1024*1024);
1457
1458        while (bounces--) {
1459                unsigned long expected_ioctls;
1460
1461                printf("bounces: %d, mode:", bounces);
1462                if (bounces & BOUNCE_RANDOM)
1463                        printf(" rnd");
1464                if (bounces & BOUNCE_RACINGFAULTS)
1465                        printf(" racing");
1466                if (bounces & BOUNCE_VERIFY)
1467                        printf(" ver");
1468                if (bounces & BOUNCE_POLL)
1469                        printf(" poll");
1470                else
1471                        printf(" read");
1472                printf(", ");
1473                fflush(stdout);
1474
1475                if (bounces & BOUNCE_POLL)
1476                        fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1477                else
1478                        fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1479
1480                /* register */
1481                uffdio_register.range.start = (unsigned long) area_dst;
1482                uffdio_register.range.len = nr_pages * page_size;
1483                uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1484                if (test_uffdio_wp)
1485                        uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1486                if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1487                        err("register failure");
1488                expected_ioctls = uffd_test_ops->expected_ioctls;
1489                if ((uffdio_register.ioctls & expected_ioctls) !=
1490                    expected_ioctls)
1491                        err("unexpected missing ioctl for anon memory");
1492
1493                if (area_dst_alias) {
1494                        uffdio_register.range.start = (unsigned long)
1495                                area_dst_alias;
1496                        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1497                                err("register failure alias");
1498                }
1499
1500                /*
1501                 * The madvise done previously isn't enough: some
1502                 * uffd_thread could have read userfaults (one of
1503                 * those already resolved by the background thread)
1504                 * and it may be in the process of calling
1505                 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1506                 * area_src and it would map a zero page in it (of
1507                 * course such a UFFDIO_COPY is perfectly safe as it'd
1508                 * return -EEXIST). The problem comes at the next
1509                 * bounce though: that racing UFFDIO_COPY would
1510                 * generate zeropages in the area_src, so invalidating
1511                 * the previous MADV_DONTNEED. Without this additional
1512                 * MADV_DONTNEED those zeropages leftovers in the
1513                 * area_src would lead to -EEXIST failure during the
1514                 * next bounce, effectively leaving a zeropage in the
1515                 * area_dst.
1516                 *
1517                 * Try to comment this out madvise to see the memory
1518                 * corruption being caught pretty quick.
1519                 *
1520                 * khugepaged is also inhibited to collapse THP after
1521                 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1522                 * required to MADV_DONTNEED here.
1523                 */
1524                uffd_test_ops->release_pages(area_dst);
1525
1526                uffd_stats_reset(uffd_stats, nr_cpus);
1527
1528                /* bounce pass */
1529                if (stress(uffd_stats))
1530                        return 1;
1531
1532                /* Clear all the write protections if there is any */
1533                if (test_uffdio_wp)
1534                        wp_range(uffd, (unsigned long)area_dst,
1535                                 nr_pages * page_size, false);
1536
1537                /* unregister */
1538                if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1539                        err("unregister failure");
1540                if (area_dst_alias) {
1541                        uffdio_register.range.start = (unsigned long) area_dst;
1542                        if (ioctl(uffd, UFFDIO_UNREGISTER,
1543                                  &uffdio_register.range))
1544                                err("unregister failure alias");
1545                }
1546
1547                /* verification */
1548                if (bounces & BOUNCE_VERIFY)
1549                        for (nr = 0; nr < nr_pages; nr++)
1550                                if (*area_count(area_dst, nr) != count_verify[nr])
1551                                        err("error area_count %llu %llu %lu\n",
1552                                            *area_count(area_src, nr),
1553                                            count_verify[nr], nr);
1554
1555                /* prepare next bounce */
1556                tmp_area = area_src;
1557                area_src = area_dst;
1558                area_dst = tmp_area;
1559
1560                tmp_area = area_src_alias;
1561                area_src_alias = area_dst_alias;
1562                area_dst_alias = tmp_area;
1563
1564                uffd_stats_report(uffd_stats, nr_cpus);
1565        }
1566
1567        if (test_type == TEST_ANON) {
1568                /*
1569                 * shmem/hugetlb won't be able to run since they have different
1570                 * behavior on fork() (file-backed memory normally drops ptes
1571                 * directly when fork), meanwhile the pagemap test will verify
1572                 * pgtable entry of fork()ed child.
1573                 */
1574                userfaultfd_pagemap_test(page_size);
1575                /*
1576                 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1577                 * currently the only one that supports uffd-wp
1578                 */
1579                userfaultfd_pagemap_test(page_size * 512);
1580        }
1581
1582        return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1583                || userfaultfd_events_test() || userfaultfd_minor_test();
1584}
1585
/*
 * Look up the default huge page size in /proc/meminfo.
 * (Copied from mlock2-tests.c.)
 *
 * Returns the value of the "Hugepagesize:" entry converted from kB to
 * bytes, or 0 when /proc/meminfo cannot be opened or no such entry is
 * found.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo = fopen("/proc/meminfo", "r");
	unsigned long size_kb = 0;
	unsigned long size_bytes = 0;
	size_t buf_cap = 0;
	char *buf = NULL;

	if (meminfo == NULL)
		return 0;

	for (;;) {
		/* getline() (re)allocates buf as needed; stop at EOF/error. */
		if (getline(&buf, &buf_cap, meminfo) <= 0)
			break;

		if (sscanf(buf, "Hugepagesize:       %lu kB", &size_kb) != 1)
			continue;

		/* The entry is reported in kB: scale to bytes. */
		size_bytes = size_kb << 10;
		break;
	}

	free(buf);
	fclose(meminfo);
	return size_bytes;
}
1609
1610static void set_test_type(const char *type)
1611{
1612        if (!strcmp(type, "anon")) {
1613                test_type = TEST_ANON;
1614                uffd_test_ops = &anon_uffd_test_ops;
1615                /* Only enable write-protect test for anonymous test */
1616                test_uffdio_wp = true;
1617        } else if (!strcmp(type, "hugetlb")) {
1618                test_type = TEST_HUGETLB;
1619                uffd_test_ops = &hugetlb_uffd_test_ops;
1620        } else if (!strcmp(type, "hugetlb_shared")) {
1621                map_shared = true;
1622                test_type = TEST_HUGETLB;
1623                uffd_test_ops = &hugetlb_uffd_test_ops;
1624                /* Minor faults require shared hugetlb; only enable here. */
1625                test_uffdio_minor = true;
1626        } else if (!strcmp(type, "shmem")) {
1627                map_shared = true;
1628                test_type = TEST_SHMEM;
1629                uffd_test_ops = &shmem_uffd_test_ops;
1630                test_uffdio_minor = true;
1631        } else {
1632                err("Unknown test type: %s", type);
1633        }
1634
1635        if (test_type == TEST_HUGETLB)
1636                page_size = default_huge_page_size();
1637        else
1638                page_size = sysconf(_SC_PAGE_SIZE);
1639
1640        if (!page_size)
1641                err("Unable to determine page size");
1642        if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1643            > page_size)
1644                err("Impossible to run this test");
1645}
1646
1647static void sigalrm(int sig)
1648{
1649        if (sig != SIGALRM)
1650                abort();
1651        test_uffdio_copy_eexist = true;
1652        test_uffdio_zeropage_eexist = true;
1653        alarm(ALARM_INTERVAL_SECS);
1654}
1655
1656int main(int argc, char **argv)
1657{
1658        if (argc < 4)
1659                usage();
1660
1661        if (signal(SIGALRM, sigalrm) == SIG_ERR)
1662                err("failed to arm SIGALRM");
1663        alarm(ALARM_INTERVAL_SECS);
1664
1665        set_test_type(argv[1]);
1666
1667        nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1668        nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1669                nr_cpus;
1670        if (!nr_pages_per_cpu) {
1671                _err("invalid MiB");
1672                usage();
1673        }
1674
1675        bounces = atoi(argv[3]);
1676        if (bounces <= 0) {
1677                _err("invalid bounces");
1678                usage();
1679        }
1680        nr_pages = nr_pages_per_cpu * nr_cpus;
1681
1682        if (test_type == TEST_HUGETLB) {
1683                if (argc < 5)
1684                        usage();
1685                huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1686                if (huge_fd < 0)
1687                        err("Open of %s failed", argv[4]);
1688                if (ftruncate(huge_fd, 0))
1689                        err("ftruncate %s to size 0 failed", argv[4]);
1690        } else if (test_type == TEST_SHMEM) {
1691                shm_fd = memfd_create(argv[0], 0);
1692                if (shm_fd < 0)
1693                        err("memfd_create");
1694                if (ftruncate(shm_fd, nr_pages * page_size * 2))
1695                        err("ftruncate");
1696                if (fallocate(shm_fd,
1697                              FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
1698                              nr_pages * page_size * 2))
1699                        err("fallocate");
1700        }
1701        printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1702               nr_pages, nr_pages_per_cpu);
1703        return userfaultfd_stress();
1704}
1705
1706#else /* __NR_userfaultfd */
1707
1708#warning "missing __NR_userfaultfd definition"
1709
1710int main(void)
1711{
1712        printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1713        return KSFT_SKIP;
1714}
1715
1716#endif /* __NR_userfaultfd */
1717