linux/tools/testing/selftests/x86/protection_keys.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
   4 *
   5 * There are examples in here of:
   6 *  * how to set protection keys on memory
   7 *  * how to set/clear bits in PKRU (the rights register)
   8 *  * how to handle SEGV_PKRU signals and extract pkey-relevant
   9 *    information from the siginfo
  10 *
  11 * Things to add:
  12 *      make sure KSM and KSM COW breaking works
  13 *      prefault pages in at malloc, or not
  14 *      protect MPX bounds tables with protection keys?
  15 *      make sure VMA splitting/merging is working correctly
  16 *      OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
  17 *      look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
  18 *      do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
  19 *
  20 * Compile like this:
  21 *      gcc      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
  22 *      gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
  23 */
  24#define _GNU_SOURCE
  25#include <errno.h>
  26#include <linux/futex.h>
  27#include <sys/time.h>
  28#include <sys/syscall.h>
  29#include <string.h>
  30#include <stdio.h>
  31#include <stdint.h>
  32#include <stdbool.h>
  33#include <signal.h>
  34#include <assert.h>
  35#include <stdlib.h>
  36#include <ucontext.h>
  37#include <sys/mman.h>
  38#include <sys/types.h>
  39#include <sys/wait.h>
  40#include <sys/stat.h>
  41#include <fcntl.h>
  42#include <unistd.h>
  43#include <sys/ptrace.h>
  44#include <setjmp.h>
  45
  46#include "pkey-helpers.h"
  47
/* Test progress counters; pkey_assert() prints both when a check fails. */
int iteration_nr = 1;
int test_nr;

/*
 * Software copy of the PKRU contents.  The helpers in this file that
 * change pkey rights update this alongside the hardware register.
 */
unsigned int shadow_pkru;

/* 1UL<<21 == 2MB; used to size and align the huge page allocations below. */
#define HPAGE_SIZE      (1UL<<21)
/* Element count of an array object (not valid for a plain pointer). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
/* Mask-based rounding: only correct when align_to is a power of two. */
#define ALIGN_UP(x, align_to)   (((x) + ((align_to)-1)) & ~((align_to)-1))
#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
/* Pointer flavours of the above; the result keeps the type of 'p'. */
#define ALIGN_PTR_UP(p, ptr_align_to)   ((typeof(p))ALIGN_UP((unsigned long)(p),        ptr_align_to))
#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p),      ptr_align_to))
/* Two-level stringification so that macro arguments are expanded first. */
#define __stringify_1(x...)     #x
#define __stringify(x...)       __stringify_1(x)

/* Sentinel returned by an allocator that is unsupported on this system. */
#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)

/*
 * Set to 1 while a signal handler in this file is running and back to 0
 * when it returns.  Presumably consumed, together with the buffer below,
 * by the dprintf*() macros in pkey-helpers.h -- confirm there.
 */
int dprint_in_signal;
char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
  66
extern void abort_hooks(void);
/*
 * Test-suite assertion.  When 'condition' is false it logs the source
 * location, the current test number / iteration and errno, runs
 * abort_hooks() and then exits the process using the line number of the
 * failing check as the exit status.
 */
#define pkey_assert(condition) do {             \
        if (!(condition)) {                     \
                dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
                                __FILE__, __LINE__,     \
                                test_nr, iteration_nr); \
                dprintf0("errno at assert: %d", errno); \
                abort_hooks();                  \
                exit(__LINE__);                 \
        }                                       \
} while (0)
  78
  79void cat_into_file(char *str, char *file)
  80{
  81        int fd = open(file, O_RDWR);
  82        int ret;
  83
  84        dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
  85        /*
  86         * these need to be raw because they are called under
  87         * pkey_assert()
  88         */
  89        if (fd < 0) {
  90                fprintf(stderr, "error opening '%s'\n", str);
  91                perror("error: ");
  92                exit(__LINE__);
  93        }
  94
  95        ret = write(fd, str, strlen(str));
  96        if (ret != strlen(str)) {
  97                perror("write to file failed");
  98                fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
  99                exit(__LINE__);
 100        }
 101        close(fd);
 102}
 103
 104#if CONTROL_TRACING > 0
 105static int warned_tracing;
 106int tracing_root_ok(void)
 107{
 108        if (geteuid() != 0) {
 109                if (!warned_tracing)
 110                        fprintf(stderr, "WARNING: not run as root, "
 111                                        "can not do tracing control\n");
 112                warned_tracing = 1;
 113                return 0;
 114        }
 115        return 1;
 116}
 117#endif
 118
 119void tracing_on(void)
 120{
 121#if CONTROL_TRACING > 0
 122#define TRACEDIR "/sys/kernel/debug/tracing"
 123        char pidstr[32];
 124
 125        if (!tracing_root_ok())
 126                return;
 127
 128        sprintf(pidstr, "%d", getpid());
 129        cat_into_file("0", TRACEDIR "/tracing_on");
 130        cat_into_file("\n", TRACEDIR "/trace");
 131        if (1) {
 132                cat_into_file("function_graph", TRACEDIR "/current_tracer");
 133                cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
 134        } else {
 135                cat_into_file("nop", TRACEDIR "/current_tracer");
 136        }
 137        cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
 138        cat_into_file("1", TRACEDIR "/tracing_on");
 139        dprintf1("enabled tracing\n");
 140#endif
 141}
 142
 143void tracing_off(void)
 144{
 145#if CONTROL_TRACING > 0
 146        if (!tracing_root_ok())
 147                return;
 148        cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
 149#endif
 150}
 151
 152void abort_hooks(void)
 153{
 154        fprintf(stderr, "running %s()...\n", __func__);
 155        tracing_off();
 156#ifdef SLEEP_ON_ABORT
 157        sleep(SLEEP_ON_ABORT);
 158#endif
 159}
 160
/*
 * Emit 512 copies of a long-form nopl instruction as padding.
 */
static inline void __page_o_noops(void)
{
        /*
         * 512 instructions * 8 bytes each = 1 page (4096 bytes).
         * NOTE(review): the 8-byte size is the original author's figure
         * for this nopl encoding; confirm for the 32-bit build.
         */
        asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
}
 166
/*
 * This attempts to have roughly a page of instructions followed by a few
 * instructions that do a write, and another page of instructions.  That
 * way, we are pretty sure that the write is in the second page of
 * instructions and has at least a page of padding behind it.
 *
 * *That* lets us be sure to madvise() away the write instruction, which
 * will then fault, which makes sure that the fault code handles
 * execute-only memory properly.
 *
 * @write_to_me: location that receives the store; the value stored is
 *               just the source line number and carries no meaning.
 *
 * The function itself is aligned to PAGE_SIZE so the padding lines up
 * with page boundaries.
 */
__attribute__((__aligned__(PAGE_SIZE)))
void lots_o_noops_around_write(int *write_to_me)
{
        dprintf3("running %s()\n", __func__);
        __page_o_noops();
        /* Assume this happens in the second page of instructions: */
        *write_to_me = __LINE__;
        /* pad out by another page: */
        __page_o_noops();
        dprintf3("%s() done\n", __func__);
}
 188
/* Define some kernel-like types */
#define  u8 uint8_t
#define u16 uint16_t
#define u32 uint32_t
#define u64 uint64_t

/*
 * Per-architecture fallbacks, used only when the system headers do not
 * already provide the pkey syscall numbers:
 *  - REG_IP_IDX is the gregs[] index of the instruction pointer
 *  - si_pkey_offset is the byte offset at which signal_handler() reads
 *    the pkey out of the siginfo
 */
#ifdef __i386__

#ifndef SYS_mprotect_key
# define SYS_mprotect_key       380
#endif

#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc         381
# define SYS_pkey_free          382
#endif

#define REG_IP_IDX              REG_EIP
#define si_pkey_offset          0x14

#else

#ifndef SYS_mprotect_key
# define SYS_mprotect_key       329
#endif

#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc         330
# define SYS_pkey_free          331
#endif

#define REG_IP_IDX              REG_RIP
#define si_pkey_offset          0x20

#endif
 224
 225void dump_mem(void *dumpme, int len_bytes)
 226{
 227        char *c = (void *)dumpme;
 228        int i;
 229
 230        for (i = 0; i < len_bytes; i += sizeof(u64)) {
 231                u64 *ptr = (u64 *)(c + i);
 232                dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
 233        }
 234}
 235
 236/* Failed address bound checks: */
 237#ifndef SEGV_BNDERR
 238# define SEGV_BNDERR            3
 239#endif
 240
 241#ifndef SEGV_PKUERR
 242# define SEGV_PKUERR            4
 243#endif
 244
 245static char *si_code_str(int si_code)
 246{
 247        if (si_code == SEGV_MAPERR)
 248                return "SEGV_MAPERR";
 249        if (si_code == SEGV_ACCERR)
 250                return "SEGV_ACCERR";
 251        if (si_code == SEGV_BNDERR)
 252                return "SEGV_BNDERR";
 253        if (si_code == SEGV_PKUERR)
 254                return "SEGV_PKUERR";
 255        return "UNKNOWN";
 256}
 257
/* Number of faults that signal_handler() has handled to completion. */
int pkru_faults;
/* pkey read from the siginfo of the most recent fault, -1 before any. */
int last_si_pkey = -1;
/*
 * SA_SIGINFO handler installed by setup_sigsegv_handler() (for SIGSEGV
 * and also SIGALRM).
 *
 * It locates the saved PKRU value in the signal frame's FPU/XSAVE
 * state, checks that it is non-zero, exits for the non-pkey si_codes,
 * records the faulting pkey from the siginfo in last_si_pkey, and then
 * zeroes the saved PKRU so that the faulting instruction can be
 * restarted successfully when the handler returns.
 *
 * @signum:    signal number (not examined)
 * @si:        siginfo for the fault; the pkey is read at si_pkey_offset
 * @vucontext: ucontext_t * holding the interrupted register state
 */
void signal_handler(int signum, siginfo_t *si, void *vucontext)
{
        ucontext_t *uctxt = vucontext;
        int trapno;
        unsigned long ip;
        char *fpregs;
        u32 *pkru_ptr;
        u64 siginfo_pkey;
        u32 *si_pkey_ptr;
        int pkru_offset;
        fpregset_t fpregset;

        dprint_in_signal = 1;
        dprintf1(">>>>===============SIGSEGV============================\n");
        dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
                        __rdpkru(), shadow_pkru);

        trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
        ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
        fpregset = uctxt->uc_mcontext.fpregs;
        fpregs = (void *)fpregset;

        dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
                        trapno, ip, si_code_str(si->si_code), si->si_code);
#ifdef __i386__
        /*
         * 32-bit has some extra padding so that userspace can tell whether
         * the XSTATE header is present in addition to the "legacy" FPU
         * state.  We just assume that it is here.
         */
        fpregs += 0x70;
#endif
        /* presumably the byte offset of PKRU within the XSAVE area */
        pkru_offset = pkru_xstate_offset();
        pkru_ptr = (void *)(&fpregs[pkru_offset]);

        dprintf1("siginfo: %p\n", si);
        dprintf1(" fpregs: %p\n", fpregs);
        /*
         * If we got a PKRU fault, we *HAVE* to have at least one bit set in
         * here.
         */
        dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
        /*
         * NOTE(review): pkru_ptr is a u32 *, so "- 128" steps back
         * 128 * 4 = 512 bytes while only 256 bytes are dumped; the
         * window may not reach *pkru_ptr.  Confirm the intent.
         */
        if (DEBUG_LEVEL > 4)
                dump_mem(pkru_ptr - 128, 256);
        pkey_assert(*pkru_ptr);

        /* Only pkey faults are expected here; anything else is fatal. */
        if ((si->si_code == SEGV_MAPERR) ||
            (si->si_code == SEGV_ACCERR) ||
            (si->si_code == SEGV_BNDERR)) {
                printf("non-PK si_code, exiting...\n");
                exit(4);
        }

        /* The pkey is read straight out of the siginfo at a fixed offset. */
        si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
        dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
        dump_mem((u8 *)si_pkey_ptr - 8, 24);
        siginfo_pkey = *si_pkey_ptr;
        pkey_assert(siginfo_pkey < NR_PKEYS);
        last_si_pkey = siginfo_pkey;

        dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
        /* need __rdpkru() version so we do not do shadow_pkru checking */
        dprintf1("signal pkru from  pkru: %08x\n", __rdpkru());
        dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
        /*
         * Clear the saved PKRU so the restarted instruction succeeds.
         * NOTE(review): this stores 64 bits through a pointer declared
         * as u32 *; confirm the 4 bytes following PKRU in the save area
         * may be overwritten.
         */
        *(u64 *)pkru_ptr = 0x00000000;
        dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
        pkru_faults++;
        dprintf1("<<<<==================================================\n");
        dprint_in_signal = 0;
}
 330
 331int wait_all_children(void)
 332{
 333        int status;
 334        return waitpid(-1, &status, 0);
 335}
 336
 337void sig_chld(int x)
 338{
 339        dprint_in_signal = 1;
 340        dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
 341        dprint_in_signal = 0;
 342}
 343
 344void setup_sigsegv_handler(void)
 345{
 346        int r, rs;
 347        struct sigaction newact;
 348        struct sigaction oldact;
 349
 350        /* #PF is mapped to sigsegv */
 351        int signum  = SIGSEGV;
 352
 353        newact.sa_handler = 0;
 354        newact.sa_sigaction = signal_handler;
 355
 356        /*sigset_t - signals to block while in the handler */
 357        /* get the old signal mask. */
 358        rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
 359        pkey_assert(rs == 0);
 360
 361        /* call sa_sigaction, not sa_handler*/
 362        newact.sa_flags = SA_SIGINFO;
 363
 364        newact.sa_restorer = 0;  /* void(*)(), obsolete */
 365        r = sigaction(signum, &newact, &oldact);
 366        r = sigaction(SIGALRM, &newact, &oldact);
 367        pkey_assert(r == 0);
 368}
 369
 370void setup_handlers(void)
 371{
 372        signal(SIGCHLD, &sig_chld);
 373        setup_sigsegv_handler();
 374}
 375
 376pid_t fork_lazy_child(void)
 377{
 378        pid_t forkret;
 379
 380        forkret = fork();
 381        pkey_assert(forkret >= 0);
 382        dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
 383
 384        if (!forkret) {
 385                /* in the child */
 386                while (1) {
 387                        dprintf1("child sleeping...\n");
 388                        sleep(30);
 389                }
 390        }
 391        return forkret;
 392}
 393
/*
 * Fallback definitions of the per-pkey rights bits for system headers
 * that do not provide them.  The code below shifts these two bits into
 * position for a given pkey within PKRU.
 */
#ifndef PKEY_DISABLE_ACCESS
# define PKEY_DISABLE_ACCESS    0x1
#endif

#ifndef PKEY_DISABLE_WRITE
# define PKEY_DISABLE_WRITE     0x2
#endif
 401
 402static u32 hw_pkey_get(int pkey, unsigned long flags)
 403{
 404        u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
 405        u32 pkru = __rdpkru();
 406        u32 shifted_pkru;
 407        u32 masked_pkru;
 408
 409        dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
 410                        __func__, pkey, flags, 0, 0);
 411        dprintf2("%s() raw pkru: %x\n", __func__, pkru);
 412
 413        shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
 414        dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
 415        masked_pkru = shifted_pkru & mask;
 416        dprintf2("%s() masked  pkru: %x\n", __func__, masked_pkru);
 417        /*
 418         * shift down the relevant bits to the lowest two, then
 419         * mask off all the other high bits.
 420         */
 421        return masked_pkru;
 422}
 423
 424static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
 425{
 426        u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
 427        u32 old_pkru = __rdpkru();
 428        u32 new_pkru;
 429
 430        /* make sure that 'rights' only contains the bits we expect: */
 431        assert(!(rights & ~mask));
 432
 433        /* copy old pkru */
 434        new_pkru = old_pkru;
 435        /* mask out bits from pkey in old value: */
 436        new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
 437        /* OR in new bits for pkey: */
 438        new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
 439
 440        __wrpkru(new_pkru);
 441
 442        dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
 443                        __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
 444        return 0;
 445}
 446
 447void pkey_disable_set(int pkey, int flags)
 448{
 449        unsigned long syscall_flags = 0;
 450        int ret;
 451        int pkey_rights;
 452        u32 orig_pkru = rdpkru();
 453
 454        dprintf1("START->%s(%d, 0x%x)\n", __func__,
 455                pkey, flags);
 456        pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 457
 458        pkey_rights = hw_pkey_get(pkey, syscall_flags);
 459
 460        dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 461                        pkey, pkey, pkey_rights);
 462        pkey_assert(pkey_rights >= 0);
 463
 464        pkey_rights |= flags;
 465
 466        ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
 467        assert(!ret);
 468        /*pkru and flags have the same format */
 469        shadow_pkru |= flags << (pkey * 2);
 470        dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
 471
 472        pkey_assert(ret >= 0);
 473
 474        pkey_rights = hw_pkey_get(pkey, syscall_flags);
 475        dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 476                        pkey, pkey, pkey_rights);
 477
 478        dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
 479        if (flags)
 480                pkey_assert(rdpkru() > orig_pkru);
 481        dprintf1("END<---%s(%d, 0x%x)\n", __func__,
 482                pkey, flags);
 483}
 484
 485void pkey_disable_clear(int pkey, int flags)
 486{
 487        unsigned long syscall_flags = 0;
 488        int ret;
 489        int pkey_rights = hw_pkey_get(pkey, syscall_flags);
 490        u32 orig_pkru = rdpkru();
 491
 492        pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 493
 494        dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 495                        pkey, pkey, pkey_rights);
 496        pkey_assert(pkey_rights >= 0);
 497
 498        pkey_rights |= flags;
 499
 500        ret = hw_pkey_set(pkey, pkey_rights, 0);
 501        /* pkru and flags have the same format */
 502        shadow_pkru &= ~(flags << (pkey * 2));
 503        pkey_assert(ret >= 0);
 504
 505        pkey_rights = hw_pkey_get(pkey, syscall_flags);
 506        dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 507                        pkey, pkey, pkey_rights);
 508
 509        dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
 510        if (flags)
 511                assert(rdpkru() > orig_pkru);
 512}
 513
 514void pkey_write_allow(int pkey)
 515{
 516        pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
 517}
 518void pkey_write_deny(int pkey)
 519{
 520        pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
 521}
 522void pkey_access_allow(int pkey)
 523{
 524        pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
 525}
 526void pkey_access_deny(int pkey)
 527{
 528        pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
 529}
 530
 531int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
 532                unsigned long pkey)
 533{
 534        int sret;
 535
 536        dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
 537                        ptr, size, orig_prot, pkey);
 538
 539        errno = 0;
 540        sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
 541        if (errno) {
 542                dprintf2("SYS_mprotect_key sret: %d\n", sret);
 543                dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
 544                dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
 545                if (DEBUG_LEVEL >= 2)
 546                        perror("SYS_mprotect_pkey");
 547        }
 548        return sret;
 549}
 550
 551int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
 552{
 553        int ret = syscall(SYS_pkey_alloc, flags, init_val);
 554        dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
 555                        __func__, flags, init_val, ret, errno);
 556        return ret;
 557}
 558
 559int alloc_pkey(void)
 560{
 561        int ret;
 562        unsigned long init_val = 0x0;
 563
 564        dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
 565                        __LINE__, __rdpkru(), shadow_pkru);
 566        ret = sys_pkey_alloc(0, init_val);
 567        /*
 568         * pkey_alloc() sets PKRU, so we need to reflect it in
 569         * shadow_pkru:
 570         */
 571        dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 572                        __LINE__, ret, __rdpkru(), shadow_pkru);
 573        if (ret) {
 574                /* clear both the bits: */
 575                shadow_pkru &= ~(0x3      << (ret * 2));
 576                dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 577                                __LINE__, ret, __rdpkru(), shadow_pkru);
 578                /*
 579                 * move the new state in from init_val
 580                 * (remember, we cheated and init_val == pkru format)
 581                 */
 582                shadow_pkru |=  (init_val << (ret * 2));
 583        }
 584        dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 585                        __LINE__, ret, __rdpkru(), shadow_pkru);
 586        dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
 587        /* for shadow checking: */
 588        rdpkru();
 589        dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 590                        __LINE__, ret, __rdpkru(), shadow_pkru);
 591        return ret;
 592}
 593
 594int sys_pkey_free(unsigned long pkey)
 595{
 596        int ret = syscall(SYS_pkey_free, pkey);
 597        dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
 598        return ret;
 599}
 600
 601/*
 602 * I had a bug where pkey bits could be set by mprotect() but
 603 * not cleared.  This ensures we get lots of random bit sets
 604 * and clears on the vma and pte pkey bits.
 605 */
 606int alloc_random_pkey(void)
 607{
 608        int max_nr_pkey_allocs;
 609        int ret;
 610        int i;
 611        int alloced_pkeys[NR_PKEYS];
 612        int nr_alloced = 0;
 613        int random_index;
 614        memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
 615
 616        /* allocate every possible key and make a note of which ones we got */
 617        max_nr_pkey_allocs = NR_PKEYS;
 618        max_nr_pkey_allocs = 1;
 619        for (i = 0; i < max_nr_pkey_allocs; i++) {
 620                int new_pkey = alloc_pkey();
 621                if (new_pkey < 0)
 622                        break;
 623                alloced_pkeys[nr_alloced++] = new_pkey;
 624        }
 625
 626        pkey_assert(nr_alloced > 0);
 627        /* select a random one out of the allocated ones */
 628        random_index = rand() % nr_alloced;
 629        ret = alloced_pkeys[random_index];
 630        /* now zero it out so we don't free it next */
 631        alloced_pkeys[random_index] = 0;
 632
 633        /* go through the allocated ones that we did not want and free them */
 634        for (i = 0; i < nr_alloced; i++) {
 635                int free_ret;
 636                if (!alloced_pkeys[i])
 637                        continue;
 638                free_ret = sys_pkey_free(alloced_pkeys[i]);
 639                pkey_assert(!free_ret);
 640        }
 641        dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 642                        __LINE__, ret, __rdpkru(), shadow_pkru);
 643        return ret;
 644}
 645
 646int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
 647                unsigned long pkey)
 648{
 649        int nr_iterations = random() % 100;
 650        int ret;
 651
 652        while (0) {
 653                int rpkey = alloc_random_pkey();
 654                ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
 655                dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
 656                                ptr, size, orig_prot, pkey, ret);
 657                if (nr_iterations-- < 0)
 658                        break;
 659
 660                dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 661                        __LINE__, ret, __rdpkru(), shadow_pkru);
 662                sys_pkey_free(rpkey);
 663                dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 664                        __LINE__, ret, __rdpkru(), shadow_pkru);
 665        }
 666        pkey_assert(pkey < NR_PKEYS);
 667
 668        ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
 669        dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
 670                        ptr, size, orig_prot, pkey, ret);
 671        pkey_assert(!ret);
 672        dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 673                        __LINE__, ret, __rdpkru(), shadow_pkru);
 674        return ret;
 675}
 676
 677struct pkey_malloc_record {
 678        void *ptr;
 679        long size;
 680        int prot;
 681};
 682struct pkey_malloc_record *pkey_malloc_records;
 683struct pkey_malloc_record *pkey_last_malloc_record;
 684long nr_pkey_malloc_records;
 685void record_pkey_malloc(void *ptr, long size, int prot)
 686{
 687        long i;
 688        struct pkey_malloc_record *rec = NULL;
 689
 690        for (i = 0; i < nr_pkey_malloc_records; i++) {
 691                rec = &pkey_malloc_records[i];
 692                /* find a free record */
 693                if (rec)
 694                        break;
 695        }
 696        if (!rec) {
 697                /* every record is full */
 698                size_t old_nr_records = nr_pkey_malloc_records;
 699                size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
 700                size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
 701                dprintf2("new_nr_records: %zd\n", new_nr_records);
 702                dprintf2("new_size: %zd\n", new_size);
 703                pkey_malloc_records = realloc(pkey_malloc_records, new_size);
 704                pkey_assert(pkey_malloc_records != NULL);
 705                rec = &pkey_malloc_records[nr_pkey_malloc_records];
 706                /*
 707                 * realloc() does not initialize memory, so zero it from
 708                 * the first new record all the way to the end.
 709                 */
 710                for (i = 0; i < new_nr_records - old_nr_records; i++)
 711                        memset(rec + i, 0, sizeof(*rec));
 712        }
 713        dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
 714                (int)(rec - pkey_malloc_records), rec, ptr, size);
 715        rec->ptr = ptr;
 716        rec->size = size;
 717        rec->prot = prot;
 718        pkey_last_malloc_record = rec;
 719        nr_pkey_malloc_records++;
 720}
 721
 722void free_pkey_malloc(void *ptr)
 723{
 724        long i;
 725        int ret;
 726        dprintf3("%s(%p)\n", __func__, ptr);
 727        for (i = 0; i < nr_pkey_malloc_records; i++) {
 728                struct pkey_malloc_record *rec = &pkey_malloc_records[i];
 729                dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
 730                                ptr, i, rec, rec->ptr, rec->size);
 731                if ((ptr <  rec->ptr) ||
 732                    (ptr >= rec->ptr + rec->size))
 733                        continue;
 734
 735                dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
 736                                ptr, i, rec, rec->ptr, rec->size);
 737                nr_pkey_malloc_records--;
 738                ret = munmap(rec->ptr, rec->size);
 739                dprintf3("munmap ret: %d\n", ret);
 740                pkey_assert(!ret);
 741                dprintf3("clearing rec->ptr, rec: %p\n", rec);
 742                rec->ptr = NULL;
 743                dprintf3("done clearing rec->ptr, rec: %p\n", rec);
 744                return;
 745        }
 746        pkey_assert(false);
 747}
 748
 749
 750void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
 751{
 752        void *ptr;
 753        int ret;
 754
 755        rdpkru();
 756        dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
 757                        size, prot, pkey);
 758        pkey_assert(pkey < NR_PKEYS);
 759        ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 760        pkey_assert(ptr != (void *)-1);
 761        ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
 762        pkey_assert(!ret);
 763        record_pkey_malloc(ptr, size, prot);
 764        rdpkru();
 765
 766        dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
 767        return ptr;
 768}
 769
 770void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
 771{
 772        int ret;
 773        void *ptr;
 774
 775        dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
 776                        size, prot, pkey);
 777        /*
 778         * Guarantee we can fit at least one huge page in the resulting
 779         * allocation by allocating space for 2:
 780         */
 781        size = ALIGN_UP(size, HPAGE_SIZE * 2);
 782        ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 783        pkey_assert(ptr != (void *)-1);
 784        record_pkey_malloc(ptr, size, prot);
 785        mprotect_pkey(ptr, size, prot, pkey);
 786
 787        dprintf1("unaligned ptr: %p\n", ptr);
 788        ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
 789        dprintf1("  aligned ptr: %p\n", ptr);
 790        ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
 791        dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
 792        ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
 793        dprintf1("MADV_WILLNEED ret: %d\n", ret);
 794        memset(ptr, 0, HPAGE_SIZE);
 795
 796        dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
 797        return ptr;
 798}
 799
 800int hugetlb_setup_ok;
 801#define GET_NR_HUGE_PAGES 10
 802void setup_hugetlbfs(void)
 803{
 804        int err;
 805        int fd;
 806        char buf[] = "123";
 807
 808        if (geteuid() != 0) {
 809                fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
 810                return;
 811        }
 812
 813        cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
 814
 815        /*
 816         * Now go make sure that we got the pages and that they
 817         * are 2M pages.  Someone might have made 1G the default.
 818         */
 819        fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
 820        if (fd < 0) {
 821                perror("opening sysfs 2M hugetlb config");
 822                return;
 823        }
 824
 825        /* -1 to guarantee leaving the trailing \0 */
 826        err = read(fd, buf, sizeof(buf)-1);
 827        close(fd);
 828        if (err <= 0) {
 829                perror("reading sysfs 2M hugetlb config");
 830                return;
 831        }
 832
 833        if (atoi(buf) != GET_NR_HUGE_PAGES) {
 834                fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
 835                        buf, GET_NR_HUGE_PAGES);
 836                return;
 837        }
 838
 839        hugetlb_setup_ok = 1;
 840}
 841
 842void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
 843{
 844        void *ptr;
 845        int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
 846
 847        if (!hugetlb_setup_ok)
 848                return PTR_ERR_ENOTSUP;
 849
 850        dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
 851        size = ALIGN_UP(size, HPAGE_SIZE * 2);
 852        pkey_assert(pkey < NR_PKEYS);
 853        ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
 854        pkey_assert(ptr != (void *)-1);
 855        mprotect_pkey(ptr, size, prot, pkey);
 856
 857        record_pkey_malloc(ptr, size, prot);
 858
 859        dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
 860        return ptr;
 861}
 862
 863void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
 864{
 865        void *ptr;
 866        int fd;
 867
 868        dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
 869                        size, prot, pkey);
 870        pkey_assert(pkey < NR_PKEYS);
 871        fd = open("/dax/foo", O_RDWR);
 872        pkey_assert(fd >= 0);
 873
 874        ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
 875        pkey_assert(ptr != (void *)-1);
 876
 877        mprotect_pkey(ptr, size, prot, pkey);
 878
 879        record_pkey_malloc(ptr, size, prot);
 880
 881        dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
 882        close(fd);
 883        return ptr;
 884}
 885
/*
 * Table of the allocation strategies that malloc_pkey() cycles through.
 * An entry may return PTR_ERR_ENOTSUP to say it can not be used on this
 * system, in which case malloc_pkey() moves on to another entry.
 */
void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {

        malloc_pkey_with_mprotect,
        malloc_pkey_anon_huge,
        malloc_pkey_hugetlb
/* can not do direct with the pkey_mprotect() API:
        malloc_pkey_mmap_direct,
        malloc_pkey_mmap_dax,
*/
};
 896
 897void *malloc_pkey(long size, int prot, u16 pkey)
 898{
 899        void *ret;
 900        static int malloc_type;
 901        int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
 902
 903        pkey_assert(pkey < NR_PKEYS);
 904
 905        while (1) {
 906                pkey_assert(malloc_type < nr_malloc_types);
 907
 908                ret = pkey_malloc[malloc_type](size, prot, pkey);
 909                pkey_assert(ret != (void *)-1);
 910
 911                malloc_type++;
 912                if (malloc_type >= nr_malloc_types)
 913                        malloc_type = (random()%nr_malloc_types);
 914
 915                /* try again if the malloc_type we tried is unsupported */
 916                if (ret == PTR_ERR_ENOTSUP)
 917                        continue;
 918
 919                break;
 920        }
 921
 922        dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
 923                        size, prot, pkey, ret);
 924        return ret;
 925}
 926
 927int last_pkru_faults;
 928#define UNKNOWN_PKEY -2
 929void expected_pk_fault(int pkey)
 930{
 931        dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
 932                        __func__, last_pkru_faults, pkru_faults);
 933        dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
 934        pkey_assert(last_pkru_faults + 1 == pkru_faults);
 935
 936       /*
 937        * For exec-only memory, we do not know the pkey in
 938        * advance, so skip this check.
 939        */
 940        if (pkey != UNKNOWN_PKEY)
 941                pkey_assert(last_si_pkey == pkey);
 942
 943        /*
 944         * The signal handler shold have cleared out PKRU to let the
 945         * test program continue.  We now have to restore it.
 946         */
 947        if (__rdpkru() != 0)
 948                pkey_assert(0);
 949
 950        __wrpkru(shadow_pkru);
 951        dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
 952                        __func__, shadow_pkru);
 953        last_pkru_faults = pkru_faults;
 954        last_si_pkey = -1;
 955}
 956
/*
 * Assert that no protection-key fault has been counted since the last
 * call to expected_pk_fault().  'msg' is printed first when one has, so
 * that the failing check can be identified.
 */
#define do_not_expect_pk_fault(msg)	do {			\
	if (last_pkru_faults != pkru_faults)			\
		dprintf0("unexpected PK fault: %s\n", msg);	\
	pkey_assert(last_pkru_faults == pkru_faults);		\
} while (0)
 962
 963int test_fds[10] = { -1 };
 964int nr_test_fds;
 965void __save_test_fd(int fd)
 966{
 967        pkey_assert(fd >= 0);
 968        pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
 969        test_fds[nr_test_fds] = fd;
 970        nr_test_fds++;
 971}
 972
 973int get_test_read_fd(void)
 974{
 975        int test_fd = open("/etc/passwd", O_RDONLY);
 976        __save_test_fd(test_fd);
 977        return test_fd;
 978}
 979
 980void close_test_fds(void)
 981{
 982        int i;
 983
 984        for (i = 0; i < nr_test_fds; i++) {
 985                if (test_fds[i] < 0)
 986                        continue;
 987                close(test_fds[i]);
 988                test_fds[i] = -1;
 989        }
 990        nr_test_fds = 0;
 991}
 992
 993#define barrier() __asm__ __volatile__("": : :"memory")
 994__attribute__((noinline)) int read_ptr(int *ptr)
 995{
 996        /*
 997         * Keep GCC from optimizing this away somehow
 998         */
 999        barrier();
1000        return *ptr;
1001}
1002
1003void test_read_of_write_disabled_region(int *ptr, u16 pkey)
1004{
1005        int ptr_contents;
1006
1007        dprintf1("disabling write access to PKEY[1], doing read\n");
1008        pkey_write_deny(pkey);
1009        ptr_contents = read_ptr(ptr);
1010        dprintf1("*ptr: %d\n", ptr_contents);
1011        dprintf1("\n");
1012}
1013void test_read_of_access_disabled_region(int *ptr, u16 pkey)
1014{
1015        int ptr_contents;
1016
1017        dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
1018        rdpkru();
1019        pkey_access_deny(pkey);
1020        ptr_contents = read_ptr(ptr);
1021        dprintf1("*ptr: %d\n", ptr_contents);
1022        expected_pk_fault(pkey);
1023}
1024void test_write_of_write_disabled_region(int *ptr, u16 pkey)
1025{
1026        dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
1027        pkey_write_deny(pkey);
1028        *ptr = __LINE__;
1029        expected_pk_fault(pkey);
1030}
1031void test_write_of_access_disabled_region(int *ptr, u16 pkey)
1032{
1033        dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
1034        pkey_access_deny(pkey);
1035        *ptr = __LINE__;
1036        expected_pk_fault(pkey);
1037}
1038void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
1039{
1040        int ret;
1041        int test_fd = get_test_read_fd();
1042
1043        dprintf1("disabling access to PKEY[%02d], "
1044                 "having kernel read() to buffer\n", pkey);
1045        pkey_access_deny(pkey);
1046        ret = read(test_fd, ptr, 1);
1047        dprintf1("read ret: %d\n", ret);
1048        pkey_assert(ret);
1049}
1050void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
1051{
1052        int ret;
1053        int test_fd = get_test_read_fd();
1054
1055        pkey_write_deny(pkey);
1056        ret = read(test_fd, ptr, 100);
1057        dprintf1("read ret: %d\n", ret);
1058        if (ret < 0 && (DEBUG_LEVEL > 0))
1059                perror("verbose read result (OK for this to be bad)");
1060        pkey_assert(ret);
1061}
1062
1063void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
1064{
1065        int pipe_ret, vmsplice_ret;
1066        struct iovec iov;
1067        int pipe_fds[2];
1068
1069        pipe_ret = pipe(pipe_fds);
1070
1071        pkey_assert(pipe_ret == 0);
1072        dprintf1("disabling access to PKEY[%02d], "
1073                 "having kernel vmsplice from buffer\n", pkey);
1074        pkey_access_deny(pkey);
1075        iov.iov_base = ptr;
1076        iov.iov_len = PAGE_SIZE;
1077        vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
1078        dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
1079        pkey_assert(vmsplice_ret == -1);
1080
1081        close(pipe_fds[0]);
1082        close(pipe_fds[1]);
1083}
1084
1085void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
1086{
1087        int ignored = 0xdada;
1088        int futex_ret;
1089        int some_int = __LINE__;
1090
1091        dprintf1("disabling write to PKEY[%02d], "
1092                 "doing futex gunk in buffer\n", pkey);
1093        *ptr = some_int;
1094        pkey_write_deny(pkey);
1095        futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
1096                        &ignored, ignored);
1097        if (DEBUG_LEVEL > 0)
1098                perror("futex");
1099        dprintf1("futex() ret: %d\n", futex_ret);
1100}
1101
1102/* Assumes that all pkeys other than 'pkey' are unallocated */
1103void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
1104{
1105        int err;
1106        int i;
1107
1108        /* Note: 0 is the default pkey, so don't mess with it */
1109        for (i = 1; i < NR_PKEYS; i++) {
1110                if (pkey == i)
1111                        continue;
1112
1113                dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
1114                err = sys_pkey_free(i);
1115                pkey_assert(err);
1116
1117                err = sys_pkey_free(i);
1118                pkey_assert(err);
1119
1120                err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
1121                pkey_assert(err);
1122        }
1123}
1124
1125/* Assumes that all pkeys other than 'pkey' are unallocated */
1126void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
1127{
1128        int err;
1129        int bad_pkey = NR_PKEYS+99;
1130
1131        /* pass a known-invalid pkey in: */
1132        err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
1133        pkey_assert(err);
1134}
1135
/*
 * fork() and carry on in the child only: the parent exits with status 0
 * and the child returns to the caller.
 */
void become_child(void)
{
	pid_t pid = fork();

	pkey_assert(pid >= 0);
	dprintf3("[%d] fork() ret: %d\n", getpid(), pid);

	/* Non-zero means we are the parent: we are done. */
	if (pid)
		exit(0);

	/* in the child */
}
1150
1151/* Assumes that all pkeys other than 'pkey' are unallocated */
1152void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
1153{
1154        int err;
1155        int allocated_pkeys[NR_PKEYS] = {0};
1156        int nr_allocated_pkeys = 0;
1157        int i;
1158
1159        for (i = 0; i < NR_PKEYS*3; i++) {
1160                int new_pkey;
1161                dprintf1("%s() alloc loop: %d\n", __func__, i);
1162                new_pkey = alloc_pkey();
1163                dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
1164                                __LINE__, err, __rdpkru(), shadow_pkru);
1165                rdpkru(); /* for shadow checking */
1166                dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
1167                if ((new_pkey == -1) && (errno == ENOSPC)) {
1168                        dprintf2("%s() failed to allocate pkey after %d tries\n",
1169                                __func__, nr_allocated_pkeys);
1170                } else {
1171                        /*
1172                         * Ensure the number of successes never
1173                         * exceeds the number of keys supported
1174                         * in the hardware.
1175                         */
1176                        pkey_assert(nr_allocated_pkeys < NR_PKEYS);
1177                        allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
1178                }
1179
1180                /*
1181                 * Make sure that allocation state is properly
1182                 * preserved across fork().
1183                 */
1184                if (i == NR_PKEYS*2)
1185                        become_child();
1186        }
1187
1188        dprintf3("%s()::%d\n", __func__, __LINE__);
1189
1190        /*
1191         * There are 16 pkeys supported in hardware.  Three are
1192         * allocated by the time we get here:
1193         *   1. The default key (0)
1194         *   2. One possibly consumed by an execute-only mapping.
1195         *   3. One allocated by the test code and passed in via
1196         *      'pkey' to this function.
1197         * Ensure that we can allocate at least another 13 (16-3).
1198         */
1199        pkey_assert(i >= NR_PKEYS-3);
1200
1201        for (i = 0; i < nr_allocated_pkeys; i++) {
1202                err = sys_pkey_free(allocated_pkeys[i]);
1203                pkey_assert(!err);
1204                rdpkru(); /* for shadow checking */
1205        }
1206}
1207
1208/*
1209 * pkey 0 is special.  It is allocated by default, so you do not
1210 * have to call pkey_alloc() to use it first.  Make sure that it
1211 * is usable.
1212 */
1213void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
1214{
1215        long size;
1216        int prot;
1217
1218        assert(pkey_last_malloc_record);
1219        size = pkey_last_malloc_record->size;
1220        /*
1221         * This is a bit of a hack.  But mprotect() requires
1222         * huge-page-aligned sizes when operating on hugetlbfs.
1223         * So, make sure that we use something that's a multiple
1224         * of a huge page when we can.
1225         */
1226        if (size >= HPAGE_SIZE)
1227                size = HPAGE_SIZE;
1228        prot = pkey_last_malloc_record->prot;
1229
1230        /* Use pkey 0 */
1231        mprotect_pkey(ptr, size, prot, 0);
1232
1233        /* Make sure that we can set it back to the original pkey. */
1234        mprotect_pkey(ptr, size, prot, pkey);
1235}
1236
1237void test_ptrace_of_child(int *ptr, u16 pkey)
1238{
1239        __attribute__((__unused__)) int peek_result;
1240        pid_t child_pid;
1241        void *ignored = 0;
1242        long ret;
1243        int status;
1244        /*
1245         * This is the "control" for our little expermient.  Make sure
1246         * we can always access it when ptracing.
1247         */
1248        int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
1249        int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
1250
1251        /*
1252         * Fork a child which is an exact copy of this process, of course.
1253         * That means we can do all of our tests via ptrace() and then plain
1254         * memory access and ensure they work differently.
1255         */
1256        child_pid = fork_lazy_child();
1257        dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
1258
1259        ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
1260        if (ret)
1261                perror("attach");
1262        dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
1263        pkey_assert(ret != -1);
1264        ret = waitpid(child_pid, &status, WUNTRACED);
1265        if ((ret != child_pid) || !(WIFSTOPPED(status))) {
1266                fprintf(stderr, "weird waitpid result %ld stat %x\n",
1267                                ret, status);
1268                pkey_assert(0);
1269        }
1270        dprintf2("waitpid ret: %ld\n", ret);
1271        dprintf2("waitpid status: %d\n", status);
1272
1273        pkey_access_deny(pkey);
1274        pkey_write_deny(pkey);
1275
1276        /* Write access, untested for now:
1277        ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
1278        pkey_assert(ret != -1);
1279        dprintf1("poke at %p: %ld\n", peek_at, ret);
1280        */
1281
1282        /*
1283         * Try to access the pkey-protected "ptr" via ptrace:
1284         */
1285        ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
1286        /* expect it to work, without an error: */
1287        pkey_assert(ret != -1);
1288        /* Now access from the current task, and expect an exception: */
1289        peek_result = read_ptr(ptr);
1290        expected_pk_fault(pkey);
1291
1292        /*
1293         * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
1294         */
1295        ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
1296        /* expect it to work, without an error: */
1297        pkey_assert(ret != -1);
1298        /* Now access from the current task, and expect NO exception: */
1299        peek_result = read_ptr(plain_ptr);
1300        do_not_expect_pk_fault("read plain pointer after ptrace");
1301
1302        ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
1303        pkey_assert(ret != -1);
1304
1305        ret = kill(child_pid, SIGKILL);
1306        pkey_assert(ret != -1);
1307
1308        wait(&status);
1309
1310        free(plain_ptr_unaligned);
1311}
1312
1313void *get_pointer_to_instructions(void)
1314{
1315        void *p1;
1316
1317        p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
1318        dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
1319        /* lots_o_noops_around_write should be page-aligned already */
1320        assert(p1 == &lots_o_noops_around_write);
1321
1322        /* Point 'p1' at the *second* page of the function: */
1323        p1 += PAGE_SIZE;
1324
1325        /*
1326         * Try to ensure we fault this in on next touch to ensure
1327         * we get an instruction fault as opposed to a data one
1328         */
1329        madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1330
1331        return p1;
1332}
1333
1334void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1335{
1336        void *p1;
1337        int scratch;
1338        int ptr_contents;
1339        int ret;
1340
1341        p1 = get_pointer_to_instructions();
1342        lots_o_noops_around_write(&scratch);
1343        ptr_contents = read_ptr(p1);
1344        dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1345
1346        ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
1347        pkey_assert(!ret);
1348        pkey_access_deny(pkey);
1349
1350        dprintf2("pkru: %x\n", rdpkru());
1351
1352        /*
1353         * Make sure this is an *instruction* fault
1354         */
1355        madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1356        lots_o_noops_around_write(&scratch);
1357        do_not_expect_pk_fault("executing on PROT_EXEC memory");
1358        ptr_contents = read_ptr(p1);
1359        dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1360        expected_pk_fault(pkey);
1361}
1362
1363void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
1364{
1365        void *p1;
1366        int scratch;
1367        int ptr_contents;
1368        int ret;
1369
1370        dprintf1("%s() start\n", __func__);
1371
1372        p1 = get_pointer_to_instructions();
1373        lots_o_noops_around_write(&scratch);
1374        ptr_contents = read_ptr(p1);
1375        dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1376
1377        /* Use a *normal* mprotect(), not mprotect_pkey(): */
1378        ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
1379        pkey_assert(!ret);
1380
1381        dprintf2("pkru: %x\n", rdpkru());
1382
1383        /* Make sure this is an *instruction* fault */
1384        madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1385        lots_o_noops_around_write(&scratch);
1386        do_not_expect_pk_fault("executing on PROT_EXEC memory");
1387        ptr_contents = read_ptr(p1);
1388        dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1389        expected_pk_fault(UNKNOWN_PKEY);
1390
1391        /*
1392         * Put the memory back to non-PROT_EXEC.  Should clear the
1393         * exec-only pkey off the VMA and allow it to be readable
1394         * again.  Go to PROT_NONE first to check for a kernel bug
1395         * that did not clear the pkey when doing PROT_NONE.
1396         */
1397        ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
1398        pkey_assert(!ret);
1399
1400        ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
1401        pkey_assert(!ret);
1402        ptr_contents = read_ptr(p1);
1403        do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
1404}
1405
1406void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
1407{
1408        int size = PAGE_SIZE;
1409        int sret;
1410
1411        if (cpu_has_pku()) {
1412                dprintf1("SKIP: %s: no CPU support\n", __func__);
1413                return;
1414        }
1415
1416        sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
1417        pkey_assert(sret < 0);
1418}
1419
/*
 * The tests, in the order in which run_tests_once() runs them.  Each one
 * is handed a freshly allocated region ('ptr') and the pkey guarding it.
 */
void (*pkey_tests[])(int *ptr, u16 pkey) = {
	/* direct loads and stores from this task */
	test_read_of_write_disabled_region,
	test_read_of_access_disabled_region,
	test_write_of_write_disabled_region,
	test_write_of_access_disabled_region,
	/* accesses performed by the kernel on this task's behalf */
	test_kernel_write_of_access_disabled_region,
	test_kernel_write_of_write_disabled_region,
	test_kernel_gup_of_access_disabled_region,
	test_kernel_gup_write_to_write_disabled_region,
	/* execute-only memory */
	test_executing_on_unreadable_memory,
	test_implicit_mprotect_exec_only_memory,
	test_mprotect_with_pkey_0,
	test_ptrace_of_child,
	/* system call argument and allocation checks */
	test_pkey_syscalls_on_non_allocated_pkey,
	test_pkey_syscalls_bad_args,
	test_pkey_alloc_exhaust,
};
1437
1438void run_tests_once(void)
1439{
1440        int *ptr;
1441        int prot = PROT_READ|PROT_WRITE;
1442
1443        for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
1444                int pkey;
1445                int orig_pkru_faults = pkru_faults;
1446
1447                dprintf1("======================\n");
1448                dprintf1("test %d preparing...\n", test_nr);
1449
1450                tracing_on();
1451                pkey = alloc_random_pkey();
1452                dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
1453                ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
1454                dprintf1("test %d starting...\n", test_nr);
1455                pkey_tests[test_nr](ptr, pkey);
1456                dprintf1("freeing test memory: %p\n", ptr);
1457                free_pkey_malloc(ptr);
1458                sys_pkey_free(pkey);
1459
1460                dprintf1("pkru_faults: %d\n", pkru_faults);
1461                dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
1462
1463                tracing_off();
1464                close_test_fds();
1465
1466                printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
1467                dprintf1("======================\n\n");
1468        }
1469        iteration_nr++;
1470}
1471
/* Seed shadow_pkru with the value currently held in the PKRU register. */
void pkey_setup_shadow(void)
{
	shadow_pkru = __rdpkru();
}
1476
1477int main(void)
1478{
1479        int nr_iterations = 22;
1480
1481        setup_handlers();
1482
1483        printf("has pku: %d\n", cpu_has_pku());
1484
1485        if (!cpu_has_pku()) {
1486                int size = PAGE_SIZE;
1487                int *ptr;
1488
1489                printf("running PKEY tests for unsupported CPU/OS\n");
1490
1491                ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
1492                assert(ptr != (void *)-1);
1493                test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
1494                exit(0);
1495        }
1496
1497        pkey_setup_shadow();
1498        printf("startup pkru: %x\n", rdpkru());
1499        setup_hugetlbfs();
1500
1501        while (nr_iterations-- > 0)
1502                run_tests_once();
1503
1504        printf("done (all tests OK)\n");
1505        return 0;
1506}
1507