linux/arch/x86/kernel/fpu/xstate.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * xsave/xrstor support.
   4 *
   5 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
   6 */
   7#include <linux/bitops.h>
   8#include <linux/compat.h>
   9#include <linux/cpu.h>
  10#include <linux/mman.h>
  11#include <linux/nospec.h>
  12#include <linux/pkeys.h>
  13#include <linux/seq_file.h>
  14#include <linux/proc_fs.h>
  15#include <linux/vmalloc.h>
  16
  17#include <asm/fpu/api.h>
  18#include <asm/fpu/regset.h>
  19#include <asm/fpu/signal.h>
  20#include <asm/fpu/xcr.h>
  21
  22#include <asm/tlbflush.h>
  23#include <asm/prctl.h>
  24#include <asm/elf.h>
  25
  26#include "context.h"
  27#include "internal.h"
  28#include "legacy.h"
  29#include "xstate.h"
  30
  31#define for_each_extended_xfeature(bit, mask)                           \
  32        (bit) = FIRST_EXTENDED_XFEATURE;                                \
  33        for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
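/*
 * Illustrative expansion of the iterator above (kept in a comment, not
 * built): for mask = XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
 * XFEATURE_MASK_PKRU the loop body runs only for bit = 2 (YMM) and
 * bit = 9 (PKRU), because iteration starts at FIRST_EXTENDED_XFEATURE and
 * the legacy FP/SSE bits are never visited.
 */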
  34
  35/*
  36 * Although we spell it out in here, the Processor Trace
  37 * xfeature is completely unused.  We use other mechanisms
  38 * to save/restore PT state in Linux.
  39 */
  40static const char *xfeature_names[] =
  41{
  42        "x87 floating point registers"  ,
  43        "SSE registers"                 ,
  44        "AVX registers"                 ,
  45        "MPX bounds registers"          ,
  46        "MPX CSR"                       ,
  47        "AVX-512 opmask"                ,
  48        "AVX-512 Hi256"                 ,
  49        "AVX-512 ZMM_Hi256"             ,
  50        "Processor Trace (unused)"      ,
  51        "Protection Keys User registers",
  52        "PASID state",
  53        "unknown xstate feature"        ,
  54        "unknown xstate feature"        ,
  55        "unknown xstate feature"        ,
  56        "unknown xstate feature"        ,
  57        "unknown xstate feature"        ,
  58        "unknown xstate feature"        ,
  59        "AMX Tile config"               ,
  60        "AMX Tile data"                 ,
  61        "unknown xstate feature"        ,
  62};
  63
  64static unsigned short xsave_cpuid_features[] __initdata = {
  65        [XFEATURE_FP]                           = X86_FEATURE_FPU,
  66        [XFEATURE_SSE]                          = X86_FEATURE_XMM,
  67        [XFEATURE_YMM]                          = X86_FEATURE_AVX,
  68        [XFEATURE_BNDREGS]                      = X86_FEATURE_MPX,
  69        [XFEATURE_BNDCSR]                       = X86_FEATURE_MPX,
  70        [XFEATURE_OPMASK]                       = X86_FEATURE_AVX512F,
  71        [XFEATURE_ZMM_Hi256]                    = X86_FEATURE_AVX512F,
  72        [XFEATURE_Hi16_ZMM]                     = X86_FEATURE_AVX512F,
  73        [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]      = X86_FEATURE_INTEL_PT,
  74        [XFEATURE_PKRU]                         = X86_FEATURE_PKU,
  75        [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
  76        [XFEATURE_XTILE_CFG]                    = X86_FEATURE_AMX_TILE,
  77        [XFEATURE_XTILE_DATA]                   = X86_FEATURE_AMX_TILE,
  78};
  79
  80static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
  81        { [ 0 ... XFEATURE_MAX - 1] = -1};
  82static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
  83        { [ 0 ... XFEATURE_MAX - 1] = -1};
  84static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init =
  85        { [ 0 ... XFEATURE_MAX - 1] = -1};
  86static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init =
  87        { [ 0 ... XFEATURE_MAX - 1] = -1};
  88
  89/*
  90 * Return whether the system supports a given xfeature.
  91 *
  92 * Also return the name of the (most advanced) feature that the caller requested:
  93 */
  94int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
  95{
  96        u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
  97
  98        if (unlikely(feature_name)) {
  99                long xfeature_idx, max_idx;
 100                u64 xfeatures_print;
 101                /*
 102                 * So we use FLS here to be able to print the most advanced
  103                 * We use fls64() here to be able to print the most advanced
  104                 * feature that was requested but is missing. That way, if a driver
  105                 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
  106                 * missing AVX feature - this is the most informative message
  107                 * to users:
 108                if (xfeatures_missing)
 109                        xfeatures_print = xfeatures_missing;
 110                else
 111                        xfeatures_print = xfeatures_needed;
 112
 113                xfeature_idx = fls64(xfeatures_print)-1;
 114                max_idx = ARRAY_SIZE(xfeature_names)-1;
 115                xfeature_idx = min(xfeature_idx, max_idx);
 116
 117                *feature_name = xfeature_names[xfeature_idx];
 118        }
 119
 120        if (xfeatures_missing)
 121                return 0;
 122
 123        return 1;
 124}
 125EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
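/*
 * Illustrative usage sketch, kept in a comment so it is not built: a caller
 * that needs AVX could probe for it roughly as below.  Only
 * cpu_has_xfeatures() and the XFEATURE_MASK_* constants are taken from this
 * file; the surrounding error handling is hypothetical.
 *
 *	const char *name;
 *
 *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name)) {
 *		pr_info("xstate feature '%s' missing, cannot use AVX\n", name);
 *		return -ENODEV;
 *	}
 */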
 126
 127static bool xfeature_is_supervisor(int xfeature_nr)
 128{
 129        /*
 130         * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1)
 131         * returns ECX[0] set to (1) for a supervisor state, and cleared (0)
 132         * for a user state.
 133         */
 134        u32 eax, ebx, ecx, edx;
 135
 136        cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
 137        return ecx & 1;
 138}
 139
 140/*
 141 * Enable the extended processor state save/restore feature.
 142 * Called once per CPU onlining.
 143 */
 144void fpu__init_cpu_xstate(void)
 145{
 146        if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
 147                return;
 148
 149        cr4_set_bits(X86_CR4_OSXSAVE);
 150
 151        /*
 152         * Must happen after CR4 setup and before xsetbv() to allow KVM
 153         * lazy passthrough.  Write independent of the dynamic state static
 154         * key as that does not work on the boot CPU. This also ensures
 155         * that any stale state is wiped out from XFD.
 156         */
 157        if (cpu_feature_enabled(X86_FEATURE_XFD))
 158                wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
 159
 160        /*
 161         * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
 162         * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
 163         * states can be set here.
 164         */
 165        xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
 166
 167        /*
 168         * MSR_IA32_XSS sets supervisor states managed by XSAVES.
 169         */
 170        if (boot_cpu_has(X86_FEATURE_XSAVES)) {
 171                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
 172                                     xfeatures_mask_independent());
 173        }
 174}
 175
 176static bool xfeature_enabled(enum xfeature xfeature)
 177{
 178        return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
 179}
 180
 181/*
 182 * Record the offsets and sizes of various xstates contained
 183 * in the XSAVE state memory layout.
 184 */
 185static void __init setup_xstate_features(void)
 186{
 187        u32 eax, ebx, ecx, edx, i;
 188        /* start at the beginning of the "extended state" */
 189        unsigned int last_good_offset = offsetof(struct xregs_state,
 190                                                 extended_state_area);
 191        /*
 192         * The FP xstates and SSE xstates are legacy states. They are always
 193         * in the fixed offsets in the xsave area in either compacted form
 194         * or standard form.
 195         */
 196        xstate_offsets[XFEATURE_FP]     = 0;
 197        xstate_sizes[XFEATURE_FP]       = offsetof(struct fxregs_state,
 198                                                   xmm_space);
 199
 200        xstate_offsets[XFEATURE_SSE]    = xstate_sizes[XFEATURE_FP];
 201        xstate_sizes[XFEATURE_SSE]      = sizeof_field(struct fxregs_state,
 202                                                       xmm_space);
 203
 204        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
 205                cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
 206
 207                xstate_sizes[i] = eax;
 208
 209                /*
  210                 * If an xfeature is a supervisor state, the offset reported
  211                 * in EBX is invalid; leave it at -1.
 212                 */
 213                if (xfeature_is_supervisor(i))
 214                        continue;
 215
 216                xstate_offsets[i] = ebx;
 217
 218                /*
 219                 * In our xstate size checks, we assume that the highest-numbered
 220                 * xstate feature has the highest offset in the buffer.  Ensure
 221                 * it does.
 222                 */
 223                WARN_ONCE(last_good_offset > xstate_offsets[i],
 224                          "x86/fpu: misordered xstate at %d\n", last_good_offset);
 225
 226                last_good_offset = xstate_offsets[i];
 227        }
 228}
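/*
 * Worked example of the standard-format layout built above (the numbers are
 * typical CPUID values, not guaranteed for every CPU): if leaf 0xD reports
 * YMM at offset 576 with size 256 and PKRU at offset 2688 with size 8, the
 * loop leaves
 *
 *	xstate_offsets[XFEATURE_YMM]  = 576,  xstate_sizes[XFEATURE_YMM]  = 256
 *	xstate_offsets[XFEATURE_PKRU] = 2688, xstate_sizes[XFEATURE_PKRU] = 8
 *
 * while supervisor states keep their offset at -1 because EBX is not valid
 * for them.
 */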
 229
 230static void __init print_xstate_feature(u64 xstate_mask)
 231{
 232        const char *feature_name;
 233
 234        if (cpu_has_xfeatures(xstate_mask, &feature_name))
 235                pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
 236}
 237
 238/*
 239 * Print out all the supported xstate features:
 240 */
 241static void __init print_xstate_features(void)
 242{
 243        print_xstate_feature(XFEATURE_MASK_FP);
 244        print_xstate_feature(XFEATURE_MASK_SSE);
 245        print_xstate_feature(XFEATURE_MASK_YMM);
 246        print_xstate_feature(XFEATURE_MASK_BNDREGS);
 247        print_xstate_feature(XFEATURE_MASK_BNDCSR);
 248        print_xstate_feature(XFEATURE_MASK_OPMASK);
 249        print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
 250        print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
 251        print_xstate_feature(XFEATURE_MASK_PKRU);
 252        print_xstate_feature(XFEATURE_MASK_PASID);
 253        print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
 254        print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
 255}
 256
 257/*
  258 * This check is important because it is easy to get an xfeature number
  259 * (XFEATURE_*) confused with its mask bit (XFEATURE_MASK_*).
 260 */
 261#define CHECK_XFEATURE(nr) do {         \
 262        WARN_ON(nr < FIRST_EXTENDED_XFEATURE);  \
 263        WARN_ON(nr >= XFEATURE_MAX);    \
 264} while (0)
 265
 266/*
  267 * We could cache this like xstate_sizes[], but we only use
 268 * it here, so it would be a waste of space.
 269 */
 270static int xfeature_is_aligned(int xfeature_nr)
 271{
 272        u32 eax, ebx, ecx, edx;
 273
 274        CHECK_XFEATURE(xfeature_nr);
 275
 276        if (!xfeature_enabled(xfeature_nr)) {
 277                WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n",
 278                          xfeature_nr);
 279                return 0;
 280        }
 281
 282        cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
 283        /*
 284         * The value returned by ECX[1] indicates the alignment
 285         * of state component 'i' when the compacted format
 286         * of the extended region of an XSAVE area is used:
 287         */
 288        return !!(ecx & 2);
 289}
 290
 291/*
 292 * This function sets up offsets and sizes of all extended states in
 293 * xsave area. This supports both standard format and compacted format
 294 * of the xsave area.
 295 */
 296static void __init setup_xstate_comp_offsets(void)
 297{
 298        unsigned int next_offset;
 299        int i;
 300
 301        /*
 302         * The FP xstates and SSE xstates are legacy states. They are always
 303         * in the fixed offsets in the xsave area in either compacted form
 304         * or standard form.
 305         */
 306        xstate_comp_offsets[XFEATURE_FP] = 0;
 307        xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state,
 308                                                     xmm_space);
 309
 310        if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) {
 311                for_each_extended_xfeature(i, fpu_kernel_cfg.max_features)
 312                        xstate_comp_offsets[i] = xstate_offsets[i];
 313                return;
 314        }
 315
 316        next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
 317
 318        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
 319                if (xfeature_is_aligned(i))
 320                        next_offset = ALIGN(next_offset, 64);
 321
 322                xstate_comp_offsets[i] = next_offset;
 323                next_offset += xstate_sizes[i];
 324        }
 325}
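/*
 * Worked example of the compacted layout (illustrative sizes, assuming only
 * FP, SSE, YMM and PKRU are enabled and XSAVES is available):
 *
 *	next_offset               = 512 + 64 = 576	(FXSAVE + header)
 *	xstate_comp_offsets[YMM]  = 576, next_offset += 256 -> 832
 *	xstate_comp_offsets[PKRU] = 832, next_offset +=   8 -> 840
 *
 * Disabled features such as the AVX-512 states take no space at all here,
 * unlike in the standard format handled by the early return above.
 */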
 326
 327/*
 328 * Setup offsets of a supervisor-state-only XSAVES buffer:
 329 *
 330 * The offsets stored in xstate_comp_offsets[] only work for one specific
 331 * value of the Requested Feature BitMap (RFBM).  In cases where a different
 332 * RFBM value is used, a different set of offsets is required.  This set of
 333 * offsets is for when RFBM=xfeatures_mask_supervisor().
 334 */
 335static void __init setup_supervisor_only_offsets(void)
 336{
 337        unsigned int next_offset;
 338        int i;
 339
 340        next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
 341
 342        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
 343                if (!xfeature_is_supervisor(i))
 344                        continue;
 345
 346                if (xfeature_is_aligned(i))
 347                        next_offset = ALIGN(next_offset, 64);
 348
 349                xstate_supervisor_only_offsets[i] = next_offset;
 350                next_offset += xstate_sizes[i];
 351        }
 352}
 353
 354/*
 355 * Print out xstate component offsets and sizes
 356 */
 357static void __init print_xstate_offset_size(void)
 358{
 359        int i;
 360
 361        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
 362                pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
 363                         i, xstate_comp_offsets[i], i, xstate_sizes[i]);
 364        }
 365}
 366
 367/*
 368 * This function is called only during boot time when x86 caps are not set
  369 * up and alternatives cannot be used yet.
 370 */
 371static __init void os_xrstor_booting(struct xregs_state *xstate)
 372{
 373        u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
 374        u32 lmask = mask;
 375        u32 hmask = mask >> 32;
 376        int err;
 377
 378        if (cpu_feature_enabled(X86_FEATURE_XSAVES))
 379                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
 380        else
 381                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
 382
 383        /*
 384         * We should never fault when copying from a kernel buffer, and the FPU
 385         * state we set at boot time should be valid.
 386         */
 387        WARN_ON_FPU(err);
 388}
 389
 390/*
 391 * All supported features have either init state all zeros or are
  392 * handled in setup_init_fpu_buf() individually. This is an explicit
 393 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 394 * newly added supported features at build time and make people
 395 * actually look at the init state for the new feature.
 396 */
 397#define XFEATURES_INIT_FPSTATE_HANDLED          \
 398        (XFEATURE_MASK_FP |                     \
 399         XFEATURE_MASK_SSE |                    \
 400         XFEATURE_MASK_YMM |                    \
 401         XFEATURE_MASK_OPMASK |                 \
 402         XFEATURE_MASK_ZMM_Hi256 |              \
 403         XFEATURE_MASK_Hi16_ZMM  |              \
 404         XFEATURE_MASK_PKRU |                   \
 405         XFEATURE_MASK_BNDREGS |                \
 406         XFEATURE_MASK_BNDCSR |                 \
 407         XFEATURE_MASK_PASID |                  \
 408         XFEATURE_MASK_XTILE)
 409
 410/*
 411 * setup the xstate image representing the init state
 412 */
 413static void __init setup_init_fpu_buf(void)
 414{
 415        BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
 416                      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
 417                     XFEATURES_INIT_FPSTATE_HANDLED);
 418
 419        if (!boot_cpu_has(X86_FEATURE_XSAVE))
 420                return;
 421
 422        setup_xstate_features();
 423        print_xstate_features();
 424
 425        xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features);
 426
 427        /*
  428         * Init all the feature states with header.xfeatures being 0x0
 429         */
 430        os_xrstor_booting(&init_fpstate.regs.xsave);
 431
 432        /*
 433         * All components are now in init state. Read the state back so
 434         * that init_fpstate contains all non-zero init state. This only
 435         * works with XSAVE, but not with XSAVEOPT and XSAVES because
 436         * those use the init optimization which skips writing data for
 437         * components in init state.
 438         *
 439         * XSAVE could be used, but that would require to reshuffle the
 440         * data when XSAVES is available because XSAVES uses xstate
 441         * compaction. But doing so is a pointless exercise because most
 442         * components have an all zeros init state except for the legacy
 443         * ones (FP and SSE). Those can be saved with FXSAVE into the
 444         * legacy area. Adding new features requires to ensure that init
 445         * state is all zeroes or if not to add the necessary handling
 446         * here.
 447         */
 448        fxsave(&init_fpstate.regs.fxsave);
 449}
 450
 451static int xfeature_uncompacted_offset(int xfeature_nr)
 452{
 453        u32 eax, ebx, ecx, edx;
 454
 455        /*
 456         * Only XSAVES supports supervisor states and it uses compacted
 457         * format. Checking a supervisor state's uncompacted offset is
 458         * an error.
 459         */
 460        if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) {
 461                WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
 462                return -1;
 463        }
 464
 465        CHECK_XFEATURE(xfeature_nr);
 466        cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
 467        return ebx;
 468}
 469
 470int xfeature_size(int xfeature_nr)
 471{
 472        u32 eax, ebx, ecx, edx;
 473
 474        CHECK_XFEATURE(xfeature_nr);
 475        cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
 476        return eax;
 477}
 478
 479/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
 480static int validate_user_xstate_header(const struct xstate_header *hdr,
 481                                       struct fpstate *fpstate)
 482{
 483        /* No unknown or supervisor features may be set */
 484        if (hdr->xfeatures & ~fpstate->user_xfeatures)
 485                return -EINVAL;
 486
 487        /* Userspace must use the uncompacted format */
 488        if (hdr->xcomp_bv)
 489                return -EINVAL;
 490
 491        /*
  492         * If 'reserved' is shrunk to add a new field, make sure to validate
 493         * that new field here!
 494         */
 495        BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
 496
 497        /* No reserved bits may be set */
 498        if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
 499                return -EINVAL;
 500
 501        return 0;
 502}
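/*
 * Sketch of what the validation above accepts and rejects (the header
 * values are hypothetical):
 *
 *	hdr.xfeatures = XFEATURE_MASK_FP | XFEATURE_MASK_SSE, hdr.xcomp_bv = 0
 *		-> accepted, provided both bits are in fpstate->user_xfeatures
 *	hdr.xcomp_bv != 0
 *		-> rejected, userspace may only hand in the uncompacted format
 *	any non-zero byte in hdr.reserved
 *		-> rejected
 */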
 503
 504static void __init __xstate_dump_leaves(void)
 505{
 506        int i;
 507        u32 eax, ebx, ecx, edx;
 508        static int should_dump = 1;
 509
 510        if (!should_dump)
 511                return;
 512        should_dump = 0;
 513        /*
 514         * Dump out a few leaves past the ones that we support
 515         * just in case there are some goodies up there
 516         */
 517        for (i = 0; i < XFEATURE_MAX + 10; i++) {
 518                cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
 519                pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
 520                        XSTATE_CPUID, i, eax, ebx, ecx, edx);
 521        }
 522}
 523
 524#define XSTATE_WARN_ON(x) do {                                                  \
 525        if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {        \
 526                __xstate_dump_leaves();                                         \
 527        }                                                                       \
 528} while (0)
 529
 530#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {                      \
 531        if ((nr == nr_macro) &&                                         \
 532            WARN_ONCE(sz != sizeof(__struct),                           \
 533                "%s: struct is %zu bytes, cpu state %d bytes\n",        \
 534                __stringify(nr_macro), sizeof(__struct), sz)) {         \
 535                __xstate_dump_leaves();                                 \
 536        }                                                               \
 537} while (0)
 538
 539/**
 540 * check_xtile_data_against_struct - Check tile data state size.
 541 *
  542 * Calculate the state size by multiplying the size of a single tile, which
  543 * is recorded in a C struct, by the number of tiles that the CPU reports.
  544 * Compare the provided size with the calculation.
 545 *
 546 * @size:       The tile data state size
 547 *
 548 * Returns:     0 on success, -EINVAL on mismatch.
 549 */
 550static int __init check_xtile_data_against_struct(int size)
 551{
 552        u32 max_palid, palid, state_size;
 553        u32 eax, ebx, ecx, edx;
 554        u16 max_tile;
 555
 556        /*
 557         * Check the maximum palette id:
 558         *   eax: the highest numbered palette subleaf.
 559         */
 560        cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
 561
 562        /*
 563         * Cross-check each tile size and find the maximum number of
 564         * supported tiles.
 565         */
 566        for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
 567                u16 tile_size, max;
 568
 569                /*
 570                 * Check the tile size info:
  571                 *   eax[31:16]:  bytes per tile
 572                 *   ebx[31:16]:  the max names (or max number of tiles)
 573                 */
  574                cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
 575                tile_size = eax >> 16;
 576                max = ebx >> 16;
 577
 578                if (tile_size != sizeof(struct xtile_data)) {
 579                        pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
 580                               __stringify(XFEATURE_XTILE_DATA),
 581                               sizeof(struct xtile_data), tile_size);
 582                        __xstate_dump_leaves();
 583                        return -EINVAL;
 584                }
 585
 586                if (max > max_tile)
 587                        max_tile = max;
 588        }
 589
 590        state_size = sizeof(struct xtile_data) * max_tile;
 591        if (size != state_size) {
 592                pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
 593                       __stringify(XFEATURE_XTILE_DATA), state_size, size);
 594                __xstate_dump_leaves();
 595                return -EINVAL;
 596        }
 597        return 0;
 598}
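/*
 * Worked example with the numbers reported by the first AMX implementations
 * (other implementations may differ): palette 1 enumerates tiles of 1024
 * bytes and 8 tile registers, so
 *
 *	state_size = sizeof(struct xtile_data) * max_tile = 1024 * 8 = 8192
 *
 * which must match the XFEATURE_XTILE_DATA size from CPUID leaf 0xD.
 */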
 599
 600/*
 601 * We have a C struct for each 'xstate'.  We need to ensure
 602 * that our software representation matches what the CPU
 603 * tells us about the state's size.
 604 */
 605static bool __init check_xstate_against_struct(int nr)
 606{
 607        /*
 608         * Ask the CPU for the size of the state.
 609         */
 610        int sz = xfeature_size(nr);
 611        /*
 612         * Match each CPU state with the corresponding software
 613         * structure.
 614         */
 615        XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
 616        XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
 617        XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
 618        XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
 619        XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
 620        XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
 621        XCHECK_SZ(sz, nr, XFEATURE_PKRU,      struct pkru_state);
 622        XCHECK_SZ(sz, nr, XFEATURE_PASID,     struct ia32_pasid_state);
 623        XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
 624
 625        /* The tile data size varies between implementations. */
 626        if (nr == XFEATURE_XTILE_DATA)
 627                check_xtile_data_against_struct(sz);
 628
 629        /*
  630         * Make *SURE* to add any feature numbers below if
 631         * there are "holes" in the xsave state component
 632         * numbers.
 633         */
 634        if ((nr < XFEATURE_YMM) ||
 635            (nr >= XFEATURE_MAX) ||
 636            (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
 637            ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
 638                WARN_ONCE(1, "no structure for xstate: %d\n", nr);
 639                XSTATE_WARN_ON(1);
 640                return false;
 641        }
 642        return true;
 643}
 644
 645static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
 646{
 647        unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
 648        int i;
 649
 650        for_each_extended_xfeature(i, xfeatures) {
 651                /* Align from the end of the previous feature */
 652                if (xfeature_is_aligned(i))
 653                        size = ALIGN(size, 64);
 654                /*
 655                 * In compacted format the enabled features are packed,
 656                 * i.e. disabled features do not occupy space.
 657                 *
 658                 * In non-compacted format the offsets are fixed and
 659                 * disabled states still occupy space in the memory buffer.
 660                 */
 661                if (!compacted)
 662                        size = xfeature_uncompacted_offset(i);
 663                /*
 664                 * Add the feature size even for non-compacted format
 665                 * to make the end result correct
 666                 */
 667                size += xfeature_size(i);
 668        }
 669        return size;
 670}
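/*
 * Illustrative comparison (example numbers consistent with the layouts
 * sketched earlier, not taken from a specific CPU): for
 * xfeatures = FP | SSE | YMM | PKRU,
 *
 *	compacted:	512 + 64 + 256 + 8 = 840 bytes
 *	non-compacted:	xfeature_uncompacted_offset(PKRU) + xfeature_size(PKRU)
 *			e.g. 2688 + 8 = 2696 bytes
 *
 * because in the standard format every disabled feature between YMM and
 * PKRU still occupies its fixed slot in the buffer.
 */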
 671
 672/*
 673 * This essentially double-checks what the cpu told us about
 674 * how large the XSAVE buffer needs to be.  We are recalculating
 675 * it to be safe.
 676 *
 677 * Independent XSAVE features allocate their own buffers and are not
 678 * covered by these checks. Only the size of the buffer for task->fpu
 679 * is checked here.
 680 */
 681static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
 682{
 683        bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
 684        unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
 685        int i;
 686
 687        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
 688                if (!check_xstate_against_struct(i))
 689                        return false;
 690                /*
 691                 * Supervisor state components can be managed only by
 692                 * XSAVES.
 693                 */
 694                if (!compacted && xfeature_is_supervisor(i)) {
 695                        XSTATE_WARN_ON(1);
 696                        return false;
 697                }
 698        }
 699        size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
 700        XSTATE_WARN_ON(size != kernel_size);
 701        return size == kernel_size;
 702}
 703
 704/*
 705 * Get total size of enabled xstates in XCR0 | IA32_XSS.
 706 *
 707 * Note the SDM's wording here.  "sub-function 0" only enumerates
 708 * the size of the *user* states.  If we use it to size a buffer
 709 * that we use 'XSAVES' on, we could potentially overflow the
 710 * buffer because 'XSAVES' saves system states too.
 711 */
 712static unsigned int __init get_xsaves_size(void)
 713{
 714        unsigned int eax, ebx, ecx, edx;
 715        /*
 716         * - CPUID function 0DH, sub-function 1:
 717         *    EBX enumerates the size (in bytes) required by
 718         *    the XSAVES instruction for an XSAVE area
 719         *    containing all the state components
 720         *    corresponding to bits currently set in
 721         *    XCR0 | IA32_XSS.
 722         */
 723        cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
 724        return ebx;
 725}
 726
 727/*
 728 * Get the total size of the enabled xstates without the independent supervisor
 729 * features.
 730 */
 731static unsigned int __init get_xsaves_size_no_independent(void)
 732{
 733        u64 mask = xfeatures_mask_independent();
 734        unsigned int size;
 735
 736        if (!mask)
 737                return get_xsaves_size();
 738
 739        /* Disable independent features. */
 740        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
 741
 742        /*
 743         * Ask the hardware what size is required of the buffer.
 744         * This is the size required for the task->fpu buffer.
 745         */
 746        size = get_xsaves_size();
 747
 748        /* Re-enable independent features so XSAVES will work on them again. */
 749        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
 750
 751        return size;
 752}
 753
 754static unsigned int __init get_xsave_size_user(void)
 755{
 756        unsigned int eax, ebx, ecx, edx;
 757        /*
 758         * - CPUID function 0DH, sub-function 0:
 759         *    EBX enumerates the size (in bytes) required by
 760         *    the XSAVE instruction for an XSAVE area
 761         *    containing all the *user* state components
 762         *    corresponding to bits currently set in XCR0.
 763         */
 764        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 765        return ebx;
 766}
 767
 768/*
 769 * Will the runtime-enumerated 'xstate_size' fit in the init
 770 * task's statically-allocated buffer?
 771 */
 772static bool __init is_supported_xstate_size(unsigned int test_xstate_size)
 773{
 774        if (test_xstate_size <= sizeof(init_fpstate.regs))
 775                return true;
 776
 777        pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
 778                        sizeof(init_fpstate.regs), test_xstate_size);
 779        return false;
 780}
 781
 782static int __init init_xstate_size(void)
 783{
 784        /* Recompute the context size for enabled features: */
 785        unsigned int user_size, kernel_size, kernel_default_size;
 786        bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
 787
 788        /* Uncompacted user space size */
 789        user_size = get_xsave_size_user();
 790
 791        /*
 792         * XSAVES kernel size includes supervisor states and
 793         * uses compacted format when available.
 794         *
 795         * XSAVE does not support supervisor states so
 796         * kernel and user size is identical.
 797         */
 798        if (compacted)
 799                kernel_size = get_xsaves_size_no_independent();
 800        else
 801                kernel_size = user_size;
 802
 803        kernel_default_size =
 804                xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
 805
 806        /* Ensure we have the space to store all default enabled features. */
 807        if (!is_supported_xstate_size(kernel_default_size))
 808                return -EINVAL;
 809
 810        if (!paranoid_xstate_size_valid(kernel_size))
 811                return -EINVAL;
 812
 813        fpu_kernel_cfg.max_size = kernel_size;
 814        fpu_user_cfg.max_size = user_size;
 815
 816        fpu_kernel_cfg.default_size = kernel_default_size;
 817        fpu_user_cfg.default_size =
 818                xstate_calculate_size(fpu_user_cfg.default_features, false);
 819
 820        return 0;
 821}
 822
 823/*
 824 * We enabled the XSAVE hardware, but something went wrong and
 825 * we can not use it.  Disable it.
 826 */
 827static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
 828{
 829        fpu_kernel_cfg.max_features = 0;
 830        cr4_clear_bits(X86_CR4_OSXSAVE);
 831        setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 832
  833        /* Restore the legacy size. */
 834        fpu_kernel_cfg.max_size = legacy_size;
 835        fpu_kernel_cfg.default_size = legacy_size;
 836        fpu_user_cfg.max_size = legacy_size;
 837        fpu_user_cfg.default_size = legacy_size;
 838
 839        /*
 840         * Prevent enabling the static branch which enables writes to the
 841         * XFD MSR.
 842         */
 843        init_fpstate.xfd = 0;
 844
 845        fpstate_reset(&current->thread.fpu);
 846}
 847
 848/*
 849 * Enable and initialize the xsave feature.
 850 * Called once per system bootup.
 851 */
 852void __init fpu__init_system_xstate(unsigned int legacy_size)
 853{
 854        unsigned int eax, ebx, ecx, edx;
 855        u64 xfeatures;
 856        int err;
 857        int i;
 858
 859        if (!boot_cpu_has(X86_FEATURE_FPU)) {
 860                pr_info("x86/fpu: No FPU detected\n");
 861                return;
 862        }
 863
 864        if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
 865                pr_info("x86/fpu: x87 FPU will use %s\n",
 866                        boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
 867                return;
 868        }
 869
 870        if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
 871                WARN_ON_FPU(1);
 872                return;
 873        }
 874
 875        /*
 876         * Find user xstates supported by the processor.
 877         */
 878        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
 879        fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
 880
 881        /*
 882         * Find supervisor xstates supported by the processor.
 883         */
 884        cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
 885        fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
 886
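        /*
         * Example of how the two sub-leaves combine (illustrative values):
         * if sub-leaf 0 returns EAX = 0x207 (FP, SSE, YMM, PKRU) and
         * sub-leaf 1 returns ECX = 0x400 (PASID), max_features ends up as
         * 0x607 before the filtering below.
         */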
 887        if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
 888                /*
 889                 * This indicates that something really unexpected happened
 890                 * with the enumeration.  Disable XSAVE and try to continue
 891                 * booting without it.  This is too early to BUG().
 892                 */
 893                pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
 894                       fpu_kernel_cfg.max_features);
 895                goto out_disable;
 896        }
 897
 898        /*
 899         * Clear XSAVE features that are disabled in the normal CPUID.
 900         */
 901        for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
 902                unsigned short cid = xsave_cpuid_features[i];
 903
 904                /* Careful: X86_FEATURE_FPU is 0! */
 905                if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
 906                        fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
 907        }
 908
 909        if (!cpu_feature_enabled(X86_FEATURE_XFD))
 910                fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
 911
 912        fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
 913                              XFEATURE_MASK_SUPERVISOR_SUPPORTED;
 914
 915        fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
 916        fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
 917
 918        /* Clean out dynamic features from default */
 919        fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
 920        fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
 921
 922        fpu_user_cfg.default_features = fpu_user_cfg.max_features;
 923        fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
 924
 925        /* Store it for paranoia check at the end */
 926        xfeatures = fpu_kernel_cfg.max_features;
 927
 928        /*
  929         * Initialize the default XFD state in init_fpstate and enable the
 930         * dynamic sizing mechanism if dynamic states are available.  The
 931         * static key cannot be enabled here because this runs before
 932         * jump_label_init(). This is delayed to an initcall.
 933         */
 934        init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
 935
 936        /* Enable xstate instructions to be able to continue with initialization: */
 937        fpu__init_cpu_xstate();
 938        err = init_xstate_size();
 939        if (err)
 940                goto out_disable;
 941
 942        /* Reset the state for the current task */
 943        fpstate_reset(&current->thread.fpu);
 944
 945        /*
 946         * Update info used for ptrace frames; use standard-format size and no
 947         * supervisor xstates:
 948         */
 949        update_regset_xstate_info(fpu_user_cfg.max_size,
 950                                  fpu_user_cfg.max_features);
 951
 952        setup_init_fpu_buf();
 953        setup_xstate_comp_offsets();
 954        setup_supervisor_only_offsets();
 955
 956        /*
 957         * Paranoia check whether something in the setup modified the
 958         * xfeatures mask.
 959         */
 960        if (xfeatures != fpu_kernel_cfg.max_features) {
 961                pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
 962                       xfeatures, fpu_kernel_cfg.max_features);
 963                goto out_disable;
 964        }
 965
 966        print_xstate_offset_size();
 967        pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
 968                fpu_kernel_cfg.max_features,
 969                fpu_kernel_cfg.max_size,
 970                boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
 971        return;
 972
 973out_disable:
 974        /* something went wrong, try to boot without any XSAVE support */
 975        fpu__init_disable_system_xstate(legacy_size);
 976}
 977
 978/*
 979 * Restore minimal FPU state after suspend:
 980 */
 981void fpu__resume_cpu(void)
 982{
 983        /*
 984         * Restore XCR0 on xsave capable CPUs:
 985         */
 986        if (cpu_feature_enabled(X86_FEATURE_XSAVE))
 987                xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
 988
 989        /*
 990         * Restore IA32_XSS. The same CPUID bit enumerates support
 991         * of XSAVES and MSR_IA32_XSS.
 992         */
 993        if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
 994                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
 995                                     xfeatures_mask_independent());
 996        }
 997
 998        if (fpu_state_size_dynamic())
 999                wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
1000}
1001
1002/*
1003 * Given an xstate feature nr, calculate where in the xsave
1004 * buffer the state is.  Callers should ensure that the buffer
1005 * is valid.
1006 */
1007static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
1008{
1009        if (!xfeature_enabled(xfeature_nr)) {
1010                WARN_ON_FPU(1);
1011                return NULL;
1012        }
1013
1014        return (void *)xsave + xstate_comp_offsets[xfeature_nr];
1015}
1016/*
1017 * Given the xsave area and a state inside, this function returns the
1018 * address of the state.
1019 *
1020 * This is the API that is called to get xstate address in either
1021 * standard format or compacted format of xsave area.
1022 *
1023 * Note that if there is no data for the field in the xsave buffer
1024 * this will return NULL.
1025 *
1026 * Inputs:
 1027 *      xsave: the thread's storage area for all FPU data
1028 *      xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
1029 *      XFEATURE_SSE, etc...)
1030 * Output:
1031 *      address of the state in the xsave area, or NULL if the
1032 *      field is not present in the xsave buffer.
1033 */
1034void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
1035{
1036        /*
1037         * Do we even *have* xsave state?
1038         */
1039        if (!boot_cpu_has(X86_FEATURE_XSAVE))
1040                return NULL;
1041
1042        /*
1043         * We should not ever be requesting features that we
1044         * have not enabled.
1045         */
1046        WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)),
1047                  "get of unsupported state");
1048        /*
 1049         * This assumes that the last 'xsave*' instruction
 1050         * requested that 'xfeature_nr' be saved.
 1051         * If it did not, we might be seeing an old value
 1052         * of the field in the buffer.
1053         *
1054         * This can happen because the last 'xsave' did not
1055         * request that this feature be saved (unlikely)
1056         * or because the "init optimization" caused it
1057         * to not be saved.
1058         */
1059        if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
1060                return NULL;
1061
1062        return __raw_xsave_addr(xsave, xfeature_nr);
1063}
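/*
 * Illustrative usage sketch (in a comment, not built): a reader of the PKRU
 * component must handle the NULL return for the init-state case.  Only
 * get_xsave_addr(), XFEATURE_PKRU and struct pkru_state are taken from the
 * kernel; the local variables are hypothetical.
 *
 *	struct pkru_state *pk = get_xsave_addr(xsave, XFEATURE_PKRU);
 *	u32 pkru_val = pk ? pk->pkru : 0;	(NULL: PKRU is in init state)
 */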
1064
1065#ifdef CONFIG_ARCH_HAS_PKEYS
1066
1067/*
 1068 * This will go out and modify the PKRU register to set the access
1069 * rights for @pkey to @init_val.
1070 */
1071int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1072                              unsigned long init_val)
1073{
1074        u32 old_pkru, new_pkru_bits = 0;
1075        int pkey_shift;
1076
1077        /*
1078         * This check implies XSAVE support.  OSPKE only gets
1079         * set if we enable XSAVE and we enable PKU in XCR0.
1080         */
1081        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1082                return -EINVAL;
1083
1084        /*
1085         * This code should only be called with valid 'pkey'
1086         * values originating from in-kernel users.  Complain
1087         * if a bad value is observed.
1088         */
1089        if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1090                return -EINVAL;
1091
1092        /* Set the bits we need in PKRU:  */
1093        if (init_val & PKEY_DISABLE_ACCESS)
1094                new_pkru_bits |= PKRU_AD_BIT;
1095        if (init_val & PKEY_DISABLE_WRITE)
1096                new_pkru_bits |= PKRU_WD_BIT;
1097
1098        /* Shift the bits in to the correct place in PKRU for pkey: */
1099        pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1100        new_pkru_bits <<= pkey_shift;
1101
1102        /* Get old PKRU and mask off any old bits in place: */
1103        old_pkru = read_pkru();
1104        old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1105
1106        /* Write old part along with new part: */
1107        write_pkru(old_pkru | new_pkru_bits);
1108
1109        return 0;
1110}
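/*
 * Worked example of the bit manipulation above: for pkey = 3 and
 * init_val = PKEY_DISABLE_WRITE,
 *
 *	new_pkru_bits = PKRU_WD_BIT = 0x2
 *	pkey_shift    = 3 * PKRU_BITS_PER_PKEY = 6
 *	new_pkru_bits <<= 6  ->  0x80
 *
 * so only bits [7:6] of PKRU (the AD/WD pair of key 3) are replaced and
 * every other key keeps its previous access rights.
 */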
 1111#endif /* CONFIG_ARCH_HAS_PKEYS */
1112
1113static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1114                         void *init_xstate, unsigned int size)
1115{
1116        membuf_write(to, from_xstate ? xstate : init_xstate, size);
1117}
1118
1119/**
1120 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1121 * @to:         membuf descriptor
1122 * @fpstate:    The fpstate buffer from which to copy
1123 * @pkru_val:   The PKRU value to store in the PKRU component
1124 * @copy_mode:  The requested copy mode
1125 *
1126 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1127 * format, i.e. from the kernel internal hardware dependent storage format
 1128 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
1129 *
1130 * It supports partial copy but @to.pos always starts from zero.
1131 */
1132void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1133                               u32 pkru_val, enum xstate_copy_mode copy_mode)
1134{
1135        const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1136        struct xregs_state *xinit = &init_fpstate.regs.xsave;
1137        struct xregs_state *xsave = &fpstate->regs.xsave;
1138        struct xstate_header header;
1139        unsigned int zerofrom;
1140        u64 mask;
1141        int i;
1142
1143        memset(&header, 0, sizeof(header));
1144        header.xfeatures = xsave->header.xfeatures;
1145
1146        /* Mask out the feature bits depending on copy mode */
1147        switch (copy_mode) {
1148        case XSTATE_COPY_FP:
1149                header.xfeatures &= XFEATURE_MASK_FP;
1150                break;
1151
1152        case XSTATE_COPY_FX:
1153                header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1154                break;
1155
1156        case XSTATE_COPY_XSAVE:
1157                header.xfeatures &= fpstate->user_xfeatures;
1158                break;
1159        }
1160
1161        /* Copy FP state up to MXCSR */
1162        copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1163                     &xinit->i387, off_mxcsr);
1164
1165        /* Copy MXCSR when SSE or YMM are set in the feature mask */
1166        copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1167                     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1168                     MXCSR_AND_FLAGS_SIZE);
1169
1170        /* Copy the remaining FP state */
1171        copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1172                     &to, &xsave->i387.st_space, &xinit->i387.st_space,
1173                     sizeof(xsave->i387.st_space));
1174
1175        /* Copy the SSE state - shared with YMM, but independently managed */
1176        copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1177                     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1178                     sizeof(xsave->i387.xmm_space));
1179
1180        if (copy_mode != XSTATE_COPY_XSAVE)
1181                goto out;
1182
1183        /* Zero the padding area */
1184        membuf_zero(&to, sizeof(xsave->i387.padding));
1185
1186        /* Copy xsave->i387.sw_reserved */
1187        membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1188
1189        /* Copy the user space relevant state of @xsave->header */
1190        membuf_write(&to, &header, sizeof(header));
1191
1192        zerofrom = offsetof(struct xregs_state, extended_state_area);
1193
1194        /*
1195         * The ptrace buffer is in non-compacted XSAVE format.  In
1196         * non-compacted format disabled features still occupy state space,
1197         * but there is no state to copy from in the compacted
1198         * init_fpstate. The gap tracking will zero these states.
1199         */
1200        mask = fpstate->user_xfeatures;
1201
1202        for_each_extended_xfeature(i, mask) {
1203                /*
1204                 * If there was a feature or alignment gap, zero the space
1205                 * in the destination buffer.
1206                 */
1207                if (zerofrom < xstate_offsets[i])
1208                        membuf_zero(&to, xstate_offsets[i] - zerofrom);
1209
1210                if (i == XFEATURE_PKRU) {
1211                        struct pkru_state pkru = {0};
1212                        /*
1213                         * PKRU is not necessarily up to date in the
1214                         * XSAVE buffer. Use the provided value.
1215                         */
1216                        pkru.pkru = pkru_val;
1217                        membuf_write(&to, &pkru, sizeof(pkru));
1218                } else {
1219                        copy_feature(header.xfeatures & BIT_ULL(i), &to,
1220                                     __raw_xsave_addr(xsave, i),
1221                                     __raw_xsave_addr(xinit, i),
1222                                     xstate_sizes[i]);
1223                }
1224                /*
1225                 * Keep track of the last copied state in the non-compacted
1226                 * target buffer for gap zeroing.
1227                 */
1228                zerofrom = xstate_offsets[i] + xstate_sizes[i];
1229        }
1230
1231out:
1232        if (to.left)
1233                membuf_zero(&to, to.left);
1234}
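/*
 * Gap zeroing sketch (offsets are the illustrative ones used earlier):
 * assume the UABI buffer expects YMM at offset 576 (size 256) and PKRU at
 * offset 2688 (size 8) with everything in between disabled.  After the YMM
 * iteration zerofrom is 576 + 256 = 832, so the PKRU iteration first emits
 * membuf_zero(&to, 2688 - 832) before writing the eight PKRU bytes.
 */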
1235
1236/**
1237 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1238 * @to:         membuf descriptor
1239 * @tsk:        The task from which to copy the saved xstate
1240 * @copy_mode:  The requested copy mode
1241 *
1242 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1243 * format, i.e. from the kernel internal hardware dependent storage format
 1244 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
1245 *
1246 * It supports partial copy but @to.pos always starts from zero.
1247 */
1248void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1249                             enum xstate_copy_mode copy_mode)
1250{
1251        __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1252                                  tsk->thread.pkru, copy_mode);
1253}
1254
1255static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1256                            const void *kbuf, const void __user *ubuf)
1257{
1258        if (kbuf) {
1259                memcpy(dst, kbuf + offset, size);
1260        } else {
1261                if (copy_from_user(dst, ubuf + offset, size))
1262                        return -EFAULT;
1263        }
1264        return 0;
1265}
1266
1267
1268static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1269                               const void __user *ubuf)
1270{
1271        struct xregs_state *xsave = &fpstate->regs.xsave;
1272        unsigned int offset, size;
1273        struct xstate_header hdr;
1274        u64 mask;
1275        int i;
1276
1277        offset = offsetof(struct xregs_state, header);
1278        if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1279                return -EFAULT;
1280
1281        if (validate_user_xstate_header(&hdr, fpstate))
1282                return -EINVAL;
1283
1284        /* Validate MXCSR when any of the related features is in use */
1285        mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1286        if (hdr.xfeatures & mask) {
1287                u32 mxcsr[2];
1288
1289                offset = offsetof(struct fxregs_state, mxcsr);
1290                if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1291                        return -EFAULT;
1292
1293                /* Reserved bits in MXCSR must be zero. */
1294                if (mxcsr[0] & ~mxcsr_feature_mask)
1295                        return -EINVAL;
1296
1297                /* SSE and YMM require MXCSR even when FP is not in use. */
1298                if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1299                        xsave->i387.mxcsr = mxcsr[0];
1300                        xsave->i387.mxcsr_mask = mxcsr[1];
1301                }
1302        }
1303
1304        for (i = 0; i < XFEATURE_MAX; i++) {
1305                u64 mask = ((u64)1 << i);
1306
1307                if (hdr.xfeatures & mask) {
1308                        void *dst = __raw_xsave_addr(xsave, i);
1309
1310                        offset = xstate_offsets[i];
1311                        size = xstate_sizes[i];
1312
1313                        if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1314                                return -EFAULT;
1315                }
1316        }
1317
1318        /*
1319         * The state that came in from userspace was user-state only.
1320         * Mask all the user states out of 'xfeatures':
1321         */
1322        xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1323
1324        /*
1325         * Add back in the features that came in from userspace:
1326         */
1327        xsave->header.xfeatures |= hdr.xfeatures;
1328
1329        return 0;
1330}
1331
1332/*
1333 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1334 * format and copy to the target thread. Used by ptrace and KVM.
1335 */
1336int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
1337{
1338        return copy_uabi_to_xstate(fpstate, kbuf, NULL);
1339}
1340
1341/*
1342 * Convert from a sigreturn standard-format user-space buffer to kernel
1343 * XSAVE[S] format and copy to the target thread. This is called from the
1344 * sigreturn() and rt_sigreturn() system calls.
1345 */
1346int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
1347                                      const void __user *ubuf)
1348{
1349        return copy_uabi_to_xstate(fpstate, NULL, ubuf);
1350}
1351
1352static bool validate_independent_components(u64 mask)
1353{
1354        u64 xchk;
1355
1356        if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1357                return false;
1358
1359        xchk = ~xfeatures_mask_independent();
1360
1361        if (WARN_ON_ONCE(!mask || mask & xchk))
1362                return false;
1363
1364        return true;
1365}
1366
1367/**
1368 * xsaves - Save selected components to a kernel xstate buffer
1369 * @xstate:     Pointer to the buffer
1370 * @mask:       Feature mask to select the components to save
1371 *
1372 * The @xstate buffer must be 64 byte aligned and correctly initialized as
1373 * XSAVES does not write the full xstate header. Before first use the
1374 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1375 * can #GP.
1376 *
1377 * The feature mask must be a subset of the independent features.
1378 */
1379void xsaves(struct xregs_state *xstate, u64 mask)
1380{
1381        int err;
1382
1383        if (!validate_independent_components(mask))
1384                return;
1385
1386        XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1387        WARN_ON_ONCE(err);
1388}
1389
1390/**
1391 * xrstors - Restore selected components from a kernel xstate buffer
1392 * @xstate:     Pointer to the buffer
1393 * @mask:       Feature mask to select the components to restore
1394 *
1395 * The @xstate buffer must be 64 byte aligned and correctly initialized
1396 * otherwise XRSTORS from that buffer can #GP.
1397 *
1398 * Proper usage is to restore the state which was saved with
1399 * xsaves() into @xstate.
1400 *
1401 * The feature mask must be a subset of the independent features.
1402 */
1403void xrstors(struct xregs_state *xstate, u64 mask)
1404{
1405        int err;
1406
1407        if (!validate_independent_components(mask))
1408                return;
1409
1410        XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1411        WARN_ON_ONCE(err);
1412}
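/*
 * The lmask/hmask split in the two helpers above mirrors the instruction
 * encoding: XSAVES and XRSTORS take the requested feature bitmap in EDX:EAX.
 * For example, for mask = XFEATURE_MASK_LBR (bit 15), (u32)mask = 0x8000
 * and (u32)(mask >> 32) = 0, so only the low half selects anything.
 */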
1413
1414#if IS_ENABLED(CONFIG_KVM)
1415void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1416{
1417        void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1418
1419        if (addr)
1420                memset(addr, 0, xstate_sizes[xfeature]);
1421}
1422EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1423#endif
1424
1425#ifdef CONFIG_X86_64
1426
1427#ifdef CONFIG_X86_DEBUG_FPU
1428/*
1429 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1430 * can safely operate on the @fpstate buffer.
1431 */
1432static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1433{
1434        u64 xfd = __this_cpu_read(xfd_state);
1435
1436        if (fpstate->xfd == xfd)
1437                return true;
1438
 1439        /*
 1440         * The XFD MSR does not match fpstate->xfd. That's invalid when
 1441         * the passed in fpstate is current's fpstate.
 1442         */
1443        if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1444                return false;
1445
1446        /*
1447         * XRSTOR(S) from init_fpstate are always correct as it will just
1448         * bring all components into init state and not read from the
1449         * buffer. XSAVE(S) raises #PF after init.
1450         */
1451        if (fpstate == &init_fpstate)
1452                return rstor;
1453
1454        /*
1455         * XSAVE(S): clone(), fpu_swap_kvm_fpu()
 1456         * XRSTOR(S): fpu_swap_kvm_fpu()
1457         */
1458
1459        /*
1460         * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1461         * the buffer area for XFD-disabled state components.
1462         */
1463        mask &= ~xfd;
1464
1465        /*
1466         * Remove features which are valid in fpstate, i.e. which
1467         * have space allocated in the buffer.
1468         */
1469        mask &= ~fpstate->xfeatures;
1470
1471        /*
1472         * Any remaining state components in 'mask' might be written
1473         * by XSAVE/XRSTOR. Fail validation if any are found.
1474         */
1475        return !mask;
1476}
1477
1478void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1479{
1480        WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1481}
1482#endif /* CONFIG_X86_DEBUG_FPU */
1483
1484static int __init xfd_update_static_branch(void)
1485{
1486        /*
1487         * If init_fpstate.xfd has bits set then dynamic features are
1488         * available and the dynamic sizing must be enabled.
1489         */
1490        if (init_fpstate.xfd)
1491                static_branch_enable(&__fpu_state_size_dynamic);
1492        return 0;
1493}
1494arch_initcall(xfd_update_static_branch)
1495
1496void fpstate_free(struct fpu *fpu)
1497{
1498        if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1499                vfree(fpu->fpstate);
1500}
1501
1502/**
1503 * fpu_install_fpstate - Update the active fpstate in the FPU
1504 *
1505 * @fpu:        A struct fpu * pointer
1506 * @newfps:     A struct fpstate * pointer
1507 *
1508 * Returns:     A null pointer if the last active fpstate is the embedded
1509 *              one or the new fpstate is already installed;
1510 *              otherwise, a pointer to the old fpstate which has to
1511 *              be freed by the caller.
1512 */
1513static struct fpstate *fpu_install_fpstate(struct fpu *fpu,
1514                                           struct fpstate *newfps)
1515{
1516        struct fpstate *oldfps = fpu->fpstate;
1517
1518        if (fpu->fpstate == newfps)
1519                return NULL;
1520
1521        fpu->fpstate = newfps;
1522        return oldfps != &fpu->__fpstate ? oldfps : NULL;
1523}
1524
1525/**
1526 * fpstate_realloc - Reallocate struct fpstate for the requested new features
1527 *
1528 * @xfeatures:  A bitmap of xstate features which extend the enabled features
1529 *              of that task
1530 * @ksize:      The required size for the kernel buffer
1531 * @usize:      The required size for user space buffers
1532 *
1533 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1534 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1535 * with large states are likely to live longer.
1536 *
1537 * Returns: 0 on success, -ENOMEM on allocation error.
1538 */
1539static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1540                           unsigned int usize)
1541{
1542        struct fpu *fpu = &current->thread.fpu;
1543        struct fpstate *curfps, *newfps = NULL;
1544        unsigned int fpsize;
1545
1546        curfps = fpu->fpstate;
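            /*
             * Size of the new allocation: the fpstate header up to @regs
             * plus @ksize bytes of register image. @regs is 64-byte
             * aligned by its type and vzalloc() returns page-aligned
             * memory, so the ALIGN() keeps XSAVES' alignment requirement
             * intact.
             */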
1547        fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1548
1549        newfps = vzalloc(fpsize);
1550        if (!newfps)
1551                return -ENOMEM;
1552        newfps->size = ksize;
1553        newfps->user_size = usize;
1554        newfps->is_valloc = true;
1555
1556        fpregs_lock();
1557        /*
1558         * Ensure that the current state is in the registers before
1559         * swapping fpstate as that might invalidate it due to layout
1560         * changes.
1561         */
1562        if (test_thread_flag(TIF_NEED_FPU_LOAD))
1563                fpregs_restore_userregs();
1564
1565        newfps->xfeatures = curfps->xfeatures | xfeatures;
1566        newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1567        newfps->xfd = curfps->xfd & ~xfeatures;
1568
1569        curfps = fpu_install_fpstate(fpu, newfps);
1570
1571        /* Do the final updates within the locked region */
1572        xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1573        xfd_update_state(newfps);
1574
1575        fpregs_unlock();
1576
1577        vfree(curfps);
1578        return 0;
1579}
1580
1581static int validate_sigaltstack(unsigned int usize)
1582{
1583        struct task_struct *thread, *leader = current->group_leader;
1584        unsigned long framesize = get_sigframe_size();
1585
1586        lockdep_assert_held(&current->sighand->siglock);
1587
1588        /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1589        framesize -= fpu_user_cfg.max_size;
1590        framesize += usize;
1591        for_each_thread(leader, thread) {
1592                if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1593                        return -ENOSPC;
1594        }
1595        return 0;
1596}
1597
1598static int __xstate_request_perm(u64 permitted, u64 requested)
1599{
1600        /*
1601         * This deliberately does not exclude !XSAVES as we still might
1602         * decide to optionally context switch XCR0 or talk the silicon
1603         * vendors into extending XFD for the pre-AMX states, especially
1604         * AVX512.
1605         */
1606        bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
1607        struct fpu *fpu = &current->group_leader->thread.fpu;
1608        unsigned int ksize, usize;
1609        u64 mask;
1610        int ret;
1611
1612        /* Check whether fully enabled */
1613        if ((permitted & requested) == requested)
1614                return 0;
1615
1616        /* Calculate the resulting kernel state size */
1617        mask = permitted | requested;
1618        ksize = xstate_calculate_size(mask, compacted);
1619
1620        /* Calculate the resulting user state size */
1621        mask &= XFEATURE_MASK_USER_SUPPORTED;
1622        usize = xstate_calculate_size(mask, false);
1623
1624        ret = validate_sigaltstack(usize);
1625        if (ret)
1626                return ret;
1627
1628        /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1629        WRITE_ONCE(fpu->perm.__state_perm, requested);
1630        /* Protected by sighand lock */
1631        fpu->perm.__state_size = ksize;
1632        fpu->perm.__user_state_size = usize;
1633        return ret;
1634}
1635
1636/*
1637 * Permissions array to map facilities with more than one component
1638 */
1639static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1640        [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1641};
1642
1643static int xstate_request_perm(unsigned long idx)
1644{
1645        u64 permitted, requested;
1646        int ret;
1647
1648        if (idx >= XFEATURE_MAX)
1649                return -EINVAL;
1650
1651        /*
1652         * Look up the facility mask which can require more than
1653         * one xstate component.
1654         */
1655        idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1656        requested = xstate_prctl_req[idx];
1657        if (!requested)
1658                return -EOPNOTSUPP;
1659
1660        if ((fpu_user_cfg.max_features & requested) != requested)
1661                return -EOPNOTSUPP;
1662
1663        /* Lockless quick check */
1664        permitted = xstate_get_host_group_perm();
1665        if ((permitted & requested) == requested)
1666                return 0;
1667
1668        /* Protect against concurrent modifications */
1669        spin_lock_irq(&current->sighand->siglock);
1670        permitted = xstate_get_host_group_perm();
1671        ret = __xstate_request_perm(permitted, requested);
1672        spin_unlock_irq(&current->sighand->siglock);
1673        return ret;
1674}
1675
1676int xfd_enable_feature(u64 xfd_err)
1677{
1678        u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1679        unsigned int ksize, usize;
1680        struct fpu *fpu;
1681
1682        if (!xfd_event) {
1683                pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1684                return 0;
1685        }
1686
1687        /* Protect against concurrent modifications */
1688        spin_lock_irq(&current->sighand->siglock);
1689
1690        /* If not permitted, let the task die */
1691        if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) {
1692                spin_unlock_irq(&current->sighand->siglock);
1693                return -EPERM;
1694        }
1695
1696        fpu = &current->group_leader->thread.fpu;
1697        ksize = fpu->perm.__state_size;
1698        usize = fpu->perm.__user_state_size;
1699        /*
1700         * The feature is permitted and the state size is sufficient.
1701         * Dropping the lock is safe here: even if more features are added
1702         * from another task, the retrieved buffer sizes remain valid for
1703         * the currently requested feature(s).
1704         */
1705        spin_unlock_irq(&current->sighand->siglock);
1706
1707        /*
1708         * Try to allocate a new fpstate. If that fails there is no way
1709         * out.
1710         */
1711        if (fpstate_realloc(xfd_event, ksize, usize))
1712                return -EFAULT;
1713        return 0;
1714}
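
/*
 * Illustrative sketch (not part of the kernel tree): xfd_enable_feature()
 * is reached from the #NM exception handler after the CPU latched the
 * faulting component bits in MSR_IA32_XFD_ERR. The real handler lives in
 * arch/x86/kernel/traps.c; this is a simplified approximation of that
 * flow, not a copy of it.
 */
static void __maybe_unused handle_xfd_event_sketch(struct pt_regs *regs)
{
        u64 xfd_err;

        rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
        if (!xfd_err)
                return;

        /* Clear the per-CPU error MSR so stale bits cannot leak elsewhere */
        wrmsrl(MSR_IA32_XFD_ERR, 0);

        switch (xfd_enable_feature(xfd_err)) {
        case -EPERM:
                force_sig_fault(SIGILL, ILL_ILLOPC,
                                (void __user *)instruction_pointer(regs));
                break;
        case -EFAULT:
                force_sig(SIGSEGV);
                break;
        }
}
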
1715#else /* CONFIG_X86_64 */
1716static inline int xstate_request_perm(unsigned long idx)
1717{
1718        return -EPERM;
1719}
1720#endif  /* !CONFIG_X86_64 */
1721
1722/**
1723 * fpu_xstate_prctl - xstate permission operations
1724 * @tsk:        Redundant pointer to current
1725 * @option:     A subfunction of arch_prctl()
1726 * @arg2:       option argument
1727 * Return:      0 if successful; otherwise, an error code
1728 *
1729 * Option arguments:
1730 *
1731 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1732 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1733 * ARCH_REQ_XCOMP_PERM: Facility number requested
1734 *
1735 * For facilities which require more than one XSTATE component, the request
1736 * must be the highest state component number related to that facility,
1737 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1738 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1739 */
1740long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
1741{
1742        u64 __user *uptr = (u64 __user *)arg2;
1743        u64 permitted, supported;
1744        unsigned long idx = arg2;
1745
1746        if (tsk != current)
1747                return -EPERM;
1748
1749        switch (option) {
1750        case ARCH_GET_XCOMP_SUPP:
1751                supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1752                return put_user(supported, uptr);
1753
1754        case ARCH_GET_XCOMP_PERM:
1755                /*
1756                 * Lockless snapshot as the value can also change right after
1757                 * dropping the lock.
1758                 */
1759                permitted = xstate_get_host_group_perm();
1760                permitted &= XFEATURE_MASK_USER_SUPPORTED;
1761                return put_user(permitted, uptr);
1762
1763        case ARCH_REQ_XCOMP_PERM:
1764                if (!IS_ENABLED(CONFIG_X86_64))
1765                        return -EOPNOTSUPP;
1766
1767                return xstate_request_perm(idx);
1768
1769        default:
1770                return -EINVAL;
1771        }
1772}
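
/*
 * Illustrative user space sketch (not part of this file): requesting AMX
 * permission through the interface above. It assumes <asm/prctl.h>
 * exposes the ARCH_*_XCOMP_* constants; XFEATURE_XTILE_DATA_NR (18)
 * mirrors the kernel's XFEATURE_XTILE_DATA and is spelled out here only
 * for illustration.
 */
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/prctl.h>

#define XFEATURE_XTILE_DATA_NR  18      /* highest AMX component number */

static int request_amx_permission(void)
{
        unsigned long long supported = 0;

        if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supported))
                return -1;
        if (!(supported & (1ULL << XFEATURE_XTILE_DATA_NR)))
                return -1;              /* AMX not available */

        /* The highest component number of a facility requests it all */
        return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM,
                       XFEATURE_XTILE_DATA_NR);
}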
1773
1774#ifdef CONFIG_PROC_PID_ARCH_STATUS
1775/*
1776 * Report the time elapsed in milliseconds since the last AVX512
1777 * use by the task.
1778 */
1779static void avx512_status(struct seq_file *m, struct task_struct *task)
1780{
1781        unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1782        long delta;
1783
1784        if (!timestamp) {
1785                /*
1786                 * Report -1 if no AVX512 usage
1787                 */
1788                delta = -1;
1789        } else {
1790                delta = (long)(jiffies - timestamp);
1791                /*
1792                 * Cap to LONG_MAX if time difference > LONG_MAX
1793                 */
1794                if (delta < 0)
1795                        delta = LONG_MAX;
1796                delta = jiffies_to_msecs(delta);
1797        }
1798
1799        seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1800        seq_putc(m, '\n');
1801}
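
/*
 * Illustrative user space sketch (not part of this file): parsing the
 * field emitted above from /proc/<pid>/arch_status. Path and parsing are
 * assumptions; the field name matches the seq_put_decimal_ll() call in
 * avx512_status().
 */
#include <stdio.h>

static long read_avx512_elapsed_ms(void)
{
        char line[128];
        long ms = -1;
        FILE *f = fopen("/proc/self/arch_status", "r");

        if (!f)
                return -1;

        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, "AVX512_elapsed_ms: %ld", &ms) == 1)
                        break;
        }

        fclose(f);
        return ms;      /* -1: no AVX512 use recorded (or not reported) */
}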
1802
1803/*
1804 * Report architecture specific information
1805 */
1806int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1807                        struct pid *pid, struct task_struct *task)
1808{
1809        /*
1810         * Report AVX512 state if the processor and build options support it.
1811         */
1812        if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1813                avx512_status(m, task);
1814
1815        return 0;
1816}
1817#endif /* CONFIG_PROC_PID_ARCH_STATUS */
1818