qemu/target/arm/helper-a64.c
/*
 *  AArch64 specific helpers
 *
 *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "cpu.h"
#include "exec/gdbstub.h"
#include "exec/helper-proto.h"
#include "qemu/host-utils.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/bitops.h"
#include "internals.h"
#include "qemu/crc32c.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "qemu/atomic128.h"
#include "tcg/tcg.h"
#include "fpu/softfloat.h"
#include <zlib.h> /* For crc32 */

/* C2.4.7 Multiply and divide */
/* special cases for 0 and LLONG_MIN are mandated by the standard */
uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
{
    if (den == 0) {
        return 0;
    }
    return num / den;
}

int64_t HELPER(sdiv64)(int64_t num, int64_t den)
{
    if (den == 0) {
        return 0;
    }
    if (num == LLONG_MIN && den == -1) {
        return LLONG_MIN;
    }
    return num / den;
}

uint64_t HELPER(rbit64)(uint64_t x)
{
    return revbit64(x);
}

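/*
 * MSR (immediate) to SPSel: select whether EL0's or the current EL's
 * stack pointer is in use (PSTATE.SP).
 */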
void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
{
    update_spsel(env, imm);
}

static void daif_check(CPUARMState *env, uint32_t op,
                       uint32_t imm, uintptr_t ra)
{
    /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set.  */
    if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) {
        raise_exception_ra(env, EXCP_UDEF,
                           syn_aa64_sysregtrap(0, extract32(op, 0, 3),
                                               extract32(op, 3, 3), 4,
                                               imm, 0x1f, 0),
                           exception_target_el(env), ra);
    }
}

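/*
 * The 4-bit immediate is the instruction's DAIF field; shifting it left
 * by 6 lines it up with PSTATE.{D,A,I,F}, which occupy bits [9:6].
 */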
void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm)
{
    daif_check(env, 0x1e, imm, GETPC());
    env->daif |= (imm << 6) & PSTATE_DAIF;
}

void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm)
{
    daif_check(env, 0x1f, imm, GETPC());
    env->daif &= ~((imm << 6) & PSTATE_DAIF);
}

/* Convert a softfloat float_relation_* value (as returned by
 * the float*_compare functions) to the correct ARM
 * NZCV flag state.
 */
static inline uint32_t float_rel_to_flags(int res)
{
    uint64_t flags;
    switch (res) {
    case float_relation_equal:
        flags = PSTATE_Z | PSTATE_C;
        break;
    case float_relation_less:
        flags = PSTATE_N;
        break;
    case float_relation_greater:
        flags = PSTATE_C;
        break;
    case float_relation_unordered:
    default:
        flags = PSTATE_C | PSTATE_V;
        break;
    }
    return flags;
}

uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status)
{
    return float_rel_to_flags(float16_compare_quiet(x, y, fp_status));
}

uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status)
{
    return float_rel_to_flags(float16_compare(x, y, fp_status));
}

uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status)
{
    return float_rel_to_flags(float32_compare_quiet(x, y, fp_status));
}

uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status)
{
    return float_rel_to_flags(float32_compare(x, y, fp_status));
}

uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status)
{
    return float_rel_to_flags(float64_compare_quiet(x, y, fp_status));
}

uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
{
    return float_rel_to_flags(float64_compare(x, y, fp_status));
}

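/*
 * FMULX: as FMUL, except that zero * infinity returns 2.0 with the
 * sign of the two operands XORed, rather than the default NaN.
 */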
float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float32_squash_input_denormal(a, fpst);
    b = float32_squash_input_denormal(b, fpst);

    if ((float32_is_zero(a) && float32_is_infinity(b)) ||
        (float32_is_infinity(a) && float32_is_zero(b))) {
        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
        return make_float32((1U << 30) |
                            ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
    }
    return float32_mul(a, b, fpst);
}

float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float64_squash_input_denormal(a, fpst);
    b = float64_squash_input_denormal(b, fpst);

    if ((float64_is_zero(a) && float64_is_infinity(b)) ||
        (float64_is_infinity(a) && float64_is_zero(b))) {
        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
        return make_float64((1ULL << 62) |
                            ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
    }
    return float64_mul(a, b, fpst);
}

/* 64bit/double versions of the neon float compare functions */
uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float64_eq_quiet(a, b, fpst);
}

uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float64_le(b, a, fpst);
}

uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float64_lt(b, a, fpst);
}

/* Reciprocal step and sqrt step. Note that unlike the A32/T32
 * versions, these do a fully fused multiply-add or
 * multiply-add-and-halve.
 */

uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float16_squash_input_denormal(a, fpst);
    b = float16_squash_input_denormal(b, fpst);

    a = float16_chs(a);
    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
        (float16_is_infinity(b) && float16_is_zero(a))) {
        return float16_two;
    }
    return float16_muladd(a, b, float16_two, 0, fpst);
}

float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float32_squash_input_denormal(a, fpst);
    b = float32_squash_input_denormal(b, fpst);

    a = float32_chs(a);
    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
        (float32_is_infinity(b) && float32_is_zero(a))) {
        return float32_two;
    }
    return float32_muladd(a, b, float32_two, 0, fpst);
}

float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float64_squash_input_denormal(a, fpst);
    b = float64_squash_input_denormal(b, fpst);

    a = float64_chs(a);
    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
        (float64_is_infinity(b) && float64_is_zero(a))) {
        return float64_two;
    }
    return float64_muladd(a, b, float64_two, 0, fpst);
}

uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float16_squash_input_denormal(a, fpst);
    b = float16_squash_input_denormal(b, fpst);

    a = float16_chs(a);
    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
        (float16_is_infinity(b) && float16_is_zero(a))) {
        return float16_one_point_five;
    }
    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
}

float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float32_squash_input_denormal(a, fpst);
    b = float32_squash_input_denormal(b, fpst);

    a = float32_chs(a);
    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
        (float32_is_infinity(b) && float32_is_zero(a))) {
        return float32_one_point_five;
    }
    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
}

float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float64_squash_input_denormal(a, fpst);
    b = float64_squash_input_denormal(b, fpst);

    a = float64_chs(a);
    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
        (float64_is_infinity(b) && float64_is_zero(a))) {
        return float64_one_point_five;
    }
    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
}

/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL;
    uint64_t wsignmask = 0x8000800080008000ULL;
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

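    /*
     * The xor/or/sub/xor sequences below sign-extend each byte to a 16-bit
     * field in parallel: (x ^ 0x80) - 0x80 sign-extends a byte, and
     * temporarily forcing bit 15 of each field (removed again by the final
     * xor) ensures the per-field subtraction cannot borrow across fields.
     */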
    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..30, etc,
     * and then adjusting the sign bits 15, 31, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}

uint64_t HELPER(neon_addlp_u8)(uint64_t a)
{
    uint64_t tmp;

    tmp = a & 0x00ff00ff00ff00ffULL;
    tmp += (a >> 8) & 0x00ff00ff00ff00ffULL;
    return tmp;
}

uint64_t HELPER(neon_addlp_s16)(uint64_t a)
{
    int32_t reslo, reshi;

    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);

    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
}

uint64_t HELPER(neon_addlp_u16)(uint64_t a)
{
    uint64_t tmp;

    tmp = a & 0x0000ffff0000ffffULL;
    tmp += (a >> 16) & 0x0000ffff0000ffffULL;
    return tmp;
}

/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
{
    float_status *fpst = fpstp;
    uint16_t val16, sbit;
    int16_t exp;

    if (float16_is_any_nan(a)) {
        float16 nan = a;
        if (float16_is_signaling_nan(a, fpst)) {
            float_raise(float_flag_invalid, fpst);
            nan = float16_silence_nan(a, fpst);
        }
        if (fpst->default_nan_mode) {
            nan = float16_default_nan(fpst);
        }
        return nan;
    }

    a = float16_squash_input_denormal(a, fpst);

    val16 = float16_val(a);
    sbit = 0x8000 & val16;
    exp = extract32(val16, 10, 5);

    if (exp == 0) {
        return make_float16(deposit32(sbit, 10, 5, 0x1e));
    } else {
        return make_float16(deposit32(sbit, 10, 5, ~exp));
    }
}

float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
{
    float_status *fpst = fpstp;
    uint32_t val32, sbit;
    int32_t exp;

    if (float32_is_any_nan(a)) {
        float32 nan = a;
        if (float32_is_signaling_nan(a, fpst)) {
            float_raise(float_flag_invalid, fpst);
            nan = float32_silence_nan(a, fpst);
        }
        if (fpst->default_nan_mode) {
            nan = float32_default_nan(fpst);
        }
        return nan;
    }

    a = float32_squash_input_denormal(a, fpst);

    val32 = float32_val(a);
    sbit = 0x80000000ULL & val32;
    exp = extract32(val32, 23, 8);

    if (exp == 0) {
        return make_float32(sbit | (0xfe << 23));
    } else {
        return make_float32(sbit | (~exp & 0xff) << 23);
    }
}

float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
{
    float_status *fpst = fpstp;
    uint64_t val64, sbit;
    int64_t exp;

    if (float64_is_any_nan(a)) {
        float64 nan = a;
        if (float64_is_signaling_nan(a, fpst)) {
            float_raise(float_flag_invalid, fpst);
            nan = float64_silence_nan(a, fpst);
        }
        if (fpst->default_nan_mode) {
            nan = float64_default_nan(fpst);
        }
        return nan;
    }

    a = float64_squash_input_denormal(a, fpst);

    val64 = float64_val(a);
    sbit = 0x8000000000000000ULL & val64;
    exp = extract64(float64_val(a), 52, 11);

    if (exp == 0) {
        return make_float64(sbit | (0x7feULL << 52));
    } else {
        return make_float64(sbit | (~exp & 0x7ffULL) << 52);
    }
}

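/*
 * Round-to-odd ("Von Neumann rounding", as used by FCVTXN) avoids
 * double-rounding error if the result is subsequently rounded again
 * to a narrower format.
 */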
float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
{
    /* Von Neumann rounding is implemented by using round-to-zero
     * and then setting the LSB of the result if Inexact was raised.
     */
    float32 r;
    float_status *fpst = &env->vfp.fp_status;
    float_status tstat = *fpst;
    int exflags;

    set_float_rounding_mode(float_round_to_zero, &tstat);
    set_float_exception_flags(0, &tstat);
    r = float64_to_float32(a, &tstat);
    exflags = get_float_exception_flags(&tstat);
    if (exflags & float_flag_inexact) {
        r = make_float32(float32_val(r) | 1);
    }
    exflags |= get_float_exception_flags(fpst);
    set_float_exception_flags(exflags, fpst);
    return r;
}

/* 64-bit versions of the CRC helpers. Note that although the operation
 * (and the prototypes of crc32c() and crc32()) means that only the bottom
 * 32 bits of the accumulator and result are used, we pass and return
 * uint64_t for convenience of the generated code. Unlike the 32-bit
 * instruction set versions, val may genuinely have 64 bits of data in it.
 * The upper bytes of val (above the number specified by 'bytes') must have
 * been zeroed out by the caller.
 */
uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
{
    uint8_t buf[8];

    stq_le_p(buf, val);

    /* zlib crc32 converts the accumulator and output to one's complement.  */
    return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
}

uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
{
    uint8_t buf[8];

    stq_le_p(buf, val);

    /* Linux crc32c converts the output to one's complement.  */
    return crc32c(acc, buf, bytes) ^ 0xffffffff;
}

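/*
 * Store-exclusive-pair helpers: compare the 128-bit value recorded by the
 * load-exclusive ({exclusive_val, exclusive_high}) against memory and store
 * the new pair only if it still matches.  The return value is 0 on success
 * and 1 on failure, matching the STXP status result.
 */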
uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
                                     uint64_t new_lo, uint64_t new_hi)
{
    Int128 cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
    Int128 newv = int128_make128(new_lo, new_hi);
    Int128 oldv;
    uintptr_t ra = GETPC();
    uint64_t o0, o1;
    bool success;

#ifdef CONFIG_USER_ONLY
    /* ??? Enforce alignment.  */
    uint64_t *haddr = g2h(env_cpu(env), addr);

    set_helper_retaddr(ra);
    o0 = ldq_le_p(haddr + 0);
    o1 = ldq_le_p(haddr + 1);
    oldv = int128_make128(o0, o1);

    success = int128_eq(oldv, cmpv);
    if (success) {
        stq_le_p(haddr + 0, int128_getlo(newv));
        stq_le_p(haddr + 1, int128_gethi(newv));
    }
    clear_helper_retaddr();
#else
    int mem_idx = cpu_mmu_index(env, false);
    TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
    TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);

    o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
    o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
    oldv = int128_make128(o0, o1);

    success = int128_eq(oldv, cmpv);
    if (success) {
        helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
        helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
    }
#endif

    return !success;
}

uint64_t HELPER(paired_cmpxchg64_le_parallel)(CPUARMState *env, uint64_t addr,
                                              uint64_t new_lo, uint64_t new_hi)
{
    Int128 oldv, cmpv, newv;
    uintptr_t ra = GETPC();
    bool success;
    int mem_idx;
    TCGMemOpIdx oi;

    assert(HAVE_CMPXCHG128);

    mem_idx = cpu_mmu_index(env, false);
    oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);

    cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
    newv = int128_make128(new_lo, new_hi);
    oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);

    success = int128_eq(oldv, cmpv);
    return !success;
}

uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
                                     uint64_t new_lo, uint64_t new_hi)
{
    /*
     * High and low need to be switched here because this is not actually a
     * 128bit store but two doublewords stored consecutively
     */
    Int128 cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
    Int128 newv = int128_make128(new_hi, new_lo);
    Int128 oldv;
    uintptr_t ra = GETPC();
    uint64_t o0, o1;
    bool success;

#ifdef CONFIG_USER_ONLY
    /* ??? Enforce alignment.  */
    uint64_t *haddr = g2h(env_cpu(env), addr);

    set_helper_retaddr(ra);
    o1 = ldq_be_p(haddr + 0);
    o0 = ldq_be_p(haddr + 1);
    oldv = int128_make128(o0, o1);

    success = int128_eq(oldv, cmpv);
    if (success) {
        stq_be_p(haddr + 0, int128_gethi(newv));
        stq_be_p(haddr + 1, int128_getlo(newv));
    }
    clear_helper_retaddr();
#else
    int mem_idx = cpu_mmu_index(env, false);
    TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
    TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);

    o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
    o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
    oldv = int128_make128(o0, o1);

    success = int128_eq(oldv, cmpv);
    if (success) {
        helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
        helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
    }
#endif

    return !success;
}

uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
                                              uint64_t new_lo, uint64_t new_hi)
{
    Int128 oldv, cmpv, newv;
    uintptr_t ra = GETPC();
    bool success;
    int mem_idx;
    TCGMemOpIdx oi;

    assert(HAVE_CMPXCHG128);

    mem_idx = cpu_mmu_index(env, false);
    oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);

    /*
     * High and low need to be switched here because this is not actually a
     * 128bit store but two doublewords stored consecutively
     */
    cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
    newv = int128_make128(new_hi, new_lo);
    oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);

    success = int128_eq(oldv, cmpv);
    return !success;
}

/* CASP: compare-and-swap a pair of registers; writes back the old data into Rs, Rs + 1.  */
void HELPER(casp_le_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
                              uint64_t new_lo, uint64_t new_hi)
{
    Int128 oldv, cmpv, newv;
    uintptr_t ra = GETPC();
    int mem_idx;
    TCGMemOpIdx oi;

    assert(HAVE_CMPXCHG128);

    mem_idx = cpu_mmu_index(env, false);
    oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);

    cmpv = int128_make128(env->xregs[rs], env->xregs[rs + 1]);
    newv = int128_make128(new_lo, new_hi);
    oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);

    env->xregs[rs] = int128_getlo(oldv);
    env->xregs[rs + 1] = int128_gethi(oldv);
}

void HELPER(casp_be_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
                              uint64_t new_hi, uint64_t new_lo)
{
    Int128 oldv, cmpv, newv;
    uintptr_t ra = GETPC();
    int mem_idx;
    TCGMemOpIdx oi;

    assert(HAVE_CMPXCHG128);

    mem_idx = cpu_mmu_index(env, false);
    oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);

    cmpv = int128_make128(env->xregs[rs + 1], env->xregs[rs]);
    newv = int128_make128(new_lo, new_hi);
    oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);

    env->xregs[rs + 1] = int128_getlo(oldv);
    env->xregs[rs] = int128_gethi(oldv);
}

/*
 * AdvSIMD half-precision
 */

#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))

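/*
 * Expand a scalar half-precision two-operand helper that simply wraps
 * the corresponding softfloat float16_* primitive.
 */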
#define ADVSIMD_HALFOP(name) \
uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \
{ \
    float_status *fpst = fpstp; \
    return float16_ ## name(a, b, fpst);    \
}

ADVSIMD_HALFOP(add)
ADVSIMD_HALFOP(sub)
ADVSIMD_HALFOP(mul)
ADVSIMD_HALFOP(div)
ADVSIMD_HALFOP(min)
ADVSIMD_HALFOP(max)
ADVSIMD_HALFOP(minnum)
ADVSIMD_HALFOP(maxnum)

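/*
 * As above, but each 32-bit operand carries two packed half-precision
 * values; the operation is applied to each half independently.
 */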
#define ADVSIMD_TWOHALFOP(name)                                         \
uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
{ \
    float16  a1, a2, b1, b2;                        \
    uint32_t r1, r2;                                \
    float_status *fpst = fpstp;                     \
    a1 = extract32(two_a, 0, 16);                   \
    a2 = extract32(two_a, 16, 16);                  \
    b1 = extract32(two_b, 0, 16);                   \
    b2 = extract32(two_b, 16, 16);                  \
    r1 = float16_ ## name(a1, b1, fpst);            \
    r2 = float16_ ## name(a2, b2, fpst);            \
    return deposit32(r1, 16, 16, r2);               \
}

ADVSIMD_TWOHALFOP(add)
ADVSIMD_TWOHALFOP(sub)
ADVSIMD_TWOHALFOP(mul)
ADVSIMD_TWOHALFOP(div)
ADVSIMD_TWOHALFOP(min)
ADVSIMD_TWOHALFOP(max)
ADVSIMD_TWOHALFOP(minnum)
ADVSIMD_TWOHALFOP(maxnum)

/* Data processing - scalar floating-point and advanced SIMD */
static float16 float16_mulx(float16 a, float16 b, void *fpstp)
{
    float_status *fpst = fpstp;

    a = float16_squash_input_denormal(a, fpst);
    b = float16_squash_input_denormal(b, fpst);

    if ((float16_is_zero(a) && float16_is_infinity(b)) ||
        (float16_is_infinity(a) && float16_is_zero(b))) {
        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
        return make_float16((1U << 14) |
                            ((float16_val(a) ^ float16_val(b)) & (1U << 15)));
    }
    return float16_mul(a, b, fpst);
}

ADVSIMD_HALFOP(mulx)
ADVSIMD_TWOHALFOP(mulx)

/* fused multiply-accumulate */
uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c,
                                 void *fpstp)
{
    float_status *fpst = fpstp;
    return float16_muladd(a, b, c, 0, fpst);
}

uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
                                  uint32_t two_c, void *fpstp)
{
    float_status *fpst = fpstp;
    float16  a1, a2, b1, b2, c1, c2;
    uint32_t r1, r2;
    a1 = extract32(two_a, 0, 16);
    a2 = extract32(two_a, 16, 16);
    b1 = extract32(two_b, 0, 16);
    b2 = extract32(two_b, 16, 16);
    c1 = extract32(two_c, 0, 16);
    c2 = extract32(two_c, 16, 16);
    r1 = float16_muladd(a1, b1, c1, 0, fpst);
    r2 = float16_muladd(a2, b2, c2, 0, fpst);
    return deposit32(r1, 16, 16, r2);
}

/*
 * Floating point comparisons produce an integer result. Softfloat
 * routines return float_relation types which we convert to the 0/-1
 * Neon requires.
 */

#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0

uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    int compare = float16_compare_quiet(a, b, fpst);
    return ADVSIMD_CMPRES(compare == float_relation_equal);
}

uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    int compare = float16_compare(a, b, fpst);
    return ADVSIMD_CMPRES(compare == float_relation_greater ||
                          compare == float_relation_equal);
}

uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    int compare = float16_compare(a, b, fpst);
    return ADVSIMD_CMPRES(compare == float_relation_greater);
}

uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float16 f0 = float16_abs(a);
    float16 f1 = float16_abs(b);
    int compare = float16_compare(f0, f1, fpst);
    return ADVSIMD_CMPRES(compare == float_relation_greater ||
                          compare == float_relation_equal);
}

uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float16 f0 = float16_abs(a);
    float16 f1 = float16_abs(b);
    int compare = float16_compare(f0, f1, fpst);
    return ADVSIMD_CMPRES(compare == float_relation_greater);
}

/* round to integral */
uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status)
{
    return float16_round_to_int(x, fp_status);
}

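/*
 * As above, but suppress any Inexact exception that the rounding itself
 * raises (the behaviour of the non-exact FRINT* instructions, as opposed
 * to FRINTX).
 */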
uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status)
{
    int old_flags = get_float_exception_flags(fp_status), new_flags;
    float16 ret;

    ret = float16_round_to_int(x, fp_status);

    /* Suppress any inexact exceptions the conversion produced */
    if (!(old_flags & float_flag_inexact)) {
        new_flags = get_float_exception_flags(fp_status);
        set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
    }

    return ret;
}

/*
 * Half-precision floating point conversion functions
 *
 * There are a multitude of conversion functions with various
 * different rounding modes. This is dealt with by the calling code
 * setting the mode appropriately before calling the helper.
 */

uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp)
{
    float_status *fpst = fpstp;

    /* Invalid if we are passed a NaN */
    if (float16_is_any_nan(a)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16(a, fpst);
}

uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp)
{
    float_status *fpst = fpstp;

    /* Invalid if we are passed a NaN */
    if (float16_is_any_nan(a)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16(a, fpst);
}

static int el_from_spsr(uint32_t spsr)
{
    /* Return the exception level that this SPSR is requesting a return to,
     * or -1 if it is invalid (an illegal return)
     */
    if (spsr & PSTATE_nRW) {
        switch (spsr & CPSR_M) {
        case ARM_CPU_MODE_USR:
            return 0;
        case ARM_CPU_MODE_HYP:
            return 2;
        case ARM_CPU_MODE_FIQ:
        case ARM_CPU_MODE_IRQ:
        case ARM_CPU_MODE_SVC:
        case ARM_CPU_MODE_ABT:
        case ARM_CPU_MODE_UND:
        case ARM_CPU_MODE_SYS:
            return 1;
        case ARM_CPU_MODE_MON:
            /* Returning to Mon from AArch64 is never possible,
             * so this is an illegal return.
             */
        default:
            return -1;
        }
    } else {
        if (extract32(spsr, 1, 1)) {
            /* Return with reserved M[1] bit set */
            return -1;
        }
        if (extract32(spsr, 0, 4) == 1) {
            /* return to EL0 with M[0] bit set */
            return -1;
        }
        return extract32(spsr, 2, 2);
    }
}

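/*
 * Write an SPSR_ELx value into CPSR/PSTATE for an exception return to
 * AArch32: preserve PSTATE.SS, and move the DIT bit from its AArch64
 * PSTATE position to its AArch32 CPSR position.
 */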
static void cpsr_write_from_spsr_elx(CPUARMState *env,
                                     uint32_t val)
{
    uint32_t mask;

    /* Save SPSR_ELx.SS into PSTATE. */
    env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
    val &= ~PSTATE_SS;

    /* Move DIT to the correct location for CPSR */
    if (val & PSTATE_DIT) {
        val &= ~PSTATE_DIT;
        val |= CPSR_DIT;
    }

    mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar);
    cpsr_write(env, val, mask, CPSRWriteRaw);
}

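/* Handle ERET: perform an exception return from AArch64. */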
void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
{
    int cur_el = arm_current_el(env);
    unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
    uint32_t spsr = env->banked_spsr[spsr_idx];
    int new_el;
    bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;

    aarch64_save_sp(env, cur_el);

    arm_clear_exclusive(env);

    /* We must squash the PSTATE.SS bit to zero unless both of the
     * following hold:
     *  1. debug exceptions are currently disabled
     *  2. singlestep will be active in the EL we return to
     * We check 1 here and 2 after we've done the pstate/cpsr write() to
     * transition to the EL we're going to.
     */
    if (arm_generate_debug_exceptions(env)) {
        spsr &= ~PSTATE_SS;
    }

    new_el = el_from_spsr(spsr);
    if (new_el == -1) {
        goto illegal_return;
    }
    if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) {
        /* Disallow return to an EL which is unimplemented or higher
         * than the current one.
         */
        goto illegal_return;
    }

    if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) {
        /* Return to an EL which is configured for a different register width */
        goto illegal_return;
    }

    if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
        goto illegal_return;
    }

    qemu_mutex_lock_iothread();
    arm_call_pre_el_change_hook(env_archcpu(env));
    qemu_mutex_unlock_iothread();

    if (!return_to_aa64) {
        env->aarch64 = 0;
        /* We do a raw CPSR write because aarch64_sync_64_to_32()
         * will sort the register banks out for us, and we've already
         * caught all the bad-mode cases in el_from_spsr().
         */
        cpsr_write_from_spsr_elx(env, spsr);
        if (!arm_singlestep_active(env)) {
            env->pstate &= ~PSTATE_SS;
        }
        aarch64_sync_64_to_32(env);

        if (spsr & CPSR_T) {
            env->regs[15] = new_pc & ~0x1;
        } else {
            env->regs[15] = new_pc & ~0x3;
        }
        helper_rebuild_hflags_a32(env, new_el);
        qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
                      "AArch32 EL%d PC 0x%" PRIx32 "\n",
                      cur_el, new_el, env->regs[15]);
    } else {
        int tbii;

        env->aarch64 = 1;
        spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar);
        pstate_write(env, spsr);
        if (!arm_singlestep_active(env)) {
            env->pstate &= ~PSTATE_SS;
        }
        aarch64_restore_sp(env, new_el);
        helper_rebuild_hflags_a64(env, new_el);

        /*
         * Apply TBI to the exception return address.  We had to delay this
         * until after we selected the new EL, so that we could select the
         * correct TBI+TBID bits.  This is made easier by waiting until after
         * the hflags rebuild, since we can pull the composite TBII field
         * from there.
         */
        tbii = FIELD_EX32(env->hflags, TBFLAG_A64, TBII);
        if ((tbii >> extract64(new_pc, 55, 1)) & 1) {
            /* TBI is enabled. */
            int core_mmu_idx = cpu_mmu_index(env, false);
            if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) {
                new_pc = sextract64(new_pc, 0, 56);
            } else {
                new_pc = extract64(new_pc, 0, 56);
            }
        }
        env->pc = new_pc;

        qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
                      "AArch64 EL%d PC 0x%" PRIx64 "\n",
                      cur_el, new_el, env->pc);
    }

    /*
     * Note that cur_el can never be 0.  If new_el is 0, then
     * el0_a64 is return_to_aa64, else el0_a64 is ignored.
     */
    aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64);

    qemu_mutex_lock_iothread();
    arm_call_el_change_hook(env_archcpu(env));
    qemu_mutex_unlock_iothread();

    return;

illegal_return:
    /* Illegal return events of various kinds have architecturally
     * mandated behaviour:
     * restore NZCV and DAIF from SPSR_ELx
     * set PSTATE.IL
     * restore PC from ELR_ELx
     * no change to exception level, execution state or stack pointer
     */
    env->pstate |= PSTATE_IL;
    env->pc = new_pc;
    spsr &= PSTATE_NZCV | PSTATE_DAIF;
    spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF);
    pstate_write(env, spsr);
    if (!arm_singlestep_active(env)) {
        env->pstate &= ~PSTATE_SS;
    }
    qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: "
                  "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc);
}

/*
 * Square Root and Reciprocal square root
 */

uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp)
{
    float_status *s = fpstp;

    return float16_sqrt(a, s);
}

void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
{
    /*
     * Implement DC ZVA, which zeroes a fixed-length block of memory.
     * Note that we do not implement the (architecturally mandated)
     * alignment fault for attempts to use this on Device memory
     * (which matches the usual QEMU behaviour of not implementing either
     * alignment faults or any memory attribute handling).
     */
    int blocklen = 4 << env_archcpu(env)->dcz_blocksize;
    uint64_t vaddr = vaddr_in & ~(blocklen - 1);
    int mmu_idx = cpu_mmu_index(env, false);
    void *mem;

    /*
     * Trapless lookup.  In addition to actual invalid page, may
     * return NULL for I/O, watchpoints, clean pages, etc.
     */
    mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx);

#ifndef CONFIG_USER_ONLY
    if (unlikely(!mem)) {
        uintptr_t ra = GETPC();

        /*
         * Trap if accessing an invalid page.  DC_ZVA requires that we supply
         * the original pointer for an invalid page.  But watchpoints require
         * that we probe the actual space.  So do both.
         */
        (void) probe_write(env, vaddr_in, 1, mmu_idx, ra);
        mem = probe_write(env, vaddr, blocklen, mmu_idx, ra);

        if (unlikely(!mem)) {
            /*
             * The only remaining reason for mem == NULL is I/O.
             * Just do a series of byte writes as the architecture demands.
             */
            for (int i = 0; i < blocklen; i++) {
                cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra);
            }
            return;
        }
    }
#endif

    memset(mem, 0, blocklen);
}