LXR qemu/target/arm/helper-a64.c

   1/*
   2 *  AArch64 specific helpers
   3 *
   4 *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "qemu/units.h"
  22#include "cpu.h"
  23#include "exec/gdbstub.h"
  24#include "exec/helper-proto.h"
  25#include "qemu/host-utils.h"
  26#include "qemu/log.h"
  27#include "qemu/main-loop.h"
  28#include "qemu/bitops.h"
  29#include "internals.h"
  30#include "qemu/crc32c.h"
  31#include "exec/exec-all.h"
  32#include "exec/cpu_ldst.h"
  33#include "qemu/int128.h"
  34#include "qemu/atomic128.h"
  35#include "fpu/softfloat.h"
  36#include <zlib.h> /* For crc32 */
  37
  38/* C2.4.7 Multiply and divide */
  39/* special cases for 0 and LLONG_MIN are mandated by the standard */
  40uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
  41{
  42    if (den == 0) {
  43        return 0;
  44    }
  45    return num / den;
  46}
  47
  48int64_t HELPER(sdiv64)(int64_t num, int64_t den)
  49{
  50    if (den == 0) {
  51        return 0;
  52    }
  53    if (num == LLONG_MIN && den == -1) {
  54        return LLONG_MIN;
  55    }
  56    return num / den;
  57}
  58
  59uint64_t HELPER(rbit64)(uint64_t x)
  60{
  61    return revbit64(x);
  62}
  63
  64void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
  65{
  66    update_spsel(env, imm);
  67}
  68
  69static void daif_check(CPUARMState *env, uint32_t op,
  70                       uint32_t imm, uintptr_t ra)
  71{
  72    /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set.  */
  73    if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) {
  74        raise_exception_ra(env, EXCP_UDEF,
  75                           syn_aa64_sysregtrap(0, extract32(op, 0, 3),
  76                                               extract32(op, 3, 3), 4,
  77                                               imm, 0x1f, 0),
  78                           exception_target_el(env), ra);
  79    }
  80}
  81
  82void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm)
  83{
  84    daif_check(env, 0x1e, imm, GETPC());
  85    env->daif |= (imm << 6) & PSTATE_DAIF;
  86    arm_rebuild_hflags(env);
  87}
  88
  89void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm)
  90{
  91    daif_check(env, 0x1f, imm, GETPC());
  92    env->daif &= ~((imm << 6) & PSTATE_DAIF);
  93    arm_rebuild_hflags(env);
  94}
  95
  96/* Convert a softfloat float_relation_ (as returned by
  97 * the float*_compare functions) to the correct ARM
  98 * NZCV flag state.
  99 */
 100static inline uint32_t float_rel_to_flags(int res)
 101{
 102    uint64_t flags;
 103    switch (res) {
 104    case float_relation_equal:
 105        flags = PSTATE_Z | PSTATE_C;
 106        break;
 107    case float_relation_less:
 108        flags = PSTATE_N;
 109        break;
 110    case float_relation_greater:
 111        flags = PSTATE_C;
 112        break;
 113    case float_relation_unordered:
 114    default:
 115        flags = PSTATE_C | PSTATE_V;
 116        break;
 117    }
 118    return flags;
 119}
 120
 121uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status)
 122{
 123    return float_rel_to_flags(float16_compare_quiet(x, y, fp_status));
 124}
 125
 126uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status)
 127{
 128    return float_rel_to_flags(float16_compare(x, y, fp_status));
 129}
 130
 131uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status)
 132{
 133    return float_rel_to_flags(float32_compare_quiet(x, y, fp_status));
 134}
 135
 136uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status)
 137{
 138    return float_rel_to_flags(float32_compare(x, y, fp_status));
 139}
 140
 141uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status)
 142{
 143    return float_rel_to_flags(float64_compare_quiet(x, y, fp_status));
 144}
 145
 146uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
 147{
 148    return float_rel_to_flags(float64_compare(x, y, fp_status));
 149}
 150
 151float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
 152{
 153    float_status *fpst = fpstp;
 154
 155    a = float32_squash_input_denormal(a, fpst);
 156    b = float32_squash_input_denormal(b, fpst);
 157
 158    if ((float32_is_zero(a) && float32_is_infinity(b)) ||
 159        (float32_is_infinity(a) && float32_is_zero(b))) {
 160        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
 161        return make_float32((1U << 30) |
 162                            ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
 163    }
 164    return float32_mul(a, b, fpst);
 165}
 166
 167float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
 168{
 169    float_status *fpst = fpstp;
 170
 171    a = float64_squash_input_denormal(a, fpst);
 172    b = float64_squash_input_denormal(b, fpst);
 173
 174    if ((float64_is_zero(a) && float64_is_infinity(b)) ||
 175        (float64_is_infinity(a) && float64_is_zero(b))) {
 176        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
 177        return make_float64((1ULL << 62) |
 178                            ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
 179    }
 180    return float64_mul(a, b, fpst);
 181}
 182
 183/* 64bit/double versions of the neon float compare functions */
 184uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
 185{
 186    float_status *fpst = fpstp;
 187    return -float64_eq_quiet(a, b, fpst);
 188}
 189
 190uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
 191{
 192    float_status *fpst = fpstp;
 193    return -float64_le(b, a, fpst);
 194}
 195
 196uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
 197{
 198    float_status *fpst = fpstp;
 199    return -float64_lt(b, a, fpst);
 200}
 201
 202/* Reciprocal step and sqrt step. Note that unlike the A32/T32
 203 * versions, these do a fully fused multiply-add or
 204 * multiply-add-and-halve.
 205 */
 206
 207uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
 208{
 209    float_status *fpst = fpstp;
 210
 211    a = float16_squash_input_denormal(a, fpst);
 212    b = float16_squash_input_denormal(b, fpst);
 213
 214    a = float16_chs(a);
 215    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
 216        (float16_is_infinity(b) && float16_is_zero(a))) {
 217        return float16_two;
 218    }
 219    return float16_muladd(a, b, float16_two, 0, fpst);
 220}
 221
 222float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
 223{
 224    float_status *fpst = fpstp;
 225
 226    a = float32_squash_input_denormal(a, fpst);
 227    b = float32_squash_input_denormal(b, fpst);
 228
 229    a = float32_chs(a);
 230    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
 231        (float32_is_infinity(b) && float32_is_zero(a))) {
 232        return float32_two;
 233    }
 234    return float32_muladd(a, b, float32_two, 0, fpst);
 235}
 236
 237float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
 238{
 239    float_status *fpst = fpstp;
 240
 241    a = float64_squash_input_denormal(a, fpst);
 242    b = float64_squash_input_denormal(b, fpst);
 243
 244    a = float64_chs(a);
 245    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
 246        (float64_is_infinity(b) && float64_is_zero(a))) {
 247        return float64_two;
 248    }
 249    return float64_muladd(a, b, float64_two, 0, fpst);
 250}
 251
 252uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp)
 253{
 254    float_status *fpst = fpstp;
 255
 256    a = float16_squash_input_denormal(a, fpst);
 257    b = float16_squash_input_denormal(b, fpst);
 258
 259    a = float16_chs(a);
 260    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
 261        (float16_is_infinity(b) && float16_is_zero(a))) {
 262        return float16_one_point_five;
 263    }
 264    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
 265}
 266
 267float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
 268{
 269    float_status *fpst = fpstp;
 270
 271    a = float32_squash_input_denormal(a, fpst);
 272    b = float32_squash_input_denormal(b, fpst);
 273
 274    a = float32_chs(a);
 275    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
 276        (float32_is_infinity(b) && float32_is_zero(a))) {
 277        return float32_one_point_five;
 278    }
 279    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
 280}
 281
 282float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
 283{
 284    float_status *fpst = fpstp;
 285
 286    a = float64_squash_input_denormal(a, fpst);
 287    b = float64_squash_input_denormal(b, fpst);
 288
 289    a = float64_chs(a);
 290    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
 291        (float64_is_infinity(b) && float64_is_zero(a))) {
 292        return float64_one_point_five;
 293    }
 294    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
 295}
 296
 297/* Pairwise long add: add pairs of adjacent elements into
 298 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 299 */
 300uint64_t HELPER(neon_addlp_s8)(uint64_t a)
 301{
 302    uint64_t nsignmask = 0x0080008000800080ULL;
 303    uint64_t wsignmask = 0x8000800080008000ULL;
 304    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
 305    uint64_t tmp1, tmp2;
 306    uint64_t res, signres;
 307
 308    /* Extract odd elements, sign extend each to a 16 bit field */
 309    tmp1 = a & elementmask;
 310    tmp1 ^= nsignmask;
 311    tmp1 |= wsignmask;
 312    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
 313    /* Ditto for the even elements */
 314    tmp2 = (a >> 8) & elementmask;
 315    tmp2 ^= nsignmask;
 316    tmp2 |= wsignmask;
 317    tmp2 = (tmp2 - nsignmask) ^ wsignmask;
 318
 319    /* calculate the result by summing bits 0..14, 16..22, etc,
 320     * and then adjusting the sign bits 15, 23, etc manually.
 321     * This ensures the addition can't overflow the 16 bit field.
 322     */
 323    signres = (tmp1 ^ tmp2) & wsignmask;
 324    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
 325    res ^= signres;
 326
 327    return res;
 328}
 329
 330uint64_t HELPER(neon_addlp_u8)(uint64_t a)
 331{
 332    uint64_t tmp;
 333
 334    tmp = a & 0x00ff00ff00ff00ffULL;
 335    tmp += (a >> 8) & 0x00ff00ff00ff00ffULL;
 336    return tmp;
 337}
 338
 339uint64_t HELPER(neon_addlp_s16)(uint64_t a)
 340{
 341    int32_t reslo, reshi;
 342
 343    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
 344    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
 345
 346    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
 347}
 348
 349uint64_t HELPER(neon_addlp_u16)(uint64_t a)
 350{
 351    uint64_t tmp;
 352
 353    tmp = a & 0x0000ffff0000ffffULL;
 354    tmp += (a >> 16) & 0x0000ffff0000ffffULL;
 355    return tmp;
 356}
 357
 358/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
 359uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
 360{
 361    float_status *fpst = fpstp;
 362    uint16_t val16, sbit;
 363    int16_t exp;
 364
 365    if (float16_is_any_nan(a)) {
 366        float16 nan = a;
 367        if (float16_is_signaling_nan(a, fpst)) {
 368            float_raise(float_flag_invalid, fpst);
 369            if (!fpst->default_nan_mode) {
 370                nan = float16_silence_nan(a, fpst);
 371            }
 372        }
 373        if (fpst->default_nan_mode) {
 374            nan = float16_default_nan(fpst);
 375        }
 376        return nan;
 377    }
 378
 379    a = float16_squash_input_denormal(a, fpst);
 380
 381    val16 = float16_val(a);
 382    sbit = 0x8000 & val16;
 383    exp = extract32(val16, 10, 5);
 384
 385    if (exp == 0) {
 386        return make_float16(deposit32(sbit, 10, 5, 0x1e));
 387    } else {
 388        return make_float16(deposit32(sbit, 10, 5, ~exp));
 389    }
 390}
 391
 392float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
 393{
 394    float_status *fpst = fpstp;
 395    uint32_t val32, sbit;
 396    int32_t exp;
 397
 398    if (float32_is_any_nan(a)) {
 399        float32 nan = a;
 400        if (float32_is_signaling_nan(a, fpst)) {
 401            float_raise(float_flag_invalid, fpst);
 402            if (!fpst->default_nan_mode) {
 403                nan = float32_silence_nan(a, fpst);
 404            }
 405        }
 406        if (fpst->default_nan_mode) {
 407            nan = float32_default_nan(fpst);
 408        }
 409        return nan;
 410    }
 411
 412    a = float32_squash_input_denormal(a, fpst);
 413
 414    val32 = float32_val(a);
 415    sbit = 0x80000000ULL & val32;
 416    exp = extract32(val32, 23, 8);
 417
 418    if (exp == 0) {
 419        return make_float32(sbit | (0xfe << 23));
 420    } else {
 421        return make_float32(sbit | (~exp & 0xff) << 23);
 422    }
 423}
 424
 425float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
 426{
 427    float_status *fpst = fpstp;
 428    uint64_t val64, sbit;
 429    int64_t exp;
 430
 431    if (float64_is_any_nan(a)) {
 432        float64 nan = a;
 433        if (float64_is_signaling_nan(a, fpst)) {
 434            float_raise(float_flag_invalid, fpst);
 435            if (!fpst->default_nan_mode) {
 436                nan = float64_silence_nan(a, fpst);
 437            }
 438        }
 439        if (fpst->default_nan_mode) {
 440            nan = float64_default_nan(fpst);
 441        }
 442        return nan;
 443    }
 444
 445    a = float64_squash_input_denormal(a, fpst);
 446
 447    val64 = float64_val(a);
 448    sbit = 0x8000000000000000ULL & val64;
 449    exp = extract64(float64_val(a), 52, 11);
 450
 451    if (exp == 0) {
 452        return make_float64(sbit | (0x7feULL << 52));
 453    } else {
 454        return make_float64(sbit | (~exp & 0x7ffULL) << 52);
 455    }
 456}
 457
 458float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
 459{
 460    /* Von Neumann rounding is implemented by using round-to-zero
 461     * and then setting the LSB of the result if Inexact was raised.
 462     */
 463    float32 r;
 464    float_status *fpst = &env->vfp.fp_status;
 465    float_status tstat = *fpst;
 466    int exflags;
 467
 468    set_float_rounding_mode(float_round_to_zero, &tstat);
 469    set_float_exception_flags(0, &tstat);
 470    r = float64_to_float32(a, &tstat);
 471    exflags = get_float_exception_flags(&tstat);
 472    if (exflags & float_flag_inexact) {
 473        r = make_float32(float32_val(r) | 1);
 474    }
 475    exflags |= get_float_exception_flags(fpst);
 476    set_float_exception_flags(exflags, fpst);
 477    return r;
 478}
 479
 480/* 64-bit versions of the CRC helpers. Note that although the operation
 481 * (and the prototypes of crc32c() and crc32() mean that only the bottom
 482 * 32 bits of the accumulator and result are used, we pass and return
 483 * uint64_t for convenience of the generated code. Unlike the 32-bit
 484 * instruction set versions, val may genuinely have 64 bits of data in it.
 485 * The upper bytes of val (above the number specified by 'bytes') must have
 486 * been zeroed out by the caller.
 487 */
 488uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
 489{
 490    uint8_t buf[8];
 491
 492    stq_le_p(buf, val);
 493
 494    /* zlib crc32 converts the accumulator and output to one's complement.  */
 495    return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
 496}
 497
 498uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
 499{
 500    uint8_t buf[8];
 501
 502    stq_le_p(buf, val);
 503
 504    /* Linux crc32c converts the output to one's complement.  */
 505    return crc32c(acc, buf, bytes) ^ 0xffffffff;
 506}
 507
 508uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
 509                                     uint64_t new_lo, uint64_t new_hi)
 510{
 511    Int128 cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
 512    Int128 newv = int128_make128(new_lo, new_hi);
 513    Int128 oldv;
 514    uintptr_t ra = GETPC();
 515    uint64_t o0, o1;
 516    bool success;
 517    int mem_idx = cpu_mmu_index(env, false);
 518    MemOpIdx oi0 = make_memop_idx(MO_LEUQ | MO_ALIGN_16, mem_idx);
 519    MemOpIdx oi1 = make_memop_idx(MO_LEUQ, mem_idx);
 520
 521    o0 = cpu_ldq_le_mmu(env, addr + 0, oi0, ra);
 522    o1 = cpu_ldq_le_mmu(env, addr + 8, oi1, ra);
 523    oldv = int128_make128(o0, o1);
 524
 525    success = int128_eq(oldv, cmpv);
 526    if (success) {
 527        cpu_stq_le_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
 528        cpu_stq_le_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
 529    }
 530
 531    return !success;
 532}
 533
 534uint64_t HELPER(paired_cmpxchg64_le_parallel)(CPUARMState *env, uint64_t addr,
 535                                              uint64_t new_lo, uint64_t new_hi)
 536{
 537    Int128 oldv, cmpv, newv;
 538    uintptr_t ra = GETPC();
 539    bool success;
 540    int mem_idx;
 541    MemOpIdx oi;
 542
 543    assert(HAVE_CMPXCHG128);
 544
 545    mem_idx = cpu_mmu_index(env, false);
 546    oi = make_memop_idx(MO_LE | MO_128 | MO_ALIGN, mem_idx);
 547
 548    cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
 549    newv = int128_make128(new_lo, new_hi);
 550    oldv = cpu_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
 551
 552    success = int128_eq(oldv, cmpv);
 553    return !success;
 554}
 555
 556uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
 557                                     uint64_t new_lo, uint64_t new_hi)
 558{
 559    /*
 560     * High and low need to be switched here because this is not actually a
 561     * 128bit store but two doublewords stored consecutively
 562     */
 563    Int128 cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
 564    Int128 newv = int128_make128(new_hi, new_lo);
 565    Int128 oldv;
 566    uintptr_t ra = GETPC();
 567    uint64_t o0, o1;
 568    bool success;
 569    int mem_idx = cpu_mmu_index(env, false);
 570    MemOpIdx oi0 = make_memop_idx(MO_BEUQ | MO_ALIGN_16, mem_idx);
 571    MemOpIdx oi1 = make_memop_idx(MO_BEUQ, mem_idx);
 572
 573    o1 = cpu_ldq_be_mmu(env, addr + 0, oi0, ra);
 574    o0 = cpu_ldq_be_mmu(env, addr + 8, oi1, ra);
 575    oldv = int128_make128(o0, o1);
 576
 577    success = int128_eq(oldv, cmpv);
 578    if (success) {
 579        cpu_stq_be_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
 580        cpu_stq_be_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
 581    }
 582
 583    return !success;
 584}
 585
 586uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
 587                                              uint64_t new_lo, uint64_t new_hi)
 588{
 589    Int128 oldv, cmpv, newv;
 590    uintptr_t ra = GETPC();
 591    bool success;
 592    int mem_idx;
 593    MemOpIdx oi;
 594
 595    assert(HAVE_CMPXCHG128);
 596
 597    mem_idx = cpu_mmu_index(env, false);
 598    oi = make_memop_idx(MO_BE | MO_128 | MO_ALIGN, mem_idx);
 599
 600    /*
 601     * High and low need to be switched here because this is not actually a
 602     * 128bit store but two doublewords stored consecutively
 603     */
 604    cmpv = int128_make128(env->exclusive_high, env->exclusive_val);
 605    newv = int128_make128(new_hi, new_lo);
 606    oldv = cpu_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
 607
 608    success = int128_eq(oldv, cmpv);
 609    return !success;
 610}
 611
 612/* Writes back the old data into Rs.  */
 613void HELPER(casp_le_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
 614                              uint64_t new_lo, uint64_t new_hi)
 615{
 616    Int128 oldv, cmpv, newv;
 617    uintptr_t ra = GETPC();
 618    int mem_idx;
 619    MemOpIdx oi;
 620
 621    assert(HAVE_CMPXCHG128);
 622
 623    mem_idx = cpu_mmu_index(env, false);
 624    oi = make_memop_idx(MO_LE | MO_128 | MO_ALIGN, mem_idx);
 625
 626    cmpv = int128_make128(env->xregs[rs], env->xregs[rs + 1]);
 627    newv = int128_make128(new_lo, new_hi);
 628    oldv = cpu_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
 629
 630    env->xregs[rs] = int128_getlo(oldv);
 631    env->xregs[rs + 1] = int128_gethi(oldv);
 632}
 633
 634void HELPER(casp_be_parallel)(CPUARMState *env, uint32_t rs, uint64_t addr,
 635                              uint64_t new_hi, uint64_t new_lo)
 636{
 637    Int128 oldv, cmpv, newv;
 638    uintptr_t ra = GETPC();
 639    int mem_idx;
 640    MemOpIdx oi;
 641
 642    assert(HAVE_CMPXCHG128);
 643
 644    mem_idx = cpu_mmu_index(env, false);
 645    oi = make_memop_idx(MO_LE | MO_128 | MO_ALIGN, mem_idx);
 646
 647    cmpv = int128_make128(env->xregs[rs + 1], env->xregs[rs]);
 648    newv = int128_make128(new_lo, new_hi);
 649    oldv = cpu_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
 650
 651    env->xregs[rs + 1] = int128_getlo(oldv);
 652    env->xregs[rs] = int128_gethi(oldv);
 653}
 654
 655/*
 656 * AdvSIMD half-precision
 657 */
 658
 659#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
 660
 661#define ADVSIMD_HALFOP(name) \
 662uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \
 663{ \
 664    float_status *fpst = fpstp; \
 665    return float16_ ## name(a, b, fpst);    \
 666}
 667
 668ADVSIMD_HALFOP(add)
 669ADVSIMD_HALFOP(sub)
 670ADVSIMD_HALFOP(mul)
 671ADVSIMD_HALFOP(div)
 672ADVSIMD_HALFOP(min)
 673ADVSIMD_HALFOP(max)
 674ADVSIMD_HALFOP(minnum)
 675ADVSIMD_HALFOP(maxnum)
 676
 677#define ADVSIMD_TWOHALFOP(name)                                         \
 678uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
 679{ \
 680    float16  a1, a2, b1, b2;                        \
 681    uint32_t r1, r2;                                \
 682    float_status *fpst = fpstp;                     \
 683    a1 = extract32(two_a, 0, 16);                   \
 684    a2 = extract32(two_a, 16, 16);                  \
 685    b1 = extract32(two_b, 0, 16);                   \
 686    b2 = extract32(two_b, 16, 16);                  \
 687    r1 = float16_ ## name(a1, b1, fpst);            \
 688    r2 = float16_ ## name(a2, b2, fpst);            \
 689    return deposit32(r1, 16, 16, r2);               \
 690}
 691
 692ADVSIMD_TWOHALFOP(add)
 693ADVSIMD_TWOHALFOP(sub)
 694ADVSIMD_TWOHALFOP(mul)
 695ADVSIMD_TWOHALFOP(div)
 696ADVSIMD_TWOHALFOP(min)
 697ADVSIMD_TWOHALFOP(max)
 698ADVSIMD_TWOHALFOP(minnum)
 699ADVSIMD_TWOHALFOP(maxnum)
 700
 701/* Data processing - scalar floating-point and advanced SIMD */
 702static float16 float16_mulx(float16 a, float16 b, void *fpstp)
 703{
 704    float_status *fpst = fpstp;
 705
 706    a = float16_squash_input_denormal(a, fpst);
 707    b = float16_squash_input_denormal(b, fpst);
 708
 709    if ((float16_is_zero(a) && float16_is_infinity(b)) ||
 710        (float16_is_infinity(a) && float16_is_zero(b))) {
 711        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
 712        return make_float16((1U << 14) |
 713                            ((float16_val(a) ^ float16_val(b)) & (1U << 15)));
 714    }
 715    return float16_mul(a, b, fpst);
 716}
 717
 718ADVSIMD_HALFOP(mulx)
 719ADVSIMD_TWOHALFOP(mulx)
 720
 721/* fused multiply-accumulate */
 722uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c,
 723                                 void *fpstp)
 724{
 725    float_status *fpst = fpstp;
 726    return float16_muladd(a, b, c, 0, fpst);
 727}
 728
 729uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
 730                                  uint32_t two_c, void *fpstp)
 731{
 732    float_status *fpst = fpstp;
 733    float16  a1, a2, b1, b2, c1, c2;
 734    uint32_t r1, r2;
 735    a1 = extract32(two_a, 0, 16);
 736    a2 = extract32(two_a, 16, 16);
 737    b1 = extract32(two_b, 0, 16);
 738    b2 = extract32(two_b, 16, 16);
 739    c1 = extract32(two_c, 0, 16);
 740    c2 = extract32(two_c, 16, 16);
 741    r1 = float16_muladd(a1, b1, c1, 0, fpst);
 742    r2 = float16_muladd(a2, b2, c2, 0, fpst);
 743    return deposit32(r1, 16, 16, r2);
 744}
 745
 746/*
 747 * Floating point comparisons produce an integer result. Softfloat
 748 * routines return float_relation types which we convert to the 0/-1
 749 * Neon requires.
 750 */
 751
 752#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0
 753
 754uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp)
 755{
 756    float_status *fpst = fpstp;
 757    int compare = float16_compare_quiet(a, b, fpst);
 758    return ADVSIMD_CMPRES(compare == float_relation_equal);
 759}
 760
 761uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp)
 762{
 763    float_status *fpst = fpstp;
 764    int compare = float16_compare(a, b, fpst);
 765    return ADVSIMD_CMPRES(compare == float_relation_greater ||
 766                          compare == float_relation_equal);
 767}
 768
 769uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp)
 770{
 771    float_status *fpst = fpstp;
 772    int compare = float16_compare(a, b, fpst);
 773    return ADVSIMD_CMPRES(compare == float_relation_greater);
 774}
 775
 776uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp)
 777{
 778    float_status *fpst = fpstp;
 779    float16 f0 = float16_abs(a);
 780    float16 f1 = float16_abs(b);
 781    int compare = float16_compare(f0, f1, fpst);
 782    return ADVSIMD_CMPRES(compare == float_relation_greater ||
 783                          compare == float_relation_equal);
 784}
 785
 786uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
 787{
 788    float_status *fpst = fpstp;
 789    float16 f0 = float16_abs(a);
 790    float16 f1 = float16_abs(b);
 791    int compare = float16_compare(f0, f1, fpst);
 792    return ADVSIMD_CMPRES(compare == float_relation_greater);
 793}
 794
 795/* round to integral */
 796uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status)
 797{
 798    return float16_round_to_int(x, fp_status);
 799}
 800
 801uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status)
 802{
 803    int old_flags = get_float_exception_flags(fp_status), new_flags;
 804    float16 ret;
 805
 806    ret = float16_round_to_int(x, fp_status);
 807
 808    /* Suppress any inexact exceptions the conversion produced */
 809    if (!(old_flags & float_flag_inexact)) {
 810        new_flags = get_float_exception_flags(fp_status);
 811        set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
 812    }
 813
 814    return ret;
 815}
 816
 817/*
 818 * Half-precision floating point conversion functions
 819 *
 820 * There are a multitude of conversion functions with various
 821 * different rounding modes. This is dealt with by the calling code
 822 * setting the mode appropriately before calling the helper.
 823 */
 824
 825uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp)
 826{
 827    float_status *fpst = fpstp;
 828
 829    /* Invalid if we are passed a NaN */
 830    if (float16_is_any_nan(a)) {
 831        float_raise(float_flag_invalid, fpst);
 832        return 0;
 833    }
 834    return float16_to_int16(a, fpst);
 835}
 836
 837uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp)
 838{
 839    float_status *fpst = fpstp;
 840
 841    /* Invalid if we are passed a NaN */
 842    if (float16_is_any_nan(a)) {
 843        float_raise(float_flag_invalid, fpst);
 844        return 0;
 845    }
 846    return float16_to_uint16(a, fpst);
 847}
 848
 849static int el_from_spsr(uint32_t spsr)
 850{
 851    /* Return the exception level that this SPSR is requesting a return to,
 852     * or -1 if it is invalid (an illegal return)
 853     */
 854    if (spsr & PSTATE_nRW) {
 855        switch (spsr & CPSR_M) {
 856        case ARM_CPU_MODE_USR:
 857            return 0;
 858        case ARM_CPU_MODE_HYP:
 859            return 2;
 860        case ARM_CPU_MODE_FIQ:
 861        case ARM_CPU_MODE_IRQ:
 862        case ARM_CPU_MODE_SVC:
 863        case ARM_CPU_MODE_ABT:
 864        case ARM_CPU_MODE_UND:
 865        case ARM_CPU_MODE_SYS:
 866            return 1;
 867        case ARM_CPU_MODE_MON:
 868            /* Returning to Mon from AArch64 is never possible,
 869             * so this is an illegal return.
 870             */
 871        default:
 872            return -1;
 873        }
 874    } else {
 875        if (extract32(spsr, 1, 1)) {
 876            /* Return with reserved M[1] bit set */
 877            return -1;
 878        }
 879        if (extract32(spsr, 0, 4) == 1) {
 880            /* return to EL0 with M[0] bit set */
 881            return -1;
 882        }
 883        return extract32(spsr, 2, 2);
 884    }
 885}
 886
 887static void cpsr_write_from_spsr_elx(CPUARMState *env,
 888                                     uint32_t val)
 889{
 890    uint32_t mask;
 891
 892    /* Save SPSR_ELx.SS into PSTATE. */
 893    env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
 894    val &= ~PSTATE_SS;
 895
 896    /* Move DIT to the correct location for CPSR */
 897    if (val & PSTATE_DIT) {
 898        val &= ~PSTATE_DIT;
 899        val |= CPSR_DIT;
 900    }
 901
 902    mask = aarch32_cpsr_valid_mask(env->features, \
 903        &env_archcpu(env)->isar);
 904    cpsr_write(env, val, mask, CPSRWriteRaw);
 905}
 906
    /*
     * Perform an exception return from AArch64 state.
     *
     * @env:    CPU state; the saved status is read from the banked SPSR
     *          selected by the current exception level.
     * @new_pc: address at which execution resumes (restored from ELR_ELx,
     *          per the illegal-return comment near the end of the function).
     *
     * The target EL and register width are decoded from the SPSR and
     * validated.  A legal return switches to the target EL in either
     * AArch32 or AArch64 state and calls the EL change hooks; anything
     * else goes to illegal_return, which sets PSTATE.IL and stays at the
     * current EL.
     */
 907void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
 908{
 909    int cur_el = arm_current_el(env);
 910    unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
 911    uint32_t spsr = env->banked_spsr[spsr_idx];
 912    int new_el;
        /* SPSR.nRW clear means the return target is AArch64. */
 913    bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;
 914
 915    aarch64_save_sp(env, cur_el);
 916
 917    arm_clear_exclusive(env);
 918
 919    /* We must squash the PSTATE.SS bit to zero unless both of the
 920     * following hold:
 921     *  1. debug exceptions are currently disabled
 922     *  2. singlestep will be active in the EL we return to
 923     * We check 1 here and 2 after we've done the pstate/cpsr write() to
 924     * transition to the EL we're going to.
 925     */
 926    if (arm_generate_debug_exceptions(env)) {
 927        spsr &= ~PSTATE_SS;
 928    }
 929
        /* A result of -1 is treated as "no valid target EL" in the SPSR. */
 930    new_el = el_from_spsr(spsr);
 931    if (new_el == -1) {
 932        goto illegal_return;
 933    }
 934    if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) {
 935        /* Disallow return to an EL which is unimplemented or higher
 936         * than the current one.
 937         */
 938        goto illegal_return;
 939    }
 940
        /* The width check is skipped when the target is EL0. */
 941    if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) {
 942        /* Return to an EL which is configured for a different register width */
 943        goto illegal_return;
 944    }
 945
        /* A return to EL1 is illegal while the effective HCR_EL2 has TGE set. */
 946    if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
 947        goto illegal_return;
 948    }
 949
        /* The pre-EL-change hook is invoked with the iothread lock held. */
 950    qemu_mutex_lock_iothread();
 951    arm_call_pre_el_change_hook(env_archcpu(env));
 952    qemu_mutex_unlock_iothread();
 953
 954    if (!return_to_aa64) {
 955        env->aarch64 = false;
 956        /* We do a raw CPSR write because aarch64_sync_64_to_32()
 957         * will sort the register banks out for us, and we've already
 958         * caught all the bad-mode cases in el_from_spsr().
 959         */
 960        cpsr_write_from_spsr_elx(env, spsr);
            /* Second half of the PSTATE.SS check described above. */
 961        if (!arm_singlestep_active(env)) {
 962            env->pstate &= ~PSTATE_SS;
 963        }
 964        aarch64_sync_64_to_32(env);
 965
            /* Align the return address: 2 bytes if SPSR.T is set, else 4. */
 966        if (spsr & CPSR_T) {
 967            env->regs[15] = new_pc & ~0x1;
 968        } else {
 969            env->regs[15] = new_pc & ~0x3;
 970        }
 971        helper_rebuild_hflags_a32(env, new_el);
 972        qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
 973                      "AArch32 EL%d PC 0x%" PRIx32 "\n",
 974                      cur_el, new_el, env->regs[15]);
 975    } else {
 976        int tbii;
 977
 978        env->aarch64 = true;
            /* Keep only the bits allowed by aarch64_pstate_valid_mask(). */
 979        spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar);
 980        pstate_write(env, spsr);
            /* Second half of the PSTATE.SS check described above. */
 981        if (!arm_singlestep_active(env)) {
 982            env->pstate &= ~PSTATE_SS;
 983        }
 984        aarch64_restore_sp(env, new_el);
 985        helper_rebuild_hflags_a64(env, new_el);
 986
 987        /*
 988         * Apply TBI to the exception return address.  We had to delay this
 989         * until after we selected the new EL, so that we could select the
 990         * correct TBI+TBID bits.  This is made easier by waiting until after
 991         * the hflags rebuild, since we can pull the composite TBII field
 992         * from there.
 993         */
 994        tbii = EX_TBFLAG_A64(env->hflags, TBII);
            /* Bit 55 of the address picks which bit of TBII is tested. */
 995        if ((tbii >> extract64(new_pc, 55, 1)) & 1) {
 996            /* TBI is enabled. */
 997            int core_mmu_idx = cpu_mmu_index(env, false);
 998            if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) {
                    /* Two-range regime: sign-extend the low 56 bits. */
 999                new_pc = sextract64(new_pc, 0, 56);
1000            } else {

                    /* Otherwise zero-extend the low 56 bits (top byte cleared). */
1001                new_pc = extract64(new_pc, 0, 56);
1002            }
1003        }
1004        env->pc = new_pc;
1005
1006        qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
1007                      "AArch64 EL%d PC 0x%" PRIx64 "\n",
1008                      cur_el, new_el, env->pc);
1009    }
1010
1011    /*
1012     * Note that cur_el can never be 0.  If new_el is 0, then
1013     * el0_a64 is return_to_aa64, else el0_a64 is ignored.
1014     */
1015    aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64);
1016
        /* The post-change hook also runs with the iothread lock held. */
1017    qemu_mutex_lock_iothread();
1018    arm_call_el_change_hook(env_archcpu(env));
1019    qemu_mutex_unlock_iothread();
1020
1021    return;
1022
1023illegal_return:
1024    /* Illegal return events of various kinds have architecturally
1025     * mandated behaviour:
1026     * restore NZCV and DAIF from SPSR_ELx
1027     * set PSTATE.IL
1028     * restore PC from ELR_ELx
1029     * no change to exception level, execution state or stack pointer
1030     */
1031    env->pstate |= PSTATE_IL;
1032    env->pc = new_pc;
        /*
         * Take only NZCV and DAIF from the SPSR; every other bit comes from
         * the current PSTATE (which now includes IL).
         */
1033    spsr &= PSTATE_NZCV | PSTATE_DAIF;
1034    spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF);
1035    pstate_write(env, spsr);
1036    if (!arm_singlestep_active(env)) {
1037        env->pstate &= ~PSTATE_SS;
1038    }
        /* The EL is unchanged, so hflags are rebuilt for cur_el. */
1039    helper_rebuild_hflags_a64(env, cur_el);
1040    qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: "
1041                  "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc);
1042}
1043
1044/*
1045 * Square Root and Reciprocal square root
1046 */
1047
1048uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp)
1049{
1050    float_status *s = fpstp;
1051
1052    return float16_sqrt(a, s);
1053}
1054
1055void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
1056{
1057    /*
1058     * Implement DC ZVA, which zeroes a fixed-length block of memory.
1059     * Note that we do not implement the (architecturally mandated)
1060     * alignment fault for attempts to use this on Device memory
1061     * (which matches the usual QEMU behaviour of not implementing either
1062     * alignment faults or any memory attribute handling).
1063     */
1064    int blocklen = 4 << env_archcpu(env)->dcz_blocksize;
1065    uint64_t vaddr = vaddr_in & ~(blocklen - 1);
1066    int mmu_idx = cpu_mmu_index(env, false);
1067    void *mem;
1068
1069    /*
1070     * Trapless lookup.  In addition to actual invalid page, may
1071     * return NULL for I/O, watchpoints, clean pages, etc.
1072     */
1073    mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx);
1074
1075#ifndef CONFIG_USER_ONLY
1076    if (unlikely(!mem)) {
1077        uintptr_t ra = GETPC();
1078
1079        /*
1080         * Trap if accessing an invalid page.  DC_ZVA requires that we supply
1081         * the original pointer for an invalid page.  But watchpoints require
1082         * that we probe the actual space.  So do both.
1083         */
1084        (void) probe_write(env, vaddr_in, 1, mmu_idx, ra);
1085        mem = probe_write(env, vaddr, blocklen, mmu_idx, ra);
1086
1087        if (unlikely(!mem)) {
1088            /*
1089             * The only remaining reason for mem == NULL is I/O.
1090             * Just do a series of byte writes as the architecture demands.
1091             */
1092            for (int i = 0; i < blocklen; i++) {
1093                cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra);
1094            }
1095            return;
1096        }
1097    }
1098#endif
1099
1100    memset(mem, 0, blocklen);
1101}
1102