1 /* simulator.c -- Interface for the AArch64 simulator.
3 Copyright (C) 2015-2017 Free Software Foundation, Inc.
5 Contributed by Red Hat.
7 This file is part of GDB.
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 #include <sys/types.h>
31 #include "simulator.h"
38 #define TST(_flag) (aarch64_test_CPSR_bit (cpu, _flag))
39 #define IS_SET(_X) (TST (( _X )) ? 1 : 0)
40 #define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)
42 /* Space saver macro. */
43 #define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))
45 #define HALT_UNALLOC \
48 TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
50 "Unallocated instruction detected at sim line %d," \
51 " exe addr %" PRIx64, \
52 __LINE__, aarch64_get_PC (cpu)); \
53 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
54 sim_stopped, SIM_SIGILL); \
61 TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
63 "Unimplemented instruction detected at sim line %d," \
64 " exe addr %" PRIx64, \
65 __LINE__, aarch64_get_PC (cpu)); \
66 if (! TRACE_ANY_P (cpu)) \
67 sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
68 aarch64_get_instr (cpu)); \
69 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
70 sim_stopped, SIM_SIGABRT); \
74 #define NYI_assert(HI, LO, EXPECTED) \
77 if (INSTR ((HI), (LO)) != (EXPECTED)) \
82 /* Helper functions used by expandLogicalImmediate. */
84 /* for i = 1, ... N result<i-1> = 1 other bits are zero */
static inline uint64_t
ones (int N)
{
  /* Return a mask with the low N bits set, for N in [0, 64].
     BUG FIX: the old code computed (1UL << N) - 1, which is undefined
     behaviour on hosts where 'long' is 32 bits wide and N >= 32.
     Do all the arithmetic in uint64_t.  */
  return (N == 64 ? (uint64_t) -1 : (((uint64_t) 1 << N) - 1));
}
/* Extract bit N of VAL, delivered in bit 0 of the result.  */
static inline uint64_t
pickbit (uint64_t val, int N)
{
  /* A one-bit wide pickbits64 range [N, N] isolates exactly bit N.  */
  return pickbits64 (val, N, N);
}
/* Expand the ARMv8 logical-immediate encoding N:immr:imms (N, R, S)
   into the 64-bit value it denotes.  Returns 0 for reserved encodings;
   since 0 is never a valid logical immediate, callers can use it as a
   failure marker.

   PORTABILITY FIX: the element-size decode previously used the GCC-only
   "case 0x00 ... 0x1f:" range extension, and the replication step relied
   on unannotated switch fall-through; both are replaced with standard C
   that computes the same values.  */
static uint64_t
expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
{
  uint64_t mask;
  uint64_t imm;
  unsigned simd_size;

  /* The immediate value is S+1 bits to 1, left rotated by SIMDsize - R
     (in other words, right rotated by R), then replicated.  */
  if (N != 0)
    {
      simd_size = 64;
      mask = 0xffffffffffffffffull;
    }
  else
    {
      /* The element size is encoded by the position of the highest
	 clear bit of imms:  0xxxxx/10xxxx/110xxx/1110xx/11110x.  */
      uint32_t imms = S & 0x3f;

      if (imms < 0x20)       /* 0xxxxx */
	simd_size = 32;
      else if (imms < 0x30)  /* 10xxxx */
	{
	  simd_size = 16;
	  S &= 0xf;
	}
      else if (imms < 0x38)  /* 110xxx */
	{
	  simd_size = 8;
	  S &= 0x7;
	}
      else if (imms < 0x3c)  /* 1110xx */
	{
	  simd_size = 4;
	  S &= 0x3;
	}
      else if (imms < 0x3e)  /* 11110x */
	{
	  simd_size = 2;
	  S &= 0x1;
	}
      else                   /* 11111x is reserved.  */
	return 0;

      mask = (1ull << simd_size) - 1;
      /* Top bits of the rotation amount are IGNORED.  */
      R &= simd_size - 1;
    }

  /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
  if (S == simd_size - 1)
    return 0;

  /* S+1 consecutive bits to 1.  */
  /* NOTE: S can't be 63 due to detection above.  */
  imm = (1ull << (S + 1)) - 1;

  /* Rotate to the left by simd_size - R (i.e. right rotate by R).  */
  if (R != 0)
    imm = ((imm << (simd_size - R)) & mask) | (imm >> R);

  /* Replicate the element across the full 64 bits.  Doubling the
     pattern width each pass reproduces the old fall-through switch.  */
  for (unsigned width = simd_size; width < 64; width *= 2)
    imm |= imm << width;

  return imm;
}
/* Instr[22,10] encodes N immr and imms.  We want a lookup table
   for each possible combination i.e. 13 bits worth of int entries.  */
#define LI_TABLE_SIZE (1 << 13)

/* Decoded logical immediates indexed by the raw 13-bit N:immr:imms
   field; 0 marks an encoding that is not a valid logical immediate.  */
static uint64_t LITable[LI_TABLE_SIZE];

/* Populate LITable by expanding every possible N:immr:imms triple.  */
static void
aarch64_init_LIT_table (void)
{
  unsigned idx;

  for (idx = 0; idx < LI_TABLE_SIZE; idx++)
    {
      uint32_t N    = uimm (idx, 12, 12);
      uint32_t immr = uimm (idx, 11,  6);
      uint32_t imms = uimm (idx,  5,  0);

      LITable [idx] = expand_logical_immediate (imms, immr, N);
    }
}
176 dexNotify (sim_cpu
*cpu
)
178 /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
179 2 ==> exit Java, 3 ==> start next bytecode. */
180 uint32_t type
= INSTR (14, 0);
182 TRACE_EVENTS (cpu
, "Notify Insn encountered, type = 0x%x", type
);
187 /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
188 aarch64_get_reg_u64 (cpu, R22, 0)); */
191 /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
192 aarch64_get_reg_u64 (cpu, R22, 0)); */
195 /* aarch64_notifyMethodExit (); */
198 /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
199 aarch64_get_reg_u64 (cpu, R22, 0)); */
204 /* secondary decode within top level groups */
207 dexPseudo (sim_cpu
*cpu
)
209 /* assert instr[28,27] = 00
211 We provide 2 pseudo instructions:
213 HALT stops execution of the simulator causing an immediate
214 return to the x86 code which entered it.
216 CALLOUT initiates recursive entry into x86 code. A register
217 argument holds the address of the x86 routine. Immediate
218 values in the instruction identify the number of general
219 purpose and floating point register arguments to be passed
220 and the type of any value to be returned. */
222 uint32_t PSEUDO_HALT
= 0xE0000000U
;
223 uint32_t PSEUDO_CALLOUT
= 0x00018000U
;
224 uint32_t PSEUDO_CALLOUTR
= 0x00018001U
;
225 uint32_t PSEUDO_NOTIFY
= 0x00014000U
;
228 if (aarch64_get_instr (cpu
) == PSEUDO_HALT
)
230 TRACE_EVENTS (cpu
, " Pseudo Halt Instruction");
231 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
232 sim_stopped
, SIM_SIGTRAP
);
235 dispatch
= INSTR (31, 15);
237 /* We do not handle callouts at the moment. */
238 if (dispatch
== PSEUDO_CALLOUT
|| dispatch
== PSEUDO_CALLOUTR
)
240 TRACE_EVENTS (cpu
, " Callout");
241 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
242 sim_stopped
, SIM_SIGABRT
);
245 else if (dispatch
== PSEUDO_NOTIFY
)
252 /* Load-store single register (unscaled offset)
253 These instructions employ a base register plus an unscaled signed
256 N.B. the base register (source) can be Xn or SP. all other
257 registers may not be SP. */
259 /* 32 bit load 32 bit unscaled signed 9 bit. */
261 ldur32 (sim_cpu
*cpu
, int32_t offset
)
263 unsigned rn
= INSTR (9, 5);
264 unsigned rt
= INSTR (4, 0);
266 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
267 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u32
268 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
272 /* 64 bit load 64 bit unscaled signed 9 bit. */
274 ldur64 (sim_cpu
*cpu
, int32_t offset
)
276 unsigned rn
= INSTR (9, 5);
277 unsigned rt
= INSTR (4, 0);
279 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
280 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u64
281 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
285 /* 32 bit load zero-extended byte unscaled signed 9 bit. */
287 ldurb32 (sim_cpu
*cpu
, int32_t offset
)
289 unsigned rn
= INSTR (9, 5);
290 unsigned rt
= INSTR (4, 0);
292 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
293 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u8
294 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
298 /* 32 bit load sign-extended byte unscaled signed 9 bit. */
300 ldursb32 (sim_cpu
*cpu
, int32_t offset
)
302 unsigned rn
= INSTR (9, 5);
303 unsigned rt
= INSTR (4, 0);
305 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
306 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, (uint32_t) aarch64_get_mem_s8
307 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
311 /* 64 bit load sign-extended byte unscaled signed 9 bit. */
313 ldursb64 (sim_cpu
*cpu
, int32_t offset
)
315 unsigned rn
= INSTR (9, 5);
316 unsigned rt
= INSTR (4, 0);
318 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
319 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, aarch64_get_mem_s8
320 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
324 /* 32 bit load zero-extended short unscaled signed 9 bit */
326 ldurh32 (sim_cpu
*cpu
, int32_t offset
)
328 unsigned rn
= INSTR (9, 5);
329 unsigned rd
= INSTR (4, 0);
331 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
332 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, aarch64_get_mem_u16
333 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
337 /* 32 bit load sign-extended short unscaled signed 9 bit */
339 ldursh32 (sim_cpu
*cpu
, int32_t offset
)
341 unsigned rn
= INSTR (9, 5);
342 unsigned rd
= INSTR (4, 0);
344 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
345 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (uint32_t) aarch64_get_mem_s16
346 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
350 /* 64 bit load sign-extended short unscaled signed 9 bit */
352 ldursh64 (sim_cpu
*cpu
, int32_t offset
)
354 unsigned rn
= INSTR (9, 5);
355 unsigned rt
= INSTR (4, 0);
357 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
358 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, aarch64_get_mem_s16
359 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
363 /* 64 bit load sign-extended word unscaled signed 9 bit */
365 ldursw (sim_cpu
*cpu
, int32_t offset
)
367 unsigned rn
= INSTR (9, 5);
368 unsigned rd
= INSTR (4, 0);
370 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
371 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (uint32_t) aarch64_get_mem_s32
372 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
376 /* N.B. with stores the value in source is written to the address
377 identified by source2 modified by offset. */
379 /* 32 bit store 32 bit unscaled signed 9 bit. */
381 stur32 (sim_cpu
*cpu
, int32_t offset
)
383 unsigned rn
= INSTR (9, 5);
384 unsigned rd
= INSTR (4, 0);
386 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
387 aarch64_set_mem_u32 (cpu
,
388 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
,
389 aarch64_get_reg_u32 (cpu
, rd
, NO_SP
));
392 /* 64 bit store 64 bit unscaled signed 9 bit */
394 stur64 (sim_cpu
*cpu
, int32_t offset
)
396 unsigned rn
= INSTR (9, 5);
397 unsigned rd
= INSTR (4, 0);
399 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
400 aarch64_set_mem_u64 (cpu
,
401 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
,
402 aarch64_get_reg_u64 (cpu
, rd
, NO_SP
));
405 /* 32 bit store byte unscaled signed 9 bit */
407 sturb (sim_cpu
*cpu
, int32_t offset
)
409 unsigned rn
= INSTR (9, 5);
410 unsigned rd
= INSTR (4, 0);
412 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
413 aarch64_set_mem_u8 (cpu
,
414 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
,
415 aarch64_get_reg_u8 (cpu
, rd
, NO_SP
));
418 /* 32 bit store short unscaled signed 9 bit */
420 sturh (sim_cpu
*cpu
, int32_t offset
)
422 unsigned rn
= INSTR (9, 5);
423 unsigned rd
= INSTR (4, 0);
425 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
426 aarch64_set_mem_u16 (cpu
,
427 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
,
428 aarch64_get_reg_u16 (cpu
, rd
, NO_SP
));
431 /* Load single register pc-relative label
432 Offset is a signed 19 bit immediate count in words
435 /* 32 bit pc-relative load */
437 ldr32_pcrel (sim_cpu
*cpu
, int32_t offset
)
439 unsigned rd
= INSTR (4, 0);
441 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
442 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
444 (cpu
, aarch64_get_PC (cpu
) + offset
* 4));
447 /* 64 bit pc-relative load */
449 ldr_pcrel (sim_cpu
*cpu
, int32_t offset
)
451 unsigned rd
= INSTR (4, 0);
453 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
454 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
456 (cpu
, aarch64_get_PC (cpu
) + offset
* 4));
459 /* sign extended 32 bit pc-relative load */
461 ldrsw_pcrel (sim_cpu
*cpu
, int32_t offset
)
463 unsigned rd
= INSTR (4, 0);
465 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
466 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
468 (cpu
, aarch64_get_PC (cpu
) + offset
* 4));
471 /* float pc-relative load */
473 fldrs_pcrel (sim_cpu
*cpu
, int32_t offset
)
475 unsigned int rd
= INSTR (4, 0);
477 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
478 aarch64_set_vec_u32 (cpu
, rd
, 0,
480 (cpu
, aarch64_get_PC (cpu
) + offset
* 4));
483 /* double pc-relative load */
485 fldrd_pcrel (sim_cpu
*cpu
, int32_t offset
)
487 unsigned int st
= INSTR (4, 0);
489 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
490 aarch64_set_vec_u64 (cpu
, st
, 0,
492 (cpu
, aarch64_get_PC (cpu
) + offset
* 4));
495 /* long double pc-relative load. */
497 fldrq_pcrel (sim_cpu
*cpu
, int32_t offset
)
499 unsigned int st
= INSTR (4, 0);
500 uint64_t addr
= aarch64_get_PC (cpu
) + offset
* 4;
503 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
504 aarch64_get_mem_long_double (cpu
, addr
, & a
);
505 aarch64_set_FP_long_double (cpu
, st
, a
);
508 /* This can be used to scale an offset by applying
509 the requisite shift. the second argument is either
512 #define SCALE(_offset, _elementSize) \
513 ((_offset) << ScaleShift ## _elementSize)
515 /* This can be used to optionally scale a register derived offset
516 by applying the requisite shift as indicated by the Scaling
517 argument. The second argument is either Byte, Short, Word
518 or Long. The third argument is either Scaled or Unscaled.
519 N.B. when _Scaling is Scaled the shift gets ANDed with
520 all 1s while when it is Unscaled it gets ANDed with 0. */
522 #define OPT_SCALE(_offset, _elementType, _Scaling) \
523 ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
525 /* This can be used to zero or sign extend a 32 bit register derived
526 value to a 64 bit value. the first argument must be the value as
527 a uint32_t and the second must be either UXTW or SXTW. The result
528 is returned as an int64_t. */
530 static inline int64_t
531 extend (uint32_t value
, Extension extension
)
539 /* A branchless variant of this ought to be possible. */
540 if (extension
== UXTW
|| extension
== NoExtension
)
547 /* Scalar Floating Point
549 FP load/store single register (4 addressing modes)
551 N.B. the base register (source) can be the stack pointer.
552 The secondary source register (source2) can only be an Xn register. */
554 /* Load 32 bit unscaled signed 9 bit with pre- or post-writeback. */
556 fldrs_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
558 unsigned rn
= INSTR (9, 5);
559 unsigned st
= INSTR (4, 0);
560 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
565 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
566 aarch64_set_vec_u32 (cpu
, st
, 0, aarch64_get_mem_u32 (cpu
, address
));
570 if (wb
!= NoWriteBack
)
571 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
574 /* Load 8 bit with unsigned 12 bit offset. */
576 fldrb_abs (sim_cpu
*cpu
, uint32_t offset
)
578 unsigned rd
= INSTR (4, 0);
579 unsigned rn
= INSTR (9, 5);
580 uint64_t addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
;
582 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
583 aarch64_set_vec_u8 (cpu
, rd
, 0, aarch64_get_mem_u32 (cpu
, addr
));
586 /* Load 16 bit scaled unsigned 12 bit. */
588 fldrh_abs (sim_cpu
*cpu
, uint32_t offset
)
590 unsigned rd
= INSTR (4, 0);
591 unsigned rn
= INSTR (9, 5);
592 uint64_t addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 16);
594 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
595 aarch64_set_vec_u16 (cpu
, rd
, 0, aarch64_get_mem_u16 (cpu
, addr
));
598 /* Load 32 bit scaled unsigned 12 bit. */
600 fldrs_abs (sim_cpu
*cpu
, uint32_t offset
)
602 unsigned rd
= INSTR (4, 0);
603 unsigned rn
= INSTR (9, 5);
604 uint64_t addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 32);
606 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
607 aarch64_set_vec_u32 (cpu
, rd
, 0, aarch64_get_mem_u32 (cpu
, addr
));
610 /* Load 64 bit scaled unsigned 12 bit. */
612 fldrd_abs (sim_cpu
*cpu
, uint32_t offset
)
614 unsigned rd
= INSTR (4, 0);
615 unsigned rn
= INSTR (9, 5);
616 uint64_t addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 64);
618 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
619 aarch64_set_vec_u64 (cpu
, rd
, 0, aarch64_get_mem_u64 (cpu
, addr
));
622 /* Load 128 bit scaled unsigned 12 bit. */
624 fldrq_abs (sim_cpu
*cpu
, uint32_t offset
)
626 unsigned rd
= INSTR (4, 0);
627 unsigned rn
= INSTR (9, 5);
628 uint64_t addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 128);
630 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
631 aarch64_set_vec_u64 (cpu
, rd
, 0, aarch64_get_mem_u64 (cpu
, addr
));
632 aarch64_set_vec_u64 (cpu
, rd
, 1, aarch64_get_mem_u64 (cpu
, addr
+ 8));
635 /* Load 32 bit scaled or unscaled zero- or sign-extended
636 32-bit register offset. */
638 fldrs_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
640 unsigned rm
= INSTR (20, 16);
641 unsigned rn
= INSTR (9, 5);
642 unsigned st
= INSTR (4, 0);
643 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
644 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
645 uint64_t displacement
= OPT_SCALE (extended
, 32, scaling
);
647 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
648 aarch64_set_vec_u32 (cpu
, st
, 0, aarch64_get_mem_u32
649 (cpu
, address
+ displacement
));
652 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback. */
654 fldrd_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
656 unsigned rn
= INSTR (9, 5);
657 unsigned st
= INSTR (4, 0);
658 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
663 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
664 aarch64_set_vec_u64 (cpu
, st
, 0, aarch64_get_mem_u64 (cpu
, address
));
669 if (wb
!= NoWriteBack
)
670 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
673 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset. */
675 fldrd_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
677 unsigned rm
= INSTR (20, 16);
678 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
679 uint64_t displacement
= OPT_SCALE (extended
, 64, scaling
);
681 fldrd_wb (cpu
, displacement
, NoWriteBack
);
684 /* Load 128 bit unscaled signed 9 bit with pre- or post-writeback. */
686 fldrq_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
689 unsigned rn
= INSTR (9, 5);
690 unsigned st
= INSTR (4, 0);
691 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
696 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
697 aarch64_get_mem_long_double (cpu
, address
, & a
);
698 aarch64_set_FP_long_double (cpu
, st
, a
);
703 if (wb
!= NoWriteBack
)
704 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
707 /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset */
709 fldrq_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
711 unsigned rm
= INSTR (20, 16);
712 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
713 uint64_t displacement
= OPT_SCALE (extended
, 128, scaling
);
715 fldrq_wb (cpu
, displacement
, NoWriteBack
);
720 load-store single register
721 There are four addressing modes available here which all employ a
722 64 bit source (base) register.
724 N.B. the base register (source) can be the stack pointer.
725 The secondary source register (source2)can only be an Xn register.
727 Scaled, 12-bit, unsigned immediate offset, without pre- and
729 Unscaled, 9-bit, signed immediate offset with pre- or post-index
731 scaled or unscaled 64-bit register offset.
732 scaled or unscaled 32-bit extended register offset.
734 All offsets are assumed to be raw from the decode i.e. the
735 simulator is expected to adjust scaled offsets based on the
736 accessed data size with register or extended register offset
737 versions the same applies except that in the latter case the
738 operation may also require a sign extend.
740 A separate method is provided for each possible addressing mode. */
742 /* 32 bit load 32 bit scaled unsigned 12 bit */
744 ldr32_abs (sim_cpu
*cpu
, uint32_t offset
)
746 unsigned rn
= INSTR (9, 5);
747 unsigned rt
= INSTR (4, 0);
749 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
750 /* The target register may not be SP but the source may be. */
751 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u32
752 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
753 + SCALE (offset
, 32)));
756 /* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback. */
758 ldr32_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
760 unsigned rn
= INSTR (9, 5);
761 unsigned rt
= INSTR (4, 0);
764 if (rn
== rt
&& wb
!= NoWriteBack
)
767 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
772 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
773 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u32 (cpu
, address
));
778 if (wb
!= NoWriteBack
)
779 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
782 /* 32 bit load 32 bit scaled or unscaled
783 zero- or sign-extended 32-bit register offset */
785 ldr32_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
787 unsigned rm
= INSTR (20, 16);
788 unsigned rn
= INSTR (9, 5);
789 unsigned rt
= INSTR (4, 0);
790 /* rn may reference SP, rm and rt must reference ZR */
792 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
793 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
794 uint64_t displacement
= OPT_SCALE (extended
, 32, scaling
);
796 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
797 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
798 aarch64_get_mem_u32 (cpu
, address
+ displacement
));
801 /* 64 bit load 64 bit scaled unsigned 12 bit */
803 ldr_abs (sim_cpu
*cpu
, uint32_t offset
)
805 unsigned rn
= INSTR (9, 5);
806 unsigned rt
= INSTR (4, 0);
808 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
809 /* The target register may not be SP but the source may be. */
810 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u64
811 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
812 + SCALE (offset
, 64)));
815 /* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback. */
817 ldr_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
819 unsigned rn
= INSTR (9, 5);
820 unsigned rt
= INSTR (4, 0);
823 if (rn
== rt
&& wb
!= NoWriteBack
)
826 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
831 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
832 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u64 (cpu
, address
));
837 if (wb
!= NoWriteBack
)
838 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
841 /* 64 bit load 64 bit scaled or unscaled zero-
842 or sign-extended 32-bit register offset. */
844 ldr_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
846 unsigned rm
= INSTR (20, 16);
847 unsigned rn
= INSTR (9, 5);
848 unsigned rt
= INSTR (4, 0);
849 /* rn may reference SP, rm and rt must reference ZR */
851 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
852 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
853 uint64_t displacement
= OPT_SCALE (extended
, 64, scaling
);
855 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
856 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
857 aarch64_get_mem_u64 (cpu
, address
+ displacement
));
860 /* 32 bit load zero-extended byte scaled unsigned 12 bit. */
862 ldrb32_abs (sim_cpu
*cpu
, uint32_t offset
)
864 unsigned rn
= INSTR (9, 5);
865 unsigned rt
= INSTR (4, 0);
867 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
868 /* The target register may not be SP but the source may be
869 there is no scaling required for a byte load. */
870 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
872 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
));
875 /* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback. */
877 ldrb32_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
879 unsigned rn
= INSTR (9, 5);
880 unsigned rt
= INSTR (4, 0);
883 if (rn
== rt
&& wb
!= NoWriteBack
)
886 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
891 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
892 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u8 (cpu
, address
));
897 if (wb
!= NoWriteBack
)
898 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
901 /* 32 bit load zero-extended byte scaled or unscaled zero-
902 or sign-extended 32-bit register offset. */
904 ldrb32_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
906 unsigned rm
= INSTR (20, 16);
907 unsigned rn
= INSTR (9, 5);
908 unsigned rt
= INSTR (4, 0);
909 /* rn may reference SP, rm and rt must reference ZR */
911 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
912 int64_t displacement
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
915 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
916 /* There is no scaling required for a byte load. */
917 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
918 aarch64_get_mem_u8 (cpu
, address
+ displacement
));
921 /* 64 bit load sign-extended byte unscaled signed 9 bit
922 with pre- or post-writeback. */
924 ldrsb_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
926 unsigned rn
= INSTR (9, 5);
927 unsigned rt
= INSTR (4, 0);
931 if (rn
== rt
&& wb
!= NoWriteBack
)
934 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
939 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
940 val
= aarch64_get_mem_s8 (cpu
, address
);
941 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, val
);
946 if (wb
!= NoWriteBack
)
947 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
950 /* 64 bit load sign-extended byte scaled unsigned 12 bit. */
952 ldrsb_abs (sim_cpu
*cpu
, uint32_t offset
)
954 ldrsb_wb (cpu
, offset
, NoWriteBack
);
957 /* 64 bit load sign-extended byte scaled or unscaled zero-
958 or sign-extended 32-bit register offset. */
960 ldrsb_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
962 unsigned rm
= INSTR (20, 16);
963 unsigned rn
= INSTR (9, 5);
964 unsigned rt
= INSTR (4, 0);
965 /* rn may reference SP, rm and rt must reference ZR */
967 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
968 int64_t displacement
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
970 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
971 /* There is no scaling required for a byte load. */
972 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
,
973 aarch64_get_mem_s8 (cpu
, address
+ displacement
));
976 /* 32 bit load zero-extended short scaled unsigned 12 bit. */
978 ldrh32_abs (sim_cpu
*cpu
, uint32_t offset
)
980 unsigned rn
= INSTR (9, 5);
981 unsigned rt
= INSTR (4, 0);
984 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
985 /* The target register may not be SP but the source may be. */
986 val
= aarch64_get_mem_u16 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
987 + SCALE (offset
, 16));
988 aarch64_set_reg_u32 (cpu
, rt
, NO_SP
, val
);
991 /* 32 bit load zero-extended short unscaled signed 9 bit
992 with pre- or post-writeback. */
994 ldrh32_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
996 unsigned rn
= INSTR (9, 5);
997 unsigned rt
= INSTR (4, 0);
1000 if (rn
== rt
&& wb
!= NoWriteBack
)
1003 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1008 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1009 aarch64_set_reg_u32 (cpu
, rt
, NO_SP
, aarch64_get_mem_u16 (cpu
, address
));
1014 if (wb
!= NoWriteBack
)
1015 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1018 /* 32 bit load zero-extended short scaled or unscaled zero-
1019 or sign-extended 32-bit register offset. */
1021 ldrh32_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1023 unsigned rm
= INSTR (20, 16);
1024 unsigned rn
= INSTR (9, 5);
1025 unsigned rt
= INSTR (4, 0);
1026 /* rn may reference SP, rm and rt must reference ZR */
1028 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1029 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
1030 uint64_t displacement
= OPT_SCALE (extended
, 16, scaling
);
1032 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1033 aarch64_set_reg_u32 (cpu
, rt
, NO_SP
,
1034 aarch64_get_mem_u16 (cpu
, address
+ displacement
));
1037 /* 32 bit load sign-extended short scaled unsigned 12 bit. */
1039 ldrsh32_abs (sim_cpu
*cpu
, uint32_t offset
)
1041 unsigned rn
= INSTR (9, 5);
1042 unsigned rt
= INSTR (4, 0);
1045 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1046 /* The target register may not be SP but the source may be. */
1047 val
= aarch64_get_mem_s16 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
1048 + SCALE (offset
, 16));
1049 aarch64_set_reg_s32 (cpu
, rt
, NO_SP
, val
);
1052 /* 32 bit load sign-extended short unscaled signed 9 bit
1053 with pre- or post-writeback. */
1055 ldrsh32_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1057 unsigned rn
= INSTR (9, 5);
1058 unsigned rt
= INSTR (4, 0);
1061 if (rn
== rt
&& wb
!= NoWriteBack
)
1064 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1069 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1070 aarch64_set_reg_s32 (cpu
, rt
, NO_SP
,
1071 (int32_t) aarch64_get_mem_s16 (cpu
, address
));
1076 if (wb
!= NoWriteBack
)
1077 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1080 /* 32 bit load sign-extended short scaled or unscaled zero-
1081 or sign-extended 32-bit register offset. */
1083 ldrsh32_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1085 unsigned rm
= INSTR (20, 16);
1086 unsigned rn
= INSTR (9, 5);
1087 unsigned rt
= INSTR (4, 0);
1088 /* rn may reference SP, rm and rt must reference ZR */
1090 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1091 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
1092 uint64_t displacement
= OPT_SCALE (extended
, 16, scaling
);
1094 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1095 aarch64_set_reg_s32 (cpu
, rt
, NO_SP
,
1096 (int32_t) aarch64_get_mem_s16
1097 (cpu
, address
+ displacement
));
1100 /* 64 bit load sign-extended short scaled unsigned 12 bit. */
1102 ldrsh_abs (sim_cpu
*cpu
, uint32_t offset
)
1104 unsigned rn
= INSTR (9, 5);
1105 unsigned rt
= INSTR (4, 0);
1108 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1109 /* The target register may not be SP but the source may be. */
1110 val
= aarch64_get_mem_s16 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
1111 + SCALE (offset
, 16));
1112 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, val
);
1115 /* 64 bit load sign-extended short unscaled signed 9 bit
1116 with pre- or post-writeback. */
1118 ldrsh64_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1120 unsigned rn
= INSTR (9, 5);
1121 unsigned rt
= INSTR (4, 0);
1125 if (rn
== rt
&& wb
!= NoWriteBack
)
1128 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1129 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1134 val
= aarch64_get_mem_s16 (cpu
, address
);
1135 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, val
);
1140 if (wb
!= NoWriteBack
)
1141 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1144 /* 64 bit load sign-extended short scaled or unscaled zero-
1145 or sign-extended 32-bit register offset. */
1147 ldrsh_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1149 unsigned rm
= INSTR (20, 16);
1150 unsigned rn
= INSTR (9, 5);
1151 unsigned rt
= INSTR (4, 0);
1153 /* rn may reference SP, rm and rt must reference ZR */
1155 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1156 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
1157 uint64_t displacement
= OPT_SCALE (extended
, 16, scaling
);
1160 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1161 val
= aarch64_get_mem_s16 (cpu
, address
+ displacement
);
1162 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, val
);
1165 /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit. */
1167 ldrsw_abs (sim_cpu
*cpu
, uint32_t offset
)
1169 unsigned rn
= INSTR (9, 5);
1170 unsigned rt
= INSTR (4, 0);
1173 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1174 val
= aarch64_get_mem_s32 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
1175 + SCALE (offset
, 32));
1176 /* The target register may not be SP but the source may be. */
1177 return aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, val
);
1180 /* 64 bit load sign-extended 32 bit unscaled signed 9 bit
1181 with pre- or post-writeback. */
1183 ldrsw_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1185 unsigned rn
= INSTR (9, 5);
1186 unsigned rt
= INSTR (4, 0);
1189 if (rn
== rt
&& wb
!= NoWriteBack
)
1192 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1197 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1198 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
, aarch64_get_mem_s32 (cpu
, address
));
1203 if (wb
!= NoWriteBack
)
1204 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1207 /* 64 bit load sign-extended 32 bit scaled or unscaled zero-
1208 or sign-extended 32-bit register offset. */
1210 ldrsw_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1212 unsigned rm
= INSTR (20, 16);
1213 unsigned rn
= INSTR (9, 5);
1214 unsigned rt
= INSTR (4, 0);
1215 /* rn may reference SP, rm and rt must reference ZR */
1217 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1218 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
1219 uint64_t displacement
= OPT_SCALE (extended
, 32, scaling
);
1221 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1222 aarch64_set_reg_s64 (cpu
, rt
, NO_SP
,
1223 aarch64_get_mem_s32 (cpu
, address
+ displacement
));
1226 /* N.B. with stores the value in source is written to the
1227 address identified by source2 modified by source3/offset. */
1229 /* 32 bit store scaled unsigned 12 bit. */
1231 str32_abs (sim_cpu
*cpu
, uint32_t offset
)
1233 unsigned rn
= INSTR (9, 5);
1234 unsigned rt
= INSTR (4, 0);
1236 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1237 /* The target register may not be SP but the source may be. */
1238 aarch64_set_mem_u32 (cpu
, (aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
1239 + SCALE (offset
, 32)),
1240 aarch64_get_reg_u32 (cpu
, rt
, NO_SP
));
1243 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
1245 str32_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1247 unsigned rn
= INSTR (9, 5);
1248 unsigned rt
= INSTR (4, 0);
1251 if (rn
== rt
&& wb
!= NoWriteBack
)
1254 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1258 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1259 aarch64_set_mem_u32 (cpu
, address
, aarch64_get_reg_u32 (cpu
, rt
, NO_SP
));
1264 if (wb
!= NoWriteBack
)
1265 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1268 /* 32 bit store scaled or unscaled zero- or
1269 sign-extended 32-bit register offset. */
1271 str32_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1273 unsigned rm
= INSTR (20, 16);
1274 unsigned rn
= INSTR (9, 5);
1275 unsigned rt
= INSTR (4, 0);
1277 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1278 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
1279 uint64_t displacement
= OPT_SCALE (extended
, 32, scaling
);
1281 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1282 aarch64_set_mem_u32 (cpu
, address
+ displacement
,
1283 aarch64_get_reg_u64 (cpu
, rt
, NO_SP
));
1286 /* 64 bit store scaled unsigned 12 bit. */
1288 str_abs (sim_cpu
*cpu
, uint32_t offset
)
1290 unsigned rn
= INSTR (9, 5);
1291 unsigned rt
= INSTR (4, 0);
1293 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1294 aarch64_set_mem_u64 (cpu
,
1295 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
1296 + SCALE (offset
, 64),
1297 aarch64_get_reg_u64 (cpu
, rt
, NO_SP
));
1300 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
1302 str_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1304 unsigned rn
= INSTR (9, 5);
1305 unsigned rt
= INSTR (4, 0);
1308 if (rn
== rt
&& wb
!= NoWriteBack
)
1311 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1316 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1317 aarch64_set_mem_u64 (cpu
, address
, aarch64_get_reg_u64 (cpu
, rt
, NO_SP
));
1322 if (wb
!= NoWriteBack
)
1323 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1326 /* 64 bit store scaled or unscaled zero-
1327 or sign-extended 32-bit register offset. */
1329 str_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1331 unsigned rm
= INSTR (20, 16);
1332 unsigned rn
= INSTR (9, 5);
1333 unsigned rt
= INSTR (4, 0);
1334 /* rn may reference SP, rm and rt must reference ZR */
1336 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1337 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
1339 uint64_t displacement
= OPT_SCALE (extended
, 64, scaling
);
1341 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1342 aarch64_set_mem_u64 (cpu
, address
+ displacement
,
1343 aarch64_get_reg_u64 (cpu
, rt
, NO_SP
));
1346 /* 32 bit store byte scaled unsigned 12 bit. */
1348 strb_abs (sim_cpu
*cpu
, uint32_t offset
)
1350 unsigned rn
= INSTR (9, 5);
1351 unsigned rt
= INSTR (4, 0);
1353 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1354 /* The target register may not be SP but the source may be.
1355 There is no scaling required for a byte load. */
1356 aarch64_set_mem_u8 (cpu
,
1357 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
,
1358 aarch64_get_reg_u8 (cpu
, rt
, NO_SP
));
1361 /* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback. */
1363 strb_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1365 unsigned rn
= INSTR (9, 5);
1366 unsigned rt
= INSTR (4, 0);
1369 if (rn
== rt
&& wb
!= NoWriteBack
)
1372 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1377 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1378 aarch64_set_mem_u8 (cpu
, address
, aarch64_get_reg_u8 (cpu
, rt
, NO_SP
));
1383 if (wb
!= NoWriteBack
)
1384 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1387 /* 32 bit store byte scaled or unscaled zero-
1388 or sign-extended 32-bit register offset. */
1390 strb_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1392 unsigned rm
= INSTR (20, 16);
1393 unsigned rn
= INSTR (9, 5);
1394 unsigned rt
= INSTR (4, 0);
1395 /* rn may reference SP, rm and rt must reference ZR */
1397 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1398 int64_t displacement
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
1401 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1402 /* There is no scaling required for a byte load. */
1403 aarch64_set_mem_u8 (cpu
, address
+ displacement
,
1404 aarch64_get_reg_u8 (cpu
, rt
, NO_SP
));
1407 /* 32 bit store short scaled unsigned 12 bit. */
1409 strh_abs (sim_cpu
*cpu
, uint32_t offset
)
1411 unsigned rn
= INSTR (9, 5);
1412 unsigned rt
= INSTR (4, 0);
1414 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1415 /* The target register may not be SP but the source may be. */
1416 aarch64_set_mem_u16 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
1417 + SCALE (offset
, 16),
1418 aarch64_get_reg_u16 (cpu
, rt
, NO_SP
));
1421 /* 32 bit store short unscaled signed 9 bit with pre- or post-writeback. */
1423 strh_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
1425 unsigned rn
= INSTR (9, 5);
1426 unsigned rt
= INSTR (4, 0);
1429 if (rn
== rt
&& wb
!= NoWriteBack
)
1432 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1437 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1438 aarch64_set_mem_u16 (cpu
, address
, aarch64_get_reg_u16 (cpu
, rt
, NO_SP
));
1443 if (wb
!= NoWriteBack
)
1444 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
1447 /* 32 bit store short scaled or unscaled zero-
1448 or sign-extended 32-bit register offset. */
1450 strh_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1452 unsigned rm
= INSTR (20, 16);
1453 unsigned rn
= INSTR (9, 5);
1454 unsigned rt
= INSTR (4, 0);
1455 /* rn may reference SP, rm and rt must reference ZR */
1457 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1458 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), extension
);
1459 uint64_t displacement
= OPT_SCALE (extended
, 16, scaling
);
1461 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1462 aarch64_set_mem_u16 (cpu
, address
+ displacement
,
1463 aarch64_get_reg_u16 (cpu
, rt
, NO_SP
));
1466 /* Prefetch unsigned 12 bit. */
1468 prfm_abs (sim_cpu
*cpu
, uint32_t offset
)
1470 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1471 00010 ==> PLDL2KEEP, 00001 ==> PLDL2STRM,
1472 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1473 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1474 10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM,
1475 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1477 PrfOp prfop = prfop (instr, 4, 0);
1478 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
1479 + SCALE (offset, 64). */
1481 /* TODO : implement prefetch of address. */
1484 /* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset. */
1486 prfm_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
1488 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1489 00010 ==> PLDL2KEEP, 00001 ==> PLDL2STRM,
1490 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1491 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1492 10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM,
1493 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1495 rn may reference SP, rm may only reference ZR
1496 PrfOp prfop = prfop (instr, 4, 0);
1497 uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1498 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1500 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1501 uint64_t address = base + displacement. */
1503 /* TODO : implement prefetch of address */
1506 /* 64 bit pc-relative prefetch. */
1508 prfm_pcrel (sim_cpu
*cpu
, int32_t offset
)
1510 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1511 00010 ==> PLDL2KEEP, 00001 ==> PLDL2STRM,
1512 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1513 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1514 10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM,
1515 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1517 PrfOp prfop = prfop (instr, 4, 0);
1518 uint64_t address = aarch64_get_PC (cpu) + offset. */
1520 /* TODO : implement this */
1523 /* Load-store exclusive. */
1528 unsigned rn
= INSTR (9, 5);
1529 unsigned rt
= INSTR (4, 0);
1530 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1531 int size
= INSTR (31, 30);
1532 /* int ordered = INSTR (15, 15); */
1533 /* int exclusive = ! INSTR (23, 23); */
1535 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1539 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u8 (cpu
, address
));
1542 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u16 (cpu
, address
));
1545 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u32 (cpu
, address
));
1548 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
, aarch64_get_mem_u64 (cpu
, address
));
1556 unsigned rn
= INSTR (9, 5);
1557 unsigned rt
= INSTR (4, 0);
1558 unsigned rs
= INSTR (20, 16);
1559 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1560 int size
= INSTR (31, 30);
1561 uint64_t data
= aarch64_get_reg_u64 (cpu
, rt
, NO_SP
);
1565 case 0: aarch64_set_mem_u8 (cpu
, address
, data
); break;
1566 case 1: aarch64_set_mem_u16 (cpu
, address
, data
); break;
1567 case 2: aarch64_set_mem_u32 (cpu
, address
, data
); break;
1568 case 3: aarch64_set_mem_u64 (cpu
, address
, data
); break;
1571 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1572 aarch64_set_reg_u64 (cpu
, rs
, NO_SP
, 0); /* Always exclusive... */
1576 dexLoadLiteral (sim_cpu
*cpu
)
1578 /* instr[29,27] == 011
1580 instr[31,30:26] = opc: 000 ==> LDRW, 001 ==> FLDRS
1581 010 ==> LDRX, 011 ==> FLDRD
1582 100 ==> LDRSW, 101 ==> FLDRQ
1583 110 ==> PRFM, 111 ==> UNALLOC
1584 instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
1585 instr[23, 5] == simm19 */
1587 /* unsigned rt = INSTR (4, 0); */
1588 uint32_t dispatch
= (INSTR (31, 30) << 1) | INSTR (26, 26);
1589 int32_t imm
= simm32 (aarch64_get_instr (cpu
), 23, 5);
1593 case 0: ldr32_pcrel (cpu
, imm
); break;
1594 case 1: fldrs_pcrel (cpu
, imm
); break;
1595 case 2: ldr_pcrel (cpu
, imm
); break;
1596 case 3: fldrd_pcrel (cpu
, imm
); break;
1597 case 4: ldrsw_pcrel (cpu
, imm
); break;
1598 case 5: fldrq_pcrel (cpu
, imm
); break;
1599 case 6: prfm_pcrel (cpu
, imm
); break;
1606 /* Immediate arithmetic
1607 The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
1608 value left shifted by 12 bits (done at decode).
1610 N.B. the register args (dest, source) can normally be Xn or SP.
1611 the exception occurs for flag setting instructions which may
1612 only use Xn for the output (dest). */
1614 /* 32 bit add immediate. */
1616 add32 (sim_cpu
*cpu
, uint32_t aimm
)
1618 unsigned rn
= INSTR (9, 5);
1619 unsigned rd
= INSTR (4, 0);
1621 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1622 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
1623 aarch64_get_reg_u32 (cpu
, rn
, SP_OK
) + aimm
);
1626 /* 64 bit add immediate. */
1628 add64 (sim_cpu
*cpu
, uint32_t aimm
)
1630 unsigned rn
= INSTR (9, 5);
1631 unsigned rd
= INSTR (4, 0);
1633 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1634 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
1635 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + aimm
);
1639 set_flags_for_add32 (sim_cpu
*cpu
, int32_t value1
, int32_t value2
)
1641 int32_t result
= value1
+ value2
;
1642 int64_t sresult
= (int64_t) value1
+ (int64_t) value2
;
1643 uint64_t uresult
= (uint64_t)(uint32_t) value1
1644 + (uint64_t)(uint32_t) value2
;
1650 if (result
& (1 << 31))
1653 if (uresult
!= result
)
1656 if (sresult
!= result
)
1659 aarch64_set_CPSR (cpu
, flags
);
/* Test the sign bit of A against the `signbit' mask that must be in
   scope at each point of use (1U << 31 or 1ULL << 63).  */
#define NEG(a) (((a) & signbit) == signbit)
#define POS(a) (((a) & signbit) == 0)
1666 set_flags_for_add64 (sim_cpu
*cpu
, uint64_t value1
, uint64_t value2
)
1668 uint64_t result
= value1
+ value2
;
1670 uint64_t signbit
= 1ULL << 63;
1678 if ( (NEG (value1
) && NEG (value2
))
1679 || (NEG (value1
) && POS (result
))
1680 || (NEG (value2
) && POS (result
)))
1683 if ( (NEG (value1
) && NEG (value2
) && POS (result
))
1684 || (POS (value1
) && POS (value2
) && NEG (result
)))
1687 aarch64_set_CPSR (cpu
, flags
);
1691 set_flags_for_sub32 (sim_cpu
*cpu
, uint32_t value1
, uint32_t value2
)
1693 uint32_t result
= value1
- value2
;
1695 uint32_t signbit
= 1U << 31;
1703 if ( (NEG (value1
) && POS (value2
))
1704 || (NEG (value1
) && POS (result
))
1705 || (POS (value2
) && POS (result
)))
1708 if ( (NEG (value1
) && POS (value2
) && POS (result
))
1709 || (POS (value1
) && NEG (value2
) && NEG (result
)))
1712 aarch64_set_CPSR (cpu
, flags
);
1716 set_flags_for_sub64 (sim_cpu
*cpu
, uint64_t value1
, uint64_t value2
)
1718 uint64_t result
= value1
- value2
;
1720 uint64_t signbit
= 1ULL << 63;
1728 if ( (NEG (value1
) && POS (value2
))
1729 || (NEG (value1
) && POS (result
))
1730 || (POS (value2
) && POS (result
)))
1733 if ( (NEG (value1
) && POS (value2
) && POS (result
))
1734 || (POS (value1
) && NEG (value2
) && NEG (result
)))
1737 aarch64_set_CPSR (cpu
, flags
);
1741 set_flags_for_binop32 (sim_cpu
*cpu
, uint32_t result
)
1750 if (result
& (1 << 31))
1755 aarch64_set_CPSR (cpu
, flags
);
1759 set_flags_for_binop64 (sim_cpu
*cpu
, uint64_t result
)
1768 if (result
& (1ULL << 63))
1773 aarch64_set_CPSR (cpu
, flags
);
1776 /* 32 bit add immediate set flags. */
1778 adds32 (sim_cpu
*cpu
, uint32_t aimm
)
1780 unsigned rn
= INSTR (9, 5);
1781 unsigned rd
= INSTR (4, 0);
1782 /* TODO : do we need to worry about signs here? */
1783 int32_t value1
= aarch64_get_reg_s32 (cpu
, rn
, SP_OK
);
1785 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1786 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ aimm
);
1787 set_flags_for_add32 (cpu
, value1
, aimm
);
1790 /* 64 bit add immediate set flags. */
1792 adds64 (sim_cpu
*cpu
, uint32_t aimm
)
1794 unsigned rn
= INSTR (9, 5);
1795 unsigned rd
= INSTR (4, 0);
1796 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1797 uint64_t value2
= aimm
;
1799 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1800 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
);
1801 set_flags_for_add64 (cpu
, value1
, value2
);
1804 /* 32 bit sub immediate. */
1806 sub32 (sim_cpu
*cpu
, uint32_t aimm
)
1808 unsigned rn
= INSTR (9, 5);
1809 unsigned rd
= INSTR (4, 0);
1811 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1812 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
1813 aarch64_get_reg_u32 (cpu
, rn
, SP_OK
) - aimm
);
1816 /* 64 bit sub immediate. */
1818 sub64 (sim_cpu
*cpu
, uint32_t aimm
)
1820 unsigned rn
= INSTR (9, 5);
1821 unsigned rd
= INSTR (4, 0);
1823 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1824 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
1825 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) - aimm
);
1828 /* 32 bit sub immediate set flags. */
1830 subs32 (sim_cpu
*cpu
, uint32_t aimm
)
1832 unsigned rn
= INSTR (9, 5);
1833 unsigned rd
= INSTR (4, 0);
1834 uint32_t value1
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1835 uint32_t value2
= aimm
;
1837 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1838 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
- value2
);
1839 set_flags_for_sub32 (cpu
, value1
, value2
);
1842 /* 64 bit sub immediate set flags. */
1844 subs64 (sim_cpu
*cpu
, uint32_t aimm
)
1846 unsigned rn
= INSTR (9, 5);
1847 unsigned rd
= INSTR (4, 0);
1848 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
1849 uint32_t value2
= aimm
;
1851 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1852 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
- value2
);
1853 set_flags_for_sub64 (cpu
, value1
, value2
);
1856 /* Data Processing Register. */
1858 /* First two helpers to perform the shift operations. */
1860 static inline uint32_t
1861 shifted32 (uint32_t value
, Shift shift
, uint32_t count
)
1867 return (value
<< count
);
1869 return (value
>> count
);
1872 int32_t svalue
= value
;
1873 return (svalue
>> count
);
1877 uint32_t top
= value
>> count
;
1878 uint32_t bottom
= value
<< (32 - count
);
1879 return (bottom
| top
);
1884 static inline uint64_t
1885 shifted64 (uint64_t value
, Shift shift
, uint32_t count
)
1891 return (value
<< count
);
1893 return (value
>> count
);
1896 int64_t svalue
= value
;
1897 return (svalue
>> count
);
1901 uint64_t top
= value
>> count
;
1902 uint64_t bottom
= value
<< (64 - count
);
1903 return (bottom
| top
);
1908 /* Arithmetic shifted register.
1909 These allow an optional LSL, ASR or LSR to the second source
1910 register with a count up to the register bit count.
1912 N.B register args may not be SP. */
1914 /* 32 bit ADD shifted register. */
1916 add32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
1918 unsigned rm
= INSTR (20, 16);
1919 unsigned rn
= INSTR (9, 5);
1920 unsigned rd
= INSTR (4, 0);
1922 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1923 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
1924 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
1925 + shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
1929 /* 64 bit ADD shifted register. */
1931 add64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
1933 unsigned rm
= INSTR (20, 16);
1934 unsigned rn
= INSTR (9, 5);
1935 unsigned rd
= INSTR (4, 0);
1937 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1938 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
1939 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
1940 + shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
),
1944 /* 32 bit ADD shifted register setting flags. */
1946 adds32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
1948 unsigned rm
= INSTR (20, 16);
1949 unsigned rn
= INSTR (9, 5);
1950 unsigned rd
= INSTR (4, 0);
1952 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
1953 uint32_t value2
= shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
1956 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1957 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
);
1958 set_flags_for_add32 (cpu
, value1
, value2
);
1961 /* 64 bit ADD shifted register setting flags. */
1963 adds64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
1965 unsigned rm
= INSTR (20, 16);
1966 unsigned rn
= INSTR (9, 5);
1967 unsigned rd
= INSTR (4, 0);
1969 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
1970 uint64_t value2
= shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
),
1973 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1974 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
);
1975 set_flags_for_add64 (cpu
, value1
, value2
);
1978 /* 32 bit SUB shifted register. */
1980 sub32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
1982 unsigned rm
= INSTR (20, 16);
1983 unsigned rn
= INSTR (9, 5);
1984 unsigned rd
= INSTR (4, 0);
1986 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
1987 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
1988 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
1989 - shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
1993 /* 64 bit SUB shifted register. */
1995 sub64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
1997 unsigned rm
= INSTR (20, 16);
1998 unsigned rn
= INSTR (9, 5);
1999 unsigned rd
= INSTR (4, 0);
2001 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2002 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
2003 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
2004 - shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
),
2008 /* 32 bit SUB shifted register setting flags. */
2010 subs32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
2012 unsigned rm
= INSTR (20, 16);
2013 unsigned rn
= INSTR (9, 5);
2014 unsigned rd
= INSTR (4, 0);
2016 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
2017 uint32_t value2
= shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
2020 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2021 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
- value2
);
2022 set_flags_for_sub32 (cpu
, value1
, value2
);
2025 /* 64 bit SUB shifted register setting flags. */
2027 subs64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
2029 unsigned rm
= INSTR (20, 16);
2030 unsigned rn
= INSTR (9, 5);
2031 unsigned rd
= INSTR (4, 0);
2033 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
2034 uint64_t value2
= shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
),
2037 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2038 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
- value2
);
2039 set_flags_for_sub64 (cpu
, value1
, value2
);
2042 /* First a couple more helpers to fetch the
2043 relevant source register element either
2044 sign or zero extended as required by the
2048 extreg32 (sim_cpu
*cpu
, unsigned int lo
, Extension extension
)
2052 case UXTB
: return aarch64_get_reg_u8 (cpu
, lo
, NO_SP
);
2053 case UXTH
: return aarch64_get_reg_u16 (cpu
, lo
, NO_SP
);
2054 case UXTW
: /* Fall through. */
2055 case UXTX
: return aarch64_get_reg_u32 (cpu
, lo
, NO_SP
);
2056 case SXTB
: return aarch64_get_reg_s8 (cpu
, lo
, NO_SP
);
2057 case SXTH
: return aarch64_get_reg_s16 (cpu
, lo
, NO_SP
);
2058 case SXTW
: /* Fall through. */
2059 case SXTX
: /* Fall through. */
2060 default: return aarch64_get_reg_s32 (cpu
, lo
, NO_SP
);
2065 extreg64 (sim_cpu
*cpu
, unsigned int lo
, Extension extension
)
2069 case UXTB
: return aarch64_get_reg_u8 (cpu
, lo
, NO_SP
);
2070 case UXTH
: return aarch64_get_reg_u16 (cpu
, lo
, NO_SP
);
2071 case UXTW
: return aarch64_get_reg_u32 (cpu
, lo
, NO_SP
);
2072 case UXTX
: return aarch64_get_reg_u64 (cpu
, lo
, NO_SP
);
2073 case SXTB
: return aarch64_get_reg_s8 (cpu
, lo
, NO_SP
);
2074 case SXTH
: return aarch64_get_reg_s16 (cpu
, lo
, NO_SP
);
2075 case SXTW
: return aarch64_get_reg_s32 (cpu
, lo
, NO_SP
);
2077 default: return aarch64_get_reg_s64 (cpu
, lo
, NO_SP
);
2081 /* Arithmetic extending register
2082 These allow an optional sign extension of some portion of the
2083 second source register followed by an optional left shift of
2084 between 1 and 4 bits (i.e. a shift of 0-4 bits???)
2086 N.B output (dest) and first input arg (source) may normally be Xn
2087 or SP. However, for flag setting operations dest can only be
2088 Xn. Second input registers are always Xn. */
2090 /* 32 bit ADD extending register. */
2092 add32_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2094 unsigned rm
= INSTR (20, 16);
2095 unsigned rn
= INSTR (9, 5);
2096 unsigned rd
= INSTR (4, 0);
2098 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2099 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
2100 aarch64_get_reg_u32 (cpu
, rn
, SP_OK
)
2101 + (extreg32 (cpu
, rm
, extension
) << shift
));
2104 /* 64 bit ADD extending register.
2105 N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2107 add64_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2109 unsigned rm
= INSTR (20, 16);
2110 unsigned rn
= INSTR (9, 5);
2111 unsigned rd
= INSTR (4, 0);
2113 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2114 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
2115 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
2116 + (extreg64 (cpu
, rm
, extension
) << shift
));
2119 /* 32 bit ADD extending register setting flags. */
2121 adds32_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2123 unsigned rm
= INSTR (20, 16);
2124 unsigned rn
= INSTR (9, 5);
2125 unsigned rd
= INSTR (4, 0);
2127 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, SP_OK
);
2128 uint32_t value2
= extreg32 (cpu
, rm
, extension
) << shift
;
2130 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2131 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
);
2132 set_flags_for_add32 (cpu
, value1
, value2
);
2135 /* 64 bit ADD extending register setting flags */
2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2138 adds64_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2140 unsigned rm
= INSTR (20, 16);
2141 unsigned rn
= INSTR (9, 5);
2142 unsigned rd
= INSTR (4, 0);
2144 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
2145 uint64_t value2
= extreg64 (cpu
, rm
, extension
) << shift
;
2147 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2148 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
);
2149 set_flags_for_add64 (cpu
, value1
, value2
);
2152 /* 32 bit SUB extending register. */
2154 sub32_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2156 unsigned rm
= INSTR (20, 16);
2157 unsigned rn
= INSTR (9, 5);
2158 unsigned rd
= INSTR (4, 0);
2160 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2161 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
2162 aarch64_get_reg_u32 (cpu
, rn
, SP_OK
)
2163 - (extreg32 (cpu
, rm
, extension
) << shift
));
2166 /* 64 bit SUB extending register. */
2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2169 sub64_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2171 unsigned rm
= INSTR (20, 16);
2172 unsigned rn
= INSTR (9, 5);
2173 unsigned rd
= INSTR (4, 0);
2175 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2176 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
2177 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
)
2178 - (extreg64 (cpu
, rm
, extension
) << shift
));
2181 /* 32 bit SUB extending register setting flags. */
2183 subs32_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2185 unsigned rm
= INSTR (20, 16);
2186 unsigned rn
= INSTR (9, 5);
2187 unsigned rd
= INSTR (4, 0);
2189 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, SP_OK
);
2190 uint32_t value2
= extreg32 (cpu
, rm
, extension
) << shift
;
2192 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2193 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
- value2
);
2194 set_flags_for_sub32 (cpu
, value1
, value2
);
2197 /* 64 bit SUB extending register setting flags */
2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2200 subs64_ext (sim_cpu
*cpu
, Extension extension
, uint32_t shift
)
2202 unsigned rm
= INSTR (20, 16);
2203 unsigned rn
= INSTR (9, 5);
2204 unsigned rd
= INSTR (4, 0);
2206 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
2207 uint64_t value2
= extreg64 (cpu
, rm
, extension
) << shift
;
2209 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2210 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
- value2
);
2211 set_flags_for_sub64 (cpu
, value1
, value2
);
2215 dexAddSubtractImmediate (sim_cpu
*cpu
)
2217 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2218 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2219 instr[29] = set : 0 ==> no flags, 1 ==> set flags
2220 instr[28,24] = 10001
2221 instr[23,22] = shift : 00 == LSL#0, 01 = LSL#12 1x = UNALLOC
2222 instr[21,10] = uimm12
2226 /* N.B. the shift is applied at decode before calling the add/sub routine. */
2227 uint32_t shift
= INSTR (23, 22);
2228 uint32_t imm
= INSTR (21, 10);
2229 uint32_t dispatch
= INSTR (31, 29);
2231 NYI_assert (28, 24, 0x11);
2241 case 0: add32 (cpu
, imm
); break;
2242 case 1: adds32 (cpu
, imm
); break;
2243 case 2: sub32 (cpu
, imm
); break;
2244 case 3: subs32 (cpu
, imm
); break;
2245 case 4: add64 (cpu
, imm
); break;
2246 case 5: adds64 (cpu
, imm
); break;
2247 case 6: sub64 (cpu
, imm
); break;
2248 case 7: subs64 (cpu
, imm
); break;
2253 dexAddSubtractShiftedRegister (sim_cpu
*cpu
)
2255 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2256 instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2257 instr[28,24] = 01011
2258 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2261 instr[15,10] = count : must be 0xxxxx for 32 bit
2265 uint32_t size
= INSTR (31, 31);
2266 uint32_t count
= INSTR (15, 10);
2267 Shift shiftType
= INSTR (23, 22);
2269 NYI_assert (28, 24, 0x0B);
2270 NYI_assert (21, 21, 0);
2272 /* Shift encoded as ROR is unallocated. */
2273 if (shiftType
== ROR
)
2276 /* 32 bit operations must have count[5] = 0
2277 or else we have an UNALLOC. */
2278 if (size
== 0 && uimm (count
, 5, 5))
2281 /* Dispatch on size:op i.e instr [31,29]. */
2282 switch (INSTR (31, 29))
2284 case 0: add32_shift (cpu
, shiftType
, count
); break;
2285 case 1: adds32_shift (cpu
, shiftType
, count
); break;
2286 case 2: sub32_shift (cpu
, shiftType
, count
); break;
2287 case 3: subs32_shift (cpu
, shiftType
, count
); break;
2288 case 4: add64_shift (cpu
, shiftType
, count
); break;
2289 case 5: adds64_shift (cpu
, shiftType
, count
); break;
2290 case 6: sub64_shift (cpu
, shiftType
, count
); break;
2291 case 7: subs64_shift (cpu
, shiftType
, count
); break;
2296 dexAddSubtractExtendedRegister (sim_cpu
*cpu
)
2298 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2299 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2300 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2301 instr[28,24] = 01011
2302 instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2305 instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2306 000 ==> LSL|UXTW, 001 ==> UXTZ,
2307 000 ==> SXTB, 001 ==> SXTH,
2308 000 ==> SXTW, 001 ==> SXTX,
2309 instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2313 Extension extensionType
= INSTR (15, 13);
2314 uint32_t shift
= INSTR (12, 10);
2316 NYI_assert (28, 24, 0x0B);
2317 NYI_assert (21, 21, 1);
2319 /* Shift may not exceed 4. */
2323 /* Dispatch on size:op:set?. */
2324 switch (INSTR (31, 29))
2326 case 0: add32_ext (cpu
, extensionType
, shift
); break;
2327 case 1: adds32_ext (cpu
, extensionType
, shift
); break;
2328 case 2: sub32_ext (cpu
, extensionType
, shift
); break;
2329 case 3: subs32_ext (cpu
, extensionType
, shift
); break;
2330 case 4: add64_ext (cpu
, extensionType
, shift
); break;
2331 case 5: adds64_ext (cpu
, extensionType
, shift
); break;
2332 case 6: sub64_ext (cpu
, extensionType
, shift
); break;
2333 case 7: subs64_ext (cpu
, extensionType
, shift
); break;
2337 /* Conditional data processing
2338 Condition register is implicit 3rd source. */
2340 /* 32 bit add with carry. */
2341 /* N.B register args may not be SP. */
2344 adc32 (sim_cpu
*cpu
)
2346 unsigned rm
= INSTR (20, 16);
2347 unsigned rn
= INSTR (9, 5);
2348 unsigned rd
= INSTR (4, 0);
2350 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2351 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
2352 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
2353 + aarch64_get_reg_u32 (cpu
, rm
, NO_SP
)
2357 /* 64 bit add with carry */
2359 adc64 (sim_cpu
*cpu
)
2361 unsigned rm
= INSTR (20, 16);
2362 unsigned rn
= INSTR (9, 5);
2363 unsigned rd
= INSTR (4, 0);
2365 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2366 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
2367 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
2368 + aarch64_get_reg_u64 (cpu
, rm
, NO_SP
)
2372 /* 32 bit add with carry setting flags. */
2374 adcs32 (sim_cpu
*cpu
)
2376 unsigned rm
= INSTR (20, 16);
2377 unsigned rn
= INSTR (9, 5);
2378 unsigned rd
= INSTR (4, 0);
2380 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
2381 uint32_t value2
= aarch64_get_reg_u32 (cpu
, rm
, NO_SP
);
2382 uint32_t carry
= IS_SET (C
);
2384 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2385 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
+ carry
);
2386 set_flags_for_add32 (cpu
, value1
, value2
+ carry
);
2389 /* 64 bit add with carry setting flags. */
2391 adcs64 (sim_cpu
*cpu
)
2393 unsigned rm
= INSTR (20, 16);
2394 unsigned rn
= INSTR (9, 5);
2395 unsigned rd
= INSTR (4, 0);
2397 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
2398 uint64_t value2
= aarch64_get_reg_u64 (cpu
, rm
, NO_SP
);
2399 uint64_t carry
= IS_SET (C
);
2401 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2402 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
+ value2
+ carry
);
2403 set_flags_for_add64 (cpu
, value1
, value2
+ carry
);
2406 /* 32 bit sub with carry. */
2408 sbc32 (sim_cpu
*cpu
)
2410 unsigned rm
= INSTR (20, 16);
2411 unsigned rn
= INSTR (9, 5); /* ngc iff rn == 31. */
2412 unsigned rd
= INSTR (4, 0);
2414 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2415 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
2416 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
2417 - aarch64_get_reg_u32 (cpu
, rm
, NO_SP
)
2421 /* 64 bit sub with carry */
2423 sbc64 (sim_cpu
*cpu
)
2425 unsigned rm
= INSTR (20, 16);
2426 unsigned rn
= INSTR (9, 5);
2427 unsigned rd
= INSTR (4, 0);
2429 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2430 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
2431 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
2432 - aarch64_get_reg_u64 (cpu
, rm
, NO_SP
)
2436 /* 32 bit sub with carry setting flags */
2438 sbcs32 (sim_cpu
*cpu
)
2440 unsigned rm
= INSTR (20, 16);
2441 unsigned rn
= INSTR (9, 5);
2442 unsigned rd
= INSTR (4, 0);
2444 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
2445 uint32_t value2
= aarch64_get_reg_u32 (cpu
, rm
, NO_SP
);
2446 uint32_t carry
= IS_SET (C
);
2447 uint32_t result
= value1
- value2
+ 1 - carry
;
2449 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2450 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
2451 set_flags_for_sub32 (cpu
, value1
, value2
+ 1 - carry
);
2454 /* 64 bit sub with carry setting flags */
2456 sbcs64 (sim_cpu
*cpu
)
2458 unsigned rm
= INSTR (20, 16);
2459 unsigned rn
= INSTR (9, 5);
2460 unsigned rd
= INSTR (4, 0);
2462 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
2463 uint64_t value2
= aarch64_get_reg_u64 (cpu
, rm
, NO_SP
);
2464 uint64_t carry
= IS_SET (C
);
2465 uint64_t result
= value1
- value2
+ 1 - carry
;
2467 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2468 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
2469 set_flags_for_sub64 (cpu
, value1
, value2
+ 1 - carry
);
2473 dexAddSubtractWithCarry (sim_cpu
*cpu
)
2475 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2476 instr[30] = op : 0 ==> ADC, 1 ==> SBC
2477 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2478 instr[28,21] = 1 1010 000
2480 instr[15,10] = op2 : 00000 ==> ok, ow ==> UNALLOC
2484 uint32_t op2
= INSTR (15, 10);
2486 NYI_assert (28, 21, 0xD0);
2491 /* Dispatch on size:op:set?. */
2492 switch (INSTR (31, 29))
2494 case 0: adc32 (cpu
); break;
2495 case 1: adcs32 (cpu
); break;
2496 case 2: sbc32 (cpu
); break;
2497 case 3: sbcs32 (cpu
); break;
2498 case 4: adc64 (cpu
); break;
2499 case 5: adcs64 (cpu
); break;
2500 case 6: sbc64 (cpu
); break;
2501 case 7: sbcs64 (cpu
); break;
2506 testConditionCode (sim_cpu
*cpu
, CondCode cc
)
2508 /* This should be reduceable to branchless logic
2509 by some careful testing of bits in CC followed
2510 by the requisite masking and combining of bits
2511 from the flag register.
2513 For now we do it with a switch. */
2518 case EQ
: res
= IS_SET (Z
); break;
2519 case NE
: res
= IS_CLEAR (Z
); break;
2520 case CS
: res
= IS_SET (C
); break;
2521 case CC
: res
= IS_CLEAR (C
); break;
2522 case MI
: res
= IS_SET (N
); break;
2523 case PL
: res
= IS_CLEAR (N
); break;
2524 case VS
: res
= IS_SET (V
); break;
2525 case VC
: res
= IS_CLEAR (V
); break;
2526 case HI
: res
= IS_SET (C
) && IS_CLEAR (Z
); break;
2527 case LS
: res
= IS_CLEAR (C
) || IS_SET (Z
); break;
2528 case GE
: res
= IS_SET (N
) == IS_SET (V
); break;
2529 case LT
: res
= IS_SET (N
) != IS_SET (V
); break;
2530 case GT
: res
= IS_CLEAR (Z
) && (IS_SET (N
) == IS_SET (V
)); break;
2531 case LE
: res
= IS_SET (Z
) || (IS_SET (N
) != IS_SET (V
)); break;
2542 CondCompare (sim_cpu
*cpu
) /* aka: ccmp and ccmn */
2544 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2545 instr[30] = compare with positive (1) or negative value (0)
2546 instr[29,21] = 1 1101 0010
2547 instr[20,16] = Rm or const
2549 instr[11] = compare reg (0) or const (1)
2553 instr[3,0] = value for CPSR bits if the comparison does not take place. */
2558 NYI_assert (29, 21, 0x1d2);
2559 NYI_assert (10, 10, 0);
2560 NYI_assert (4, 4, 0);
2562 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2563 if (! testConditionCode (cpu
, INSTR (15, 12)))
2565 aarch64_set_CPSR (cpu
, INSTR (3, 0));
2569 negate
= INSTR (30, 30) ? 1 : -1;
2570 rm
= INSTR (20, 16);
2576 set_flags_for_sub64 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
),
2577 negate
* (uint64_t) rm
);
2579 set_flags_for_sub64 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
),
2580 negate
* aarch64_get_reg_u64 (cpu
, rm
, SP_OK
));
2585 set_flags_for_sub32 (cpu
, aarch64_get_reg_u32 (cpu
, rn
, SP_OK
),
2588 set_flags_for_sub32 (cpu
, aarch64_get_reg_u32 (cpu
, rn
, SP_OK
),
2589 negate
* aarch64_get_reg_u32 (cpu
, rm
, SP_OK
));
2594 do_vec_MOV_whole_vector (sim_cpu
*cpu
)
2596 /* MOV Vd.T, Vs.T (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2599 instr[30] = half(0)/full(1)
2600 instr[29,21] = 001110101
2602 instr[15,10] = 000111
2606 unsigned vs
= INSTR (9, 5);
2607 unsigned vd
= INSTR (4, 0);
2609 NYI_assert (29, 21, 0x075);
2610 NYI_assert (15, 10, 0x07);
2612 if (INSTR (20, 16) != vs
)
2615 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2617 aarch64_set_vec_u64 (cpu
, vd
, 1, aarch64_get_vec_u64 (cpu
, vs
, 1));
2619 aarch64_set_vec_u64 (cpu
, vd
, 0, aarch64_get_vec_u64 (cpu
, vs
, 0));
2623 do_vec_MOV_into_scalar (sim_cpu
*cpu
)
2626 instr[30] = word(0)/long(1)
2627 instr[29,21] = 00 1110 000
2628 instr[20,18] = element size and index
2629 instr[17,10] = 00 0011 11
2630 instr[9,5] = V source
2631 instr[4,0] = R dest */
2633 unsigned vs
= INSTR (9, 5);
2634 unsigned rd
= INSTR (4, 0);
2636 NYI_assert (29, 21, 0x070);
2637 NYI_assert (17, 10, 0x0F);
2639 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2640 switch (INSTR (20, 18))
2643 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, aarch64_get_vec_u64 (cpu
, vs
, 0));
2647 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, aarch64_get_vec_u64 (cpu
, vs
, 1));
2654 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, aarch64_get_vec_u32
2655 (cpu
, vs
, INSTR (20, 19)));
2664 do_vec_INS (sim_cpu
*cpu
)
2666 /* instr[31,21] = 01001110000
2667 instr[20,16] = element size and index
2668 instr[15,10] = 000111
2669 instr[9,5] = W source
2670 instr[4,0] = V dest */
2673 unsigned rs
= INSTR (9, 5);
2674 unsigned vd
= INSTR (4, 0);
2676 NYI_assert (31, 21, 0x270);
2677 NYI_assert (15, 10, 0x07);
2679 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2682 index
= INSTR (20, 17);
2683 aarch64_set_vec_u8 (cpu
, vd
, index
,
2684 aarch64_get_reg_u8 (cpu
, rs
, NO_SP
));
2686 else if (INSTR (17, 17))
2688 index
= INSTR (20, 18);
2689 aarch64_set_vec_u16 (cpu
, vd
, index
,
2690 aarch64_get_reg_u16 (cpu
, rs
, NO_SP
));
2692 else if (INSTR (18, 18))
2694 index
= INSTR (20, 19);
2695 aarch64_set_vec_u32 (cpu
, vd
, index
,
2696 aarch64_get_reg_u32 (cpu
, rs
, NO_SP
));
2698 else if (INSTR (19, 19))
2700 index
= INSTR (20, 20);
2701 aarch64_set_vec_u64 (cpu
, vd
, index
,
2702 aarch64_get_reg_u64 (cpu
, rs
, NO_SP
));
2709 do_vec_DUP_vector_into_vector (sim_cpu
*cpu
)
2712 instr[30] = half(0)/full(1)
2713 instr[29,21] = 00 1110 000
2714 instr[20,16] = element size and index
2715 instr[15,10] = 0000 01
2716 instr[9,5] = V source
2717 instr[4,0] = V dest. */
2719 unsigned full
= INSTR (30, 30);
2720 unsigned vs
= INSTR (9, 5);
2721 unsigned vd
= INSTR (4, 0);
2724 NYI_assert (29, 21, 0x070);
2725 NYI_assert (15, 10, 0x01);
2727 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2730 index
= INSTR (20, 17);
2732 for (i
= 0; i
< (full
? 16 : 8); i
++)
2733 aarch64_set_vec_u8 (cpu
, vd
, i
, aarch64_get_vec_u8 (cpu
, vs
, index
));
2735 else if (INSTR (17, 17))
2737 index
= INSTR (20, 18);
2739 for (i
= 0; i
< (full
? 8 : 4); i
++)
2740 aarch64_set_vec_u16 (cpu
, vd
, i
, aarch64_get_vec_u16 (cpu
, vs
, index
));
2742 else if (INSTR (18, 18))
2744 index
= INSTR (20, 19);
2746 for (i
= 0; i
< (full
? 4 : 2); i
++)
2747 aarch64_set_vec_u32 (cpu
, vd
, i
, aarch64_get_vec_u32 (cpu
, vs
, index
));
2751 if (INSTR (19, 19) == 0)
2757 index
= INSTR (20, 20);
2759 for (i
= 0; i
< 2; i
++)
2760 aarch64_set_vec_u64 (cpu
, vd
, i
, aarch64_get_vec_u64 (cpu
, vs
, index
));
2765 do_vec_TBL (sim_cpu
*cpu
)
2768 instr[30] = half(0)/full(1)
2769 instr[29,21] = 00 1110 000
2772 instr[14,13] = vec length
2774 instr[9,5] = V start
2775 instr[4,0] = V dest */
2777 int full
= INSTR (30, 30);
2778 int len
= INSTR (14, 13) + 1;
2779 unsigned vm
= INSTR (20, 16);
2780 unsigned vn
= INSTR (9, 5);
2781 unsigned vd
= INSTR (4, 0);
2784 NYI_assert (29, 21, 0x070);
2785 NYI_assert (12, 10, 0);
2787 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2788 for (i
= 0; i
< (full
? 16 : 8); i
++)
2790 unsigned int selector
= aarch64_get_vec_u8 (cpu
, vm
, i
);
2794 val
= aarch64_get_vec_u8 (cpu
, vn
, selector
);
2795 else if (selector
< 32)
2796 val
= len
< 2 ? 0 : aarch64_get_vec_u8 (cpu
, vn
+ 1, selector
- 16);
2797 else if (selector
< 48)
2798 val
= len
< 3 ? 0 : aarch64_get_vec_u8 (cpu
, vn
+ 2, selector
- 32);
2799 else if (selector
< 64)
2800 val
= len
< 4 ? 0 : aarch64_get_vec_u8 (cpu
, vn
+ 3, selector
- 48);
2804 aarch64_set_vec_u8 (cpu
, vd
, i
, val
);
2809 do_vec_TRN (sim_cpu
*cpu
)
2812 instr[30] = half(0)/full(1)
2813 instr[29,24] = 00 1110
2818 instr[14] = TRN1 (0) / TRN2 (1)
2820 instr[9,5] = V source
2821 instr[4,0] = V dest. */
2823 int full
= INSTR (30, 30);
2824 int second
= INSTR (14, 14);
2825 unsigned vm
= INSTR (20, 16);
2826 unsigned vn
= INSTR (9, 5);
2827 unsigned vd
= INSTR (4, 0);
2830 NYI_assert (29, 24, 0x0E);
2831 NYI_assert (13, 10, 0xA);
2833 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2834 switch (INSTR (23, 22))
2837 for (i
= 0; i
< (full
? 8 : 4); i
++)
2841 aarch64_get_vec_u8 (cpu
, second
? vm
: vn
, i
* 2));
2843 (cpu
, vd
, 1 * 2 + 1,
2844 aarch64_get_vec_u8 (cpu
, second
? vn
: vm
, i
* 2 + 1));
2849 for (i
= 0; i
< (full
? 4 : 2); i
++)
2853 aarch64_get_vec_u16 (cpu
, second
? vm
: vn
, i
* 2));
2855 (cpu
, vd
, 1 * 2 + 1,
2856 aarch64_get_vec_u16 (cpu
, second
? vn
: vm
, i
* 2 + 1));
2862 (cpu
, vd
, 0, aarch64_get_vec_u32 (cpu
, second
? vm
: vn
, 0));
2864 (cpu
, vd
, 1, aarch64_get_vec_u32 (cpu
, second
? vn
: vm
, 1));
2866 (cpu
, vd
, 2, aarch64_get_vec_u32 (cpu
, second
? vm
: vn
, 2));
2868 (cpu
, vd
, 3, aarch64_get_vec_u32 (cpu
, second
? vn
: vm
, 3));
2875 aarch64_set_vec_u64 (cpu
, vd
, 0,
2876 aarch64_get_vec_u64 (cpu
, second
? vm
: vn
, 0));
2877 aarch64_set_vec_u64 (cpu
, vd
, 1,
2878 aarch64_get_vec_u64 (cpu
, second
? vn
: vm
, 1));
2884 do_vec_DUP_scalar_into_vector (sim_cpu
*cpu
)
2887 instr[30] = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2888 [must be 1 for 64-bit xfer]
2889 instr[29,20] = 00 1110 0000
2890 instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2891 0100=> 32-bits. 1000=>64-bits
2892 instr[15,10] = 0000 11
2893 instr[9,5] = W source
2894 instr[4,0] = V dest. */
2897 unsigned Vd
= INSTR (4, 0);
2898 unsigned Rs
= INSTR (9, 5);
2899 int both
= INSTR (30, 30);
2901 NYI_assert (29, 20, 0x0E0);
2902 NYI_assert (15, 10, 0x03);
2904 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2905 switch (INSTR (19, 16))
2908 for (i
= 0; i
< (both
? 16 : 8); i
++)
2909 aarch64_set_vec_u8 (cpu
, Vd
, i
, aarch64_get_reg_u8 (cpu
, Rs
, NO_SP
));
2913 for (i
= 0; i
< (both
? 8 : 4); i
++)
2914 aarch64_set_vec_u16 (cpu
, Vd
, i
, aarch64_get_reg_u16 (cpu
, Rs
, NO_SP
));
2918 for (i
= 0; i
< (both
? 4 : 2); i
++)
2919 aarch64_set_vec_u32 (cpu
, Vd
, i
, aarch64_get_reg_u32 (cpu
, Rs
, NO_SP
));
2925 aarch64_set_vec_u64 (cpu
, Vd
, 0, aarch64_get_reg_u64 (cpu
, Rs
, NO_SP
));
2926 aarch64_set_vec_u64 (cpu
, Vd
, 1, aarch64_get_reg_u64 (cpu
, Rs
, NO_SP
));
2935 do_vec_UZP (sim_cpu
*cpu
)
2938 instr[30] = half(0)/full(1)
2939 instr[29,24] = 00 1110
2940 instr[23,22] = size: byte(00), half(01), word (10), long (11)
2944 instr[14] = lower (0) / upper (1)
2949 int full
= INSTR (30, 30);
2950 int upper
= INSTR (14, 14);
2952 unsigned vm
= INSTR (20, 16);
2953 unsigned vn
= INSTR (9, 5);
2954 unsigned vd
= INSTR (4, 0);
2956 uint64_t val_m1
= aarch64_get_vec_u64 (cpu
, vm
, 0);
2957 uint64_t val_m2
= aarch64_get_vec_u64 (cpu
, vm
, 1);
2958 uint64_t val_n1
= aarch64_get_vec_u64 (cpu
, vn
, 0);
2959 uint64_t val_n2
= aarch64_get_vec_u64 (cpu
, vn
, 1);
2964 uint64_t input2
= full
? val_n2
: val_m1
;
2966 NYI_assert (29, 24, 0x0E);
2967 NYI_assert (21, 21, 0);
2968 NYI_assert (15, 15, 0);
2969 NYI_assert (13, 10, 6);
2971 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
2972 switch (INSTR (23, 22))
2975 val1
= (val_n1
>> (upper
* 8)) & 0xFFULL
;
2976 val1
|= (val_n1
>> ((upper
* 8) + 8)) & 0xFF00ULL
;
2977 val1
|= (val_n1
>> ((upper
* 8) + 16)) & 0xFF0000ULL
;
2978 val1
|= (val_n1
>> ((upper
* 8) + 24)) & 0xFF000000ULL
;
2980 val1
|= (input2
<< (32 - (upper
* 8))) & 0xFF00000000ULL
;
2981 val1
|= (input2
<< (24 - (upper
* 8))) & 0xFF0000000000ULL
;
2982 val1
|= (input2
<< (16 - (upper
* 8))) & 0xFF000000000000ULL
;
2983 val1
|= (input2
<< (8 - (upper
* 8))) & 0xFF00000000000000ULL
;
2987 val2
= (val_m1
>> (upper
* 8)) & 0xFFULL
;
2988 val2
|= (val_m1
>> ((upper
* 8) + 8)) & 0xFF00ULL
;
2989 val2
|= (val_m1
>> ((upper
* 8) + 16)) & 0xFF0000ULL
;
2990 val2
|= (val_m1
>> ((upper
* 8) + 24)) & 0xFF000000ULL
;
2992 val2
|= (val_m2
<< (32 - (upper
* 8))) & 0xFF00000000ULL
;
2993 val2
|= (val_m2
<< (24 - (upper
* 8))) & 0xFF0000000000ULL
;
2994 val2
|= (val_m2
<< (16 - (upper
* 8))) & 0xFF000000000000ULL
;
2995 val2
|= (val_m2
<< (8 - (upper
* 8))) & 0xFF00000000000000ULL
;
3000 val1
= (val_n1
>> (upper
* 16)) & 0xFFFFULL
;
3001 val1
|= (val_n1
>> ((upper
* 16) + 16)) & 0xFFFF0000ULL
;
3003 val1
|= (input2
<< (32 - (upper
* 16))) & 0xFFFF00000000ULL
;;
3004 val1
|= (input2
<< (16 - (upper
* 16))) & 0xFFFF000000000000ULL
;
3008 val2
= (val_m1
>> (upper
* 16)) & 0xFFFFULL
;
3009 val2
|= (val_m1
>> ((upper
* 16) + 16)) & 0xFFFF0000ULL
;
3011 val2
|= (val_m2
<< (32 - (upper
* 16))) & 0xFFFF00000000ULL
;
3012 val2
|= (val_m2
<< (16 - (upper
* 16))) & 0xFFFF000000000000ULL
;
3017 val1
= (val_n1
>> (upper
* 32)) & 0xFFFFFFFF;
3018 val1
|= (input2
<< (32 - (upper
* 32))) & 0xFFFFFFFF00000000ULL
;
3022 val2
= (val_m1
>> (upper
* 32)) & 0xFFFFFFFF;
3023 val2
|= (val_m2
<< (32 - (upper
* 32))) & 0xFFFFFFFF00000000ULL
;
3031 val1
= upper
? val_n2
: val_n1
;
3032 val2
= upper
? val_m2
: val_m1
;
3036 aarch64_set_vec_u64 (cpu
, vd
, 0, val1
);
3038 aarch64_set_vec_u64 (cpu
, vd
, 1, val2
);
3042 do_vec_ZIP (sim_cpu
*cpu
)
3045 instr[30] = half(0)/full(1)
3046 instr[29,24] = 00 1110
3047 instr[23,22] = size: byte(00), hald(01), word (10), long (11)
3051 instr[14] = lower (0) / upper (1)
3056 int full
= INSTR (30, 30);
3057 int upper
= INSTR (14, 14);
3059 unsigned vm
= INSTR (20, 16);
3060 unsigned vn
= INSTR (9, 5);
3061 unsigned vd
= INSTR (4, 0);
3063 uint64_t val_m1
= aarch64_get_vec_u64 (cpu
, vm
, 0);
3064 uint64_t val_m2
= aarch64_get_vec_u64 (cpu
, vm
, 1);
3065 uint64_t val_n1
= aarch64_get_vec_u64 (cpu
, vn
, 0);
3066 uint64_t val_n2
= aarch64_get_vec_u64 (cpu
, vn
, 1);
3071 uint64_t input1
= upper
? val_n1
: val_m1
;
3072 uint64_t input2
= upper
? val_n2
: val_m2
;
3074 NYI_assert (29, 24, 0x0E);
3075 NYI_assert (21, 21, 0);
3076 NYI_assert (15, 15, 0);
3077 NYI_assert (13, 10, 0xE);
3079 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3080 switch (INSTR (23, 23))
3084 ((input1
<< 0) & (0xFF << 0))
3085 | ((input2
<< 8) & (0xFF << 8))
3086 | ((input1
<< 8) & (0xFF << 16))
3087 | ((input2
<< 16) & (0xFF << 24))
3088 | ((input1
<< 16) & (0xFFULL
<< 32))
3089 | ((input2
<< 24) & (0xFFULL
<< 40))
3090 | ((input1
<< 24) & (0xFFULL
<< 48))
3091 | ((input2
<< 32) & (0xFFULL
<< 56));
3094 ((input1
>> 32) & (0xFF << 0))
3095 | ((input2
>> 24) & (0xFF << 8))
3096 | ((input1
>> 24) & (0xFF << 16))
3097 | ((input2
>> 16) & (0xFF << 24))
3098 | ((input1
>> 16) & (0xFFULL
<< 32))
3099 | ((input2
>> 8) & (0xFFULL
<< 40))
3100 | ((input1
>> 8) & (0xFFULL
<< 48))
3101 | ((input2
>> 0) & (0xFFULL
<< 56));
3106 ((input1
<< 0) & (0xFFFF << 0))
3107 | ((input2
<< 16) & (0xFFFF << 16))
3108 | ((input1
<< 16) & (0xFFFFULL
<< 32))
3109 | ((input2
<< 32) & (0xFFFFULL
<< 48));
3112 ((input1
>> 32) & (0xFFFF << 0))
3113 | ((input2
>> 16) & (0xFFFF << 16))
3114 | ((input1
>> 16) & (0xFFFFULL
<< 32))
3115 | ((input2
>> 0) & (0xFFFFULL
<< 48));
3119 val1
= (input1
& 0xFFFFFFFFULL
) | (input2
<< 32);
3120 val2
= (input2
& 0xFFFFFFFFULL
) | (input1
<< 32);
3129 aarch64_set_vec_u64 (cpu
, vd
, 0, val1
);
3131 aarch64_set_vec_u64 (cpu
, vd
, 1, val2
);
3134 /* Floating point immediates are encoded in 8 bits.
3135 fpimm[7] = sign bit.
3136 fpimm[6:4] = signed exponent.
3137 fpimm[3:0] = fraction (assuming leading 1).
3138 i.e. F = s * 1.f * 2^(e - b). */
3141 fp_immediate_for_encoding_32 (uint32_t imm8
)
3144 uint32_t s
, e
, f
, i
;
3146 s
= (imm8
>> 7) & 0x1;
3147 e
= (imm8
>> 4) & 0x7;
3150 /* The fp value is s * n/16 * 2r where n is 16+e. */
3151 u
= (16.0 + f
) / 16.0;
3153 /* N.B. exponent is signed. */
3158 for (i
= 0; i
<= epos
; i
++)
3165 for (i
= 0; i
< eneg
; i
++)
3176 fp_immediate_for_encoding_64 (uint32_t imm8
)
3179 uint32_t s
, e
, f
, i
;
3181 s
= (imm8
>> 7) & 0x1;
3182 e
= (imm8
>> 4) & 0x7;
3185 /* The fp value is s * n/16 * 2r where n is 16+e. */
3186 u
= (16.0 + f
) / 16.0;
3188 /* N.B. exponent is signed. */
3193 for (i
= 0; i
<= epos
; i
++)
3200 for (i
= 0; i
< eneg
; i
++)
3211 do_vec_MOV_immediate (sim_cpu
*cpu
)
3214 instr[30] = full/half selector
3215 instr[29,19] = 00111100000
3216 instr[18,16] = high 3 bits of uimm8
3217 instr[15,12] = size & shift:
3219 0010 => 32-bit + LSL#8
3220 0100 => 32-bit + LSL#16
3221 0110 => 32-bit + LSL#24
3222 1010 => 16-bit + LSL#8
3224 1101 => 32-bit + MSL#16
3225 1100 => 32-bit + MSL#8
3229 instr[9,5] = low 5-bits of uimm8
3232 int full
= INSTR (30, 30);
3233 unsigned vd
= INSTR (4, 0);
3234 unsigned val
= (INSTR (18, 16) << 5) | INSTR (9, 5);
3237 NYI_assert (29, 19, 0x1E0);
3238 NYI_assert (11, 10, 1);
3240 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3241 switch (INSTR (15, 12))
3243 case 0x0: /* 32-bit, no shift. */
3244 case 0x2: /* 32-bit, shift by 8. */
3245 case 0x4: /* 32-bit, shift by 16. */
3246 case 0x6: /* 32-bit, shift by 24. */
3247 val
<<= (8 * INSTR (14, 13));
3248 for (i
= 0; i
< (full
? 4 : 2); i
++)
3249 aarch64_set_vec_u32 (cpu
, vd
, i
, val
);
3252 case 0xa: /* 16-bit, shift by 8. */
3255 case 0x8: /* 16-bit, no shift. */
3256 for (i
= 0; i
< (full
? 8 : 4); i
++)
3257 aarch64_set_vec_u16 (cpu
, vd
, i
, val
);
3260 case 0xd: /* 32-bit, mask shift by 16. */
3264 case 0xc: /* 32-bit, mask shift by 8. */
3267 for (i
= 0; i
< (full
? 4 : 2); i
++)
3268 aarch64_set_vec_u32 (cpu
, vd
, i
, val
);
3271 case 0xe: /* 8-bit, no shift. */
3272 for (i
= 0; i
< (full
? 16 : 8); i
++)
3273 aarch64_set_vec_u8 (cpu
, vd
, i
, val
);
3276 case 0xf: /* FMOV Vs.{2|4}S, #fpimm. */
3278 float u
= fp_immediate_for_encoding_32 (val
);
3279 for (i
= 0; i
< (full
? 4 : 2); i
++)
3280 aarch64_set_vec_float (cpu
, vd
, i
, u
);
3290 do_vec_MVNI (sim_cpu
*cpu
)
3293 instr[30] = full/half selector
3294 instr[29,19] = 10111100000
3295 instr[18,16] = high 3 bits of uimm8
3296 instr[15,12] = selector
3298 instr[9,5] = low 5-bits of uimm8
3301 int full
= INSTR (30, 30);
3302 unsigned vd
= INSTR (4, 0);
3303 unsigned val
= (INSTR (18, 16) << 5) | INSTR (9, 5);
3306 NYI_assert (29, 19, 0x5E0);
3307 NYI_assert (11, 10, 1);
3309 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3310 switch (INSTR (15, 12))
3312 case 0x0: /* 32-bit, no shift. */
3313 case 0x2: /* 32-bit, shift by 8. */
3314 case 0x4: /* 32-bit, shift by 16. */
3315 case 0x6: /* 32-bit, shift by 24. */
3316 val
<<= (8 * INSTR (14, 13));
3318 for (i
= 0; i
< (full
? 4 : 2); i
++)
3319 aarch64_set_vec_u32 (cpu
, vd
, i
, val
);
3322 case 0xa: /* 16-bit, 8 bit shift. */
3324 case 0x8: /* 16-bit, no shift. */
3326 for (i
= 0; i
< (full
? 8 : 4); i
++)
3327 aarch64_set_vec_u16 (cpu
, vd
, i
, val
);
3330 case 0xd: /* 32-bit, mask shift by 16. */
3333 case 0xc: /* 32-bit, mask shift by 8. */
3337 for (i
= 0; i
< (full
? 4 : 2); i
++)
3338 aarch64_set_vec_u32 (cpu
, vd
, i
, val
);
3341 case 0xE: /* MOVI Dn, #mask64 */
3345 for (i
= 0; i
< 8; i
++)
3347 mask
|= (0xFFUL
<< (i
* 8));
3348 aarch64_set_vec_u64 (cpu
, vd
, 0, mask
);
3349 aarch64_set_vec_u64 (cpu
, vd
, 1, mask
);
3353 case 0xf: /* FMOV Vd.2D, #fpimm. */
3355 double u
= fp_immediate_for_encoding_64 (val
);
3360 aarch64_set_vec_double (cpu
, vd
, 0, u
);
3361 aarch64_set_vec_double (cpu
, vd
, 1, u
);
3370 #define ABS(A) ((A) < 0 ? - (A) : (A))
3373 do_vec_ABS (sim_cpu
*cpu
)
3376 instr[30] = half(0)/full(1)
3377 instr[29,24] = 00 1110
3378 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3379 instr[21,10] = 10 0000 1011 10
3383 unsigned vn
= INSTR (9, 5);
3384 unsigned vd
= INSTR (4, 0);
3385 unsigned full
= INSTR (30, 30);
3388 NYI_assert (29, 24, 0x0E);
3389 NYI_assert (21, 10, 0x82E);
3391 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3392 switch (INSTR (23, 22))
3395 for (i
= 0; i
< (full
? 16 : 8); i
++)
3396 aarch64_set_vec_s8 (cpu
, vd
, i
,
3397 ABS (aarch64_get_vec_s8 (cpu
, vn
, i
)));
3401 for (i
= 0; i
< (full
? 8 : 4); i
++)
3402 aarch64_set_vec_s16 (cpu
, vd
, i
,
3403 ABS (aarch64_get_vec_s16 (cpu
, vn
, i
)));
3407 for (i
= 0; i
< (full
? 4 : 2); i
++)
3408 aarch64_set_vec_s32 (cpu
, vd
, i
,
3409 ABS (aarch64_get_vec_s32 (cpu
, vn
, i
)));
3415 for (i
= 0; i
< 2; i
++)
3416 aarch64_set_vec_s64 (cpu
, vd
, i
,
3417 ABS (aarch64_get_vec_s64 (cpu
, vn
, i
)));
3423 do_vec_ADDV (sim_cpu
*cpu
)
3426 instr[30] = full/half selector
3427 instr[29,24] = 00 1110
3428 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3429 instr[21,10] = 11 0001 1011 10
3433 unsigned vm
= INSTR (9, 5);
3434 unsigned rd
= INSTR (4, 0);
3436 int full
= INSTR (30, 30);
3438 NYI_assert (29, 24, 0x0E);
3439 NYI_assert (21, 10, 0xC6E);
3441 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3442 switch (INSTR (23, 22))
3447 for (i
= 0; i
< (full
? 16 : 8); i
++)
3448 val
+= aarch64_get_vec_u8 (cpu
, vm
, i
);
3449 aarch64_set_vec_u64 (cpu
, rd
, 0, val
);
3456 for (i
= 0; i
< (full
? 8 : 4); i
++)
3457 val
+= aarch64_get_vec_u16 (cpu
, vm
, i
);
3458 aarch64_set_vec_u64 (cpu
, rd
, 0, val
);
3467 for (i
= 0; i
< 4; i
++)
3468 val
+= aarch64_get_vec_u32 (cpu
, vm
, i
);
3469 aarch64_set_vec_u64 (cpu
, rd
, 0, val
);
3479 do_vec_ins_2 (sim_cpu
*cpu
)
3481 /* instr[31,21] = 01001110000
3482 instr[20,18] = size & element selector
3484 instr[13] = direction: to vec(0), from vec (1)
3490 unsigned vm
= INSTR (9, 5);
3491 unsigned vd
= INSTR (4, 0);
3493 NYI_assert (31, 21, 0x270);
3494 NYI_assert (17, 14, 0);
3495 NYI_assert (12, 10, 7);
3497 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3498 if (INSTR (13, 13) == 1)
3500 if (INSTR (18, 18) == 1)
3503 elem
= INSTR (20, 19);
3504 aarch64_set_reg_u64 (cpu
, vd
, NO_SP
,
3505 aarch64_get_vec_u32 (cpu
, vm
, elem
));
3510 if (INSTR (19, 19) != 1)
3513 elem
= INSTR (20, 20);
3514 aarch64_set_reg_u64 (cpu
, vd
, NO_SP
,
3515 aarch64_get_vec_u64 (cpu
, vm
, elem
));
3520 if (INSTR (18, 18) == 1)
3523 elem
= INSTR (20, 19);
3524 aarch64_set_vec_u32 (cpu
, vd
, elem
,
3525 aarch64_get_reg_u32 (cpu
, vm
, NO_SP
));
3530 if (INSTR (19, 19) != 1)
3533 elem
= INSTR (20, 20);
3534 aarch64_set_vec_u64 (cpu
, vd
, elem
,
3535 aarch64_get_reg_u64 (cpu
, vm
, NO_SP
));
/* Element-wise multiply of vectors VN and VM into VD, reading N
   elements of READ_TYPE starting at element BIAS and writing N
   results of WRITE_TYPE.  Sources are read in full before any result
   is written so VD may alias VN or VM.  Uses CPU, VN, VM, VD and BIAS
   from the invoking scope.  */
#define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)		\
  do									\
    {									\
      DST_TYPE a[N], b[N];						\
      unsigned i;							\
      for (i = 0; i < (N); i++)						\
	{								\
	  a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias);	\
	  b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias);	\
	}								\
      for (i = 0; i < (N); i++)						\
	aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]);		\
    }									\
  while (0)
3556 do_vec_mull (sim_cpu
*cpu
)
3559 instr[30] = lower(0)/upper(1) selector
3560 instr[29] = signed(0)/unsigned(1)
3561 instr[28,24] = 0 1110
3562 instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3565 instr[15,10] = 11 0000
3569 int unsign
= INSTR (29, 29);
3570 int bias
= INSTR (30, 30);
3571 unsigned vm
= INSTR (20, 16);
3572 unsigned vn
= INSTR ( 9, 5);
3573 unsigned vd
= INSTR ( 4, 0);
3576 NYI_assert (28, 24, 0x0E);
3577 NYI_assert (15, 10, 0x30);
3579 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3580 /* NB: Read source values before writing results, in case
3581 the source and destination vectors are the same. */
3582 switch (INSTR (23, 22))
3588 DO_VEC_WIDENING_MUL (8, uint16_t, u8
, u16
);
3590 DO_VEC_WIDENING_MUL (8, int16_t, s8
, s16
);
3597 DO_VEC_WIDENING_MUL (4, uint32_t, u16
, u32
);
3599 DO_VEC_WIDENING_MUL (4, int32_t, s16
, s32
);
3606 DO_VEC_WIDENING_MUL (2, uint64_t, u32
, u64
);
3608 DO_VEC_WIDENING_MUL (2, int64_t, s32
, s64
);
3617 do_vec_fadd (sim_cpu
*cpu
)
3620 instr[30] = half(0)/full(1)
3621 instr[29,24] = 001110
3622 instr[23] = FADD(0)/FSUB(1)
3623 instr[22] = float (0)/double(1)
3626 instr[15,10] = 110101
3630 unsigned vm
= INSTR (20, 16);
3631 unsigned vn
= INSTR (9, 5);
3632 unsigned vd
= INSTR (4, 0);
3634 int full
= INSTR (30, 30);
3636 NYI_assert (29, 24, 0x0E);
3637 NYI_assert (21, 21, 1);
3638 NYI_assert (15, 10, 0x35);
3640 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3648 for (i
= 0; i
< 2; i
++)
3649 aarch64_set_vec_double (cpu
, vd
, i
,
3650 aarch64_get_vec_double (cpu
, vn
, i
)
3651 - aarch64_get_vec_double (cpu
, vm
, i
));
3655 for (i
= 0; i
< (full
? 4 : 2); i
++)
3656 aarch64_set_vec_float (cpu
, vd
, i
,
3657 aarch64_get_vec_float (cpu
, vn
, i
)
3658 - aarch64_get_vec_float (cpu
, vm
, i
));
3668 for (i
= 0; i
< 2; i
++)
3669 aarch64_set_vec_double (cpu
, vd
, i
,
3670 aarch64_get_vec_double (cpu
, vm
, i
)
3671 + aarch64_get_vec_double (cpu
, vn
, i
));
3675 for (i
= 0; i
< (full
? 4 : 2); i
++)
3676 aarch64_set_vec_float (cpu
, vd
, i
,
3677 aarch64_get_vec_float (cpu
, vm
, i
)
3678 + aarch64_get_vec_float (cpu
, vn
, i
));
3684 do_vec_add (sim_cpu
*cpu
)
3687 instr[30] = full/half selector
3688 instr[29,24] = 001110
3689 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3692 instr[15,10] = 100001
3696 unsigned vm
= INSTR (20, 16);
3697 unsigned vn
= INSTR (9, 5);
3698 unsigned vd
= INSTR (4, 0);
3700 int full
= INSTR (30, 30);
3702 NYI_assert (29, 24, 0x0E);
3703 NYI_assert (21, 21, 1);
3704 NYI_assert (15, 10, 0x21);
3706 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3707 switch (INSTR (23, 22))
3710 for (i
= 0; i
< (full
? 16 : 8); i
++)
3711 aarch64_set_vec_u8 (cpu
, vd
, i
, aarch64_get_vec_u8 (cpu
, vn
, i
)
3712 + aarch64_get_vec_u8 (cpu
, vm
, i
));
3716 for (i
= 0; i
< (full
? 8 : 4); i
++)
3717 aarch64_set_vec_u16 (cpu
, vd
, i
, aarch64_get_vec_u16 (cpu
, vn
, i
)
3718 + aarch64_get_vec_u16 (cpu
, vm
, i
));
3722 for (i
= 0; i
< (full
? 4 : 2); i
++)
3723 aarch64_set_vec_u32 (cpu
, vd
, i
, aarch64_get_vec_u32 (cpu
, vn
, i
)
3724 + aarch64_get_vec_u32 (cpu
, vm
, i
));
3730 aarch64_set_vec_u64 (cpu
, vd
, 0, aarch64_get_vec_u64 (cpu
, vn
, 0)
3731 + aarch64_get_vec_u64 (cpu
, vm
, 0));
3732 aarch64_set_vec_u64 (cpu
, vd
, 1,
3733 aarch64_get_vec_u64 (cpu
, vn
, 1)
3734 + aarch64_get_vec_u64 (cpu
, vm
, 1));
3740 do_vec_mul (sim_cpu
*cpu
)
3743 instr[30] = full/half selector
3744 instr[29,24] = 00 1110
3745 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3748 instr[15,10] = 10 0111
3752 unsigned vm
= INSTR (20, 16);
3753 unsigned vn
= INSTR (9, 5);
3754 unsigned vd
= INSTR (4, 0);
3756 int full
= INSTR (30, 30);
3759 NYI_assert (29, 24, 0x0E);
3760 NYI_assert (21, 21, 1);
3761 NYI_assert (15, 10, 0x27);
3763 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3764 switch (INSTR (23, 22))
3767 DO_VEC_WIDENING_MUL (full
? 16 : 8, uint8_t, u8
, u8
);
3771 DO_VEC_WIDENING_MUL (full
? 8 : 4, uint16_t, u16
, u16
);
3775 DO_VEC_WIDENING_MUL (full
? 4 : 2, uint32_t, u32
, u32
);
3784 do_vec_MLA (sim_cpu
*cpu
)
3787 instr[30] = full/half selector
3788 instr[29,24] = 00 1110
3789 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3792 instr[15,10] = 1001 01
3796 unsigned vm
= INSTR (20, 16);
3797 unsigned vn
= INSTR (9, 5);
3798 unsigned vd
= INSTR (4, 0);
3800 int full
= INSTR (30, 30);
3802 NYI_assert (29, 24, 0x0E);
3803 NYI_assert (21, 21, 1);
3804 NYI_assert (15, 10, 0x25);
3806 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3807 switch (INSTR (23, 22))
3810 for (i
= 0; i
< (full
? 16 : 8); i
++)
3811 aarch64_set_vec_u8 (cpu
, vd
, i
,
3812 aarch64_get_vec_u8 (cpu
, vd
, i
)
3813 + (aarch64_get_vec_u8 (cpu
, vn
, i
)
3814 * aarch64_get_vec_u8 (cpu
, vm
, i
)));
3818 for (i
= 0; i
< (full
? 8 : 4); i
++)
3819 aarch64_set_vec_u16 (cpu
, vd
, i
,
3820 aarch64_get_vec_u16 (cpu
, vd
, i
)
3821 + (aarch64_get_vec_u16 (cpu
, vn
, i
)
3822 * aarch64_get_vec_u16 (cpu
, vm
, i
)));
3826 for (i
= 0; i
< (full
? 4 : 2); i
++)
3827 aarch64_set_vec_u32 (cpu
, vd
, i
,
3828 aarch64_get_vec_u32 (cpu
, vd
, i
)
3829 + (aarch64_get_vec_u32 (cpu
, vn
, i
)
3830 * aarch64_get_vec_u32 (cpu
, vm
, i
)));
/* Float maxNM: as fmax but a single NaN operand is ignored in favour
   of the numeric operand; only if both are NaN is NaN returned.  */
static float
fmaxnm (float a, float b)
{
  if (! isnan (a))
    {
      if (! isnan (b))
	return a > b ? a : b;
      return a;
    }
  else if (! isnan (b))
    return b;
  return a;
}
/* Float minNM: as fmin but a single NaN operand is ignored in favour
   of the numeric operand; only if both are NaN is NaN returned.  */
static float
fminnm (float a, float b)
{
  if (! isnan (a))
    {
      if (! isnan (b))
	return a < b ? a : b;
      return a;
    }
  else if (! isnan (b))
    return b;
  return a;
}
/* Double maxNM: as fmaxnm but for doubles.  */
static double
dmaxnm (double a, double b)
{
  if (! isnan (a))
    {
      if (! isnan (b))
	return a > b ? a : b;
      return a;
    }
  else if (! isnan (b))
    return b;
  return a;
}
/* Double minNM: as fminnm but for doubles.  */
static double
dminnm (double a, double b)
{
  if (! isnan (a))
    {
      if (! isnan (b))
	return a < b ? a : b;
      return a;
    }
  else if (! isnan (b))
    return b;
  return a;
}
3895 do_vec_FminmaxNMP (sim_cpu
*cpu
)
3898 instr [30] = half (0)/full (1)
3899 instr [29,24] = 10 1110
3900 instr [23] = max(0)/min(1)
3901 instr [22] = float (0)/double (1)
3904 instr [15,10] = 1100 01
3906 instr [4.0] = Vd. */
3908 unsigned vm
= INSTR (20, 16);
3909 unsigned vn
= INSTR (9, 5);
3910 unsigned vd
= INSTR (4, 0);
3911 int full
= INSTR (30, 30);
3913 NYI_assert (29, 24, 0x2E);
3914 NYI_assert (21, 21, 1);
3915 NYI_assert (15, 10, 0x31);
3917 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3920 double (* fn
)(double, double) = INSTR (23, 23)
3925 aarch64_set_vec_double (cpu
, vd
, 0,
3926 fn (aarch64_get_vec_double (cpu
, vn
, 0),
3927 aarch64_get_vec_double (cpu
, vn
, 1)));
3928 aarch64_set_vec_double (cpu
, vd
, 0,
3929 fn (aarch64_get_vec_double (cpu
, vm
, 0),
3930 aarch64_get_vec_double (cpu
, vm
, 1)));
3934 float (* fn
)(float, float) = INSTR (23, 23)
3937 aarch64_set_vec_float (cpu
, vd
, 0,
3938 fn (aarch64_get_vec_float (cpu
, vn
, 0),
3939 aarch64_get_vec_float (cpu
, vn
, 1)));
3941 aarch64_set_vec_float (cpu
, vd
, 1,
3942 fn (aarch64_get_vec_float (cpu
, vn
, 2),
3943 aarch64_get_vec_float (cpu
, vn
, 3)));
3945 aarch64_set_vec_float (cpu
, vd
, (full
? 2 : 1),
3946 fn (aarch64_get_vec_float (cpu
, vm
, 0),
3947 aarch64_get_vec_float (cpu
, vm
, 1)));
3949 aarch64_set_vec_float (cpu
, vd
, 3,
3950 fn (aarch64_get_vec_float (cpu
, vm
, 2),
3951 aarch64_get_vec_float (cpu
, vm
, 3)));
3956 do_vec_AND (sim_cpu
*cpu
)
3959 instr[30] = half (0)/full (1)
3960 instr[29,21] = 001110001
3962 instr[15,10] = 000111
3966 unsigned vm
= INSTR (20, 16);
3967 unsigned vn
= INSTR (9, 5);
3968 unsigned vd
= INSTR (4, 0);
3970 int full
= INSTR (30, 30);
3972 NYI_assert (29, 21, 0x071);
3973 NYI_assert (15, 10, 0x07);
3975 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
3976 for (i
= 0; i
< (full
? 4 : 2); i
++)
3977 aarch64_set_vec_u32 (cpu
, vd
, i
,
3978 aarch64_get_vec_u32 (cpu
, vn
, i
)
3979 & aarch64_get_vec_u32 (cpu
, vm
, i
));
3983 do_vec_BSL (sim_cpu
*cpu
)
3986 instr[30] = half (0)/full (1)
3987 instr[29,21] = 101110011
3989 instr[15,10] = 000111
3993 unsigned vm
= INSTR (20, 16);
3994 unsigned vn
= INSTR (9, 5);
3995 unsigned vd
= INSTR (4, 0);
3997 int full
= INSTR (30, 30);
3999 NYI_assert (29, 21, 0x173);
4000 NYI_assert (15, 10, 0x07);
4002 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4003 for (i
= 0; i
< (full
? 16 : 8); i
++)
4004 aarch64_set_vec_u8 (cpu
, vd
, i
,
4005 ( aarch64_get_vec_u8 (cpu
, vd
, i
)
4006 & aarch64_get_vec_u8 (cpu
, vn
, i
))
4007 | ((~ aarch64_get_vec_u8 (cpu
, vd
, i
))
4008 & aarch64_get_vec_u8 (cpu
, vm
, i
)));
4012 do_vec_EOR (sim_cpu
*cpu
)
4015 instr[30] = half (0)/full (1)
4016 instr[29,21] = 10 1110 001
4018 instr[15,10] = 000111
4022 unsigned vm
= INSTR (20, 16);
4023 unsigned vn
= INSTR (9, 5);
4024 unsigned vd
= INSTR (4, 0);
4026 int full
= INSTR (30, 30);
4028 NYI_assert (29, 21, 0x171);
4029 NYI_assert (15, 10, 0x07);
4031 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4032 for (i
= 0; i
< (full
? 4 : 2); i
++)
4033 aarch64_set_vec_u32 (cpu
, vd
, i
,
4034 aarch64_get_vec_u32 (cpu
, vn
, i
)
4035 ^ aarch64_get_vec_u32 (cpu
, vm
, i
));
4039 do_vec_bit (sim_cpu
*cpu
)
4042 instr[30] = half (0)/full (1)
4043 instr[29,23] = 10 1110 1
4044 instr[22] = BIT (0) / BIF (1)
4047 instr[15,10] = 0001 11
4051 unsigned vm
= INSTR (20, 16);
4052 unsigned vn
= INSTR (9, 5);
4053 unsigned vd
= INSTR (4, 0);
4054 unsigned full
= INSTR (30, 30);
4055 unsigned test_false
= INSTR (22, 22);
4058 NYI_assert (29, 23, 0x5D);
4059 NYI_assert (21, 21, 1);
4060 NYI_assert (15, 10, 0x07);
4062 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4063 for (i
= 0; i
< (full
? 4 : 2); i
++)
4065 uint32_t vd_val
= aarch64_get_vec_u32 (cpu
, vd
, i
);
4066 uint32_t vn_val
= aarch64_get_vec_u32 (cpu
, vn
, i
);
4067 uint32_t vm_val
= aarch64_get_vec_u32 (cpu
, vm
, i
);
4069 aarch64_set_vec_u32 (cpu
, vd
, i
,
4070 (vd_val
& vm_val
) | (vn_val
& ~vm_val
));
4072 aarch64_set_vec_u32 (cpu
, vd
, i
,
4073 (vd_val
& ~vm_val
) | (vn_val
& vm_val
));
4078 do_vec_ORN (sim_cpu
*cpu
)
4081 instr[30] = half (0)/full (1)
4082 instr[29,21] = 00 1110 111
4084 instr[15,10] = 00 0111
4088 unsigned vm
= INSTR (20, 16);
4089 unsigned vn
= INSTR (9, 5);
4090 unsigned vd
= INSTR (4, 0);
4092 int full
= INSTR (30, 30);
4094 NYI_assert (29, 21, 0x077);
4095 NYI_assert (15, 10, 0x07);
4097 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4098 for (i
= 0; i
< (full
? 16 : 8); i
++)
4099 aarch64_set_vec_u8 (cpu
, vd
, i
,
4100 aarch64_get_vec_u8 (cpu
, vn
, i
)
4101 | ~ aarch64_get_vec_u8 (cpu
, vm
, i
));
4105 do_vec_ORR (sim_cpu
*cpu
)
4108 instr[30] = half (0)/full (1)
4109 instr[29,21] = 00 1110 101
4111 instr[15,10] = 0001 11
4115 unsigned vm
= INSTR (20, 16);
4116 unsigned vn
= INSTR (9, 5);
4117 unsigned vd
= INSTR (4, 0);
4119 int full
= INSTR (30, 30);
4121 NYI_assert (29, 21, 0x075);
4122 NYI_assert (15, 10, 0x07);
4124 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4125 for (i
= 0; i
< (full
? 16 : 8); i
++)
4126 aarch64_set_vec_u8 (cpu
, vd
, i
,
4127 aarch64_get_vec_u8 (cpu
, vn
, i
)
4128 | aarch64_get_vec_u8 (cpu
, vm
, i
));
4132 do_vec_BIC (sim_cpu
*cpu
)
4135 instr[30] = half (0)/full (1)
4136 instr[29,21] = 00 1110 011
4138 instr[15,10] = 00 0111
4142 unsigned vm
= INSTR (20, 16);
4143 unsigned vn
= INSTR (9, 5);
4144 unsigned vd
= INSTR (4, 0);
4146 int full
= INSTR (30, 30);
4148 NYI_assert (29, 21, 0x073);
4149 NYI_assert (15, 10, 0x07);
4151 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4152 for (i
= 0; i
< (full
? 16 : 8); i
++)
4153 aarch64_set_vec_u8 (cpu
, vd
, i
,
4154 aarch64_get_vec_u8 (cpu
, vn
, i
)
4155 & ~ aarch64_get_vec_u8 (cpu
, vm
, i
));
4159 do_vec_XTN (sim_cpu
*cpu
)
4162 instr[30] = first part (0)/ second part (1)
4163 instr[29,24] = 00 1110
4164 instr[23,22] = size: byte(00), half(01), word (10)
4165 instr[21,10] = 1000 0100 1010
4169 unsigned vs
= INSTR (9, 5);
4170 unsigned vd
= INSTR (4, 0);
4171 unsigned bias
= INSTR (30, 30);
4174 NYI_assert (29, 24, 0x0E);
4175 NYI_assert (21, 10, 0x84A);
4177 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4178 switch (INSTR (23, 22))
4181 for (i
= 0; i
< 8; i
++)
4182 aarch64_set_vec_u8 (cpu
, vd
, i
+ (bias
* 8),
4183 aarch64_get_vec_u16 (cpu
, vs
, i
));
4187 for (i
= 0; i
< 4; i
++)
4188 aarch64_set_vec_u16 (cpu
, vd
, i
+ (bias
* 4),
4189 aarch64_get_vec_u32 (cpu
, vs
, i
));
4193 for (i
= 0; i
< 2; i
++)
4194 aarch64_set_vec_u32 (cpu
, vd
, i
+ (bias
* 2),
4195 aarch64_get_vec_u64 (cpu
, vs
, i
));
4201 do_vec_maxv (sim_cpu
*cpu
)
4204 instr[30] = half(0)/full(1)
4205 instr[29] = signed (0)/unsigned(1)
4206 instr[28,24] = 0 1110
4207 instr[23,22] = size: byte(00), half(01), word (10)
4209 instr[20,17] = 1 000
4210 instr[16] = max(0)/min(1)
4211 instr[15,10] = 1010 10
4212 instr[9,5] = V source
4213 instr[4.0] = R dest. */
4215 unsigned vs
= INSTR (9, 5);
4216 unsigned rd
= INSTR (4, 0);
4217 unsigned full
= INSTR (30, 30);
4220 NYI_assert (28, 24, 0x0E);
4221 NYI_assert (21, 21, 1);
4222 NYI_assert (20, 17, 8);
4223 NYI_assert (15, 10, 0x2A);
4225 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4226 switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4228 case 0: /* SMAXV. */
4231 switch (INSTR (23, 22))
4234 smax
= aarch64_get_vec_s8 (cpu
, vs
, 0);
4235 for (i
= 1; i
< (full
? 16 : 8); i
++)
4236 smax
= max (smax
, aarch64_get_vec_s8 (cpu
, vs
, i
));
4239 smax
= aarch64_get_vec_s16 (cpu
, vs
, 0);
4240 for (i
= 1; i
< (full
? 8 : 4); i
++)
4241 smax
= max (smax
, aarch64_get_vec_s16 (cpu
, vs
, i
));
4244 smax
= aarch64_get_vec_s32 (cpu
, vs
, 0);
4245 for (i
= 1; i
< (full
? 4 : 2); i
++)
4246 smax
= max (smax
, aarch64_get_vec_s32 (cpu
, vs
, i
));
4251 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
, smax
);
4255 case 1: /* SMINV. */
4258 switch (INSTR (23, 22))
4261 smin
= aarch64_get_vec_s8 (cpu
, vs
, 0);
4262 for (i
= 1; i
< (full
? 16 : 8); i
++)
4263 smin
= min (smin
, aarch64_get_vec_s8 (cpu
, vs
, i
));
4266 smin
= aarch64_get_vec_s16 (cpu
, vs
, 0);
4267 for (i
= 1; i
< (full
? 8 : 4); i
++)
4268 smin
= min (smin
, aarch64_get_vec_s16 (cpu
, vs
, i
));
4271 smin
= aarch64_get_vec_s32 (cpu
, vs
, 0);
4272 for (i
= 1; i
< (full
? 4 : 2); i
++)
4273 smin
= min (smin
, aarch64_get_vec_s32 (cpu
, vs
, i
));
4279 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
, smin
);
4283 case 2: /* UMAXV. */
4286 switch (INSTR (23, 22))
4289 umax
= aarch64_get_vec_u8 (cpu
, vs
, 0);
4290 for (i
= 1; i
< (full
? 16 : 8); i
++)
4291 umax
= max (umax
, aarch64_get_vec_u8 (cpu
, vs
, i
));
4294 umax
= aarch64_get_vec_u16 (cpu
, vs
, 0);
4295 for (i
= 1; i
< (full
? 8 : 4); i
++)
4296 umax
= max (umax
, aarch64_get_vec_u16 (cpu
, vs
, i
));
4299 umax
= aarch64_get_vec_u32 (cpu
, vs
, 0);
4300 for (i
= 1; i
< (full
? 4 : 2); i
++)
4301 umax
= max (umax
, aarch64_get_vec_u32 (cpu
, vs
, i
));
4307 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, umax
);
4311 case 3: /* UMINV. */
4314 switch (INSTR (23, 22))
4317 umin
= aarch64_get_vec_u8 (cpu
, vs
, 0);
4318 for (i
= 1; i
< (full
? 16 : 8); i
++)
4319 umin
= min (umin
, aarch64_get_vec_u8 (cpu
, vs
, i
));
4322 umin
= aarch64_get_vec_u16 (cpu
, vs
, 0);
4323 for (i
= 1; i
< (full
? 8 : 4); i
++)
4324 umin
= min (umin
, aarch64_get_vec_u16 (cpu
, vs
, i
));
4327 umin
= aarch64_get_vec_u32 (cpu
, vs
, 0);
4328 for (i
= 1; i
< (full
? 4 : 2); i
++)
4329 umin
= min (umin
, aarch64_get_vec_u32 (cpu
, vs
, i
));
4335 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, umin
);
4342 do_vec_fminmaxV (sim_cpu
*cpu
)
4344 /* instr[31,24] = 0110 1110
4345 instr[23] = max(0)/min(1)
4346 instr[22,14] = 011 0000 11
4347 instr[13,12] = nm(00)/normal(11)
4349 instr[9,5] = V source
4350 instr[4.0] = R dest. */
4352 unsigned vs
= INSTR (9, 5);
4353 unsigned rd
= INSTR (4, 0);
4355 float res
= aarch64_get_vec_float (cpu
, vs
, 0);
4357 NYI_assert (31, 24, 0x6E);
4358 NYI_assert (22, 14, 0x0C3);
4359 NYI_assert (11, 10, 2);
4361 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4364 switch (INSTR (13, 12))
4366 case 0: /* FMNINNMV. */
4367 for (i
= 1; i
< 4; i
++)
4368 res
= fminnm (res
, aarch64_get_vec_float (cpu
, vs
, i
));
4371 case 3: /* FMINV. */
4372 for (i
= 1; i
< 4; i
++)
4373 res
= min (res
, aarch64_get_vec_float (cpu
, vs
, i
));
4382 switch (INSTR (13, 12))
4384 case 0: /* FMNAXNMV. */
4385 for (i
= 1; i
< 4; i
++)
4386 res
= fmaxnm (res
, aarch64_get_vec_float (cpu
, vs
, i
));
4389 case 3: /* FMAXV. */
4390 for (i
= 1; i
< 4; i
++)
4391 res
= max (res
, aarch64_get_vec_float (cpu
, vs
, i
));
4399 aarch64_set_FP_float (cpu
, rd
, res
);
4403 do_vec_Fminmax (sim_cpu
*cpu
)
4406 instr[30] = half(0)/full(1)
4407 instr[29,24] = 00 1110
4408 instr[23] = max(0)/min(1)
4409 instr[22] = float(0)/double(1)
4413 instr[13,12] = nm(00)/normal(11)
4418 unsigned vm
= INSTR (20, 16);
4419 unsigned vn
= INSTR (9, 5);
4420 unsigned vd
= INSTR (4, 0);
4421 unsigned full
= INSTR (30, 30);
4422 unsigned min
= INSTR (23, 23);
4425 NYI_assert (29, 24, 0x0E);
4426 NYI_assert (21, 21, 1);
4427 NYI_assert (15, 14, 3);
4428 NYI_assert (11, 10, 1);
4430 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4433 double (* func
)(double, double);
4438 if (INSTR (13, 12) == 0)
4439 func
= min
? dminnm
: dmaxnm
;
4440 else if (INSTR (13, 12) == 3)
4441 func
= min
? fmin
: fmax
;
4445 for (i
= 0; i
< 2; i
++)
4446 aarch64_set_vec_double (cpu
, vd
, i
,
4447 func (aarch64_get_vec_double (cpu
, vn
, i
),
4448 aarch64_get_vec_double (cpu
, vm
, i
)));
4452 float (* func
)(float, float);
4454 if (INSTR (13, 12) == 0)
4455 func
= min
? fminnm
: fmaxnm
;
4456 else if (INSTR (13, 12) == 3)
4457 func
= min
? fminf
: fmaxf
;
4461 for (i
= 0; i
< (full
? 4 : 2); i
++)
4462 aarch64_set_vec_float (cpu
, vd
, i
,
4463 func (aarch64_get_vec_float (cpu
, vn
, i
),
4464 aarch64_get_vec_float (cpu
, vm
, i
)));
4469 do_vec_SCVTF (sim_cpu
*cpu
)
4473 instr[29,23] = 00 1110 0
4474 instr[22] = float(0)/double(1)
4475 instr[21,10] = 10 0001 1101 10
4479 unsigned vn
= INSTR (9, 5);
4480 unsigned vd
= INSTR (4, 0);
4481 unsigned full
= INSTR (30, 30);
4482 unsigned size
= INSTR (22, 22);
4485 NYI_assert (29, 23, 0x1C);
4486 NYI_assert (21, 10, 0x876);
4488 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4494 for (i
= 0; i
< 2; i
++)
4496 double val
= (double) aarch64_get_vec_u64 (cpu
, vn
, i
);
4497 aarch64_set_vec_double (cpu
, vd
, i
, val
);
4502 for (i
= 0; i
< (full
? 4 : 2); i
++)
4504 float val
= (float) aarch64_get_vec_u32 (cpu
, vn
, i
);
4505 aarch64_set_vec_float (cpu
, vd
, i
, val
);
4510 #define VEC_CMP(SOURCE, CMP) \
4516 for (i = 0; i < (full ? 16 : 8); i++) \
4517 aarch64_set_vec_u8 (cpu, vd, i, \
4518 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4520 aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4524 for (i = 0; i < (full ? 8 : 4); i++) \
4525 aarch64_set_vec_u16 (cpu, vd, i, \
4526 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4528 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4532 for (i = 0; i < (full ? 4 : 2); i++) \
4533 aarch64_set_vec_u32 (cpu, vd, i, \
4534 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4536 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4542 for (i = 0; i < 2; i++) \
4543 aarch64_set_vec_u64 (cpu, vd, i, \
4544 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4546 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4553 #define VEC_CMP0(SOURCE, CMP) \
4559 for (i = 0; i < (full ? 16 : 8); i++) \
4560 aarch64_set_vec_u8 (cpu, vd, i, \
4561 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4565 for (i = 0; i < (full ? 8 : 4); i++) \
4566 aarch64_set_vec_u16 (cpu, vd, i, \
4567 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4571 for (i = 0; i < (full ? 4 : 2); i++) \
4572 aarch64_set_vec_u32 (cpu, vd, i, \
4573 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4579 for (i = 0; i < 2; i++) \
4580 aarch64_set_vec_u64 (cpu, vd, i, \
4581 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4582 CMP 0 ? -1ULL : 0); \
4588 #define VEC_FCMP0(CMP) \
4593 if (INSTR (22, 22)) \
4597 for (i = 0; i < 2; i++) \
4598 aarch64_set_vec_u64 (cpu, vd, i, \
4599 aarch64_get_vec_double (cpu, vn, i) \
4600 CMP 0.0 ? -1 : 0); \
4604 for (i = 0; i < (full ? 4 : 2); i++) \
4605 aarch64_set_vec_u32 (cpu, vd, i, \
4606 aarch64_get_vec_float (cpu, vn, i) \
4607 CMP 0.0 ? -1 : 0); \
4613 #define VEC_FCMP(CMP) \
4616 if (INSTR (22, 22)) \
4620 for (i = 0; i < 2; i++) \
4621 aarch64_set_vec_u64 (cpu, vd, i, \
4622 aarch64_get_vec_double (cpu, vn, i) \
4624 aarch64_get_vec_double (cpu, vm, i) \
4629 for (i = 0; i < (full ? 4 : 2); i++) \
4630 aarch64_set_vec_u32 (cpu, vd, i, \
4631 aarch64_get_vec_float (cpu, vn, i) \
4633 aarch64_get_vec_float (cpu, vm, i) \
4641 do_vec_compare (sim_cpu
*cpu
)
4644 instr[30] = half(0)/full(1)
4645 instr[29] = part-of-comparison-type
4646 instr[28,24] = 0 1110
4647 instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4648 type of float compares: single (-0) / double (-1)
4650 instr[20,16] = Vm or 00000 (compare vs 0)
4651 instr[15,10] = part-of-comparison-type
4655 int full
= INSTR (30, 30);
4656 int size
= INSTR (23, 22);
4657 unsigned vm
= INSTR (20, 16);
4658 unsigned vn
= INSTR (9, 5);
4659 unsigned vd
= INSTR (4, 0);
4662 NYI_assert (28, 24, 0x0E);
4663 NYI_assert (21, 21, 1);
4665 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4668 || ((INSTR (11, 11) == 0
4669 && INSTR (10, 10) == 0)))
4671 /* A compare vs 0. */
4674 if (INSTR (15, 10) == 0x2A)
4676 else if (INSTR (15, 10) == 0x32
4677 || INSTR (15, 10) == 0x3E)
4678 do_vec_fminmaxV (cpu
);
4679 else if (INSTR (29, 23) == 0x1C
4680 && INSTR (21, 10) == 0x876)
4690 /* A floating point compare. */
4691 unsigned decode
= (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4694 NYI_assert (15, 15, 1);
4698 case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4699 case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4700 case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4701 case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4702 case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4703 case /* 0b111001: GT */ 0x39: VEC_FCMP (>);
4704 case /* 0b101001: GE */ 0x29: VEC_FCMP (>=);
4705 case /* 0b001001: EQ */ 0x09: VEC_FCMP (==);
4713 unsigned decode
= (INSTR (29, 29) << 6) | INSTR (15, 10);
4717 case 0x0D: /* 0001101 GT */ VEC_CMP (s
, > );
4718 case 0x0F: /* 0001111 GE */ VEC_CMP (s
, >= );
4719 case 0x22: /* 0100010 GT #0 */ VEC_CMP0 (s
, > );
4720 case 0x23: /* 0100011 TST */ VEC_CMP (u
, & );
4721 case 0x26: /* 0100110 EQ #0 */ VEC_CMP0 (s
, == );
4722 case 0x2A: /* 0101010 LT #0 */ VEC_CMP0 (s
, < );
4723 case 0x4D: /* 1001101 HI */ VEC_CMP (u
, > );
4724 case 0x4F: /* 1001111 HS */ VEC_CMP (u
, >= );
4725 case 0x62: /* 1100010 GE #0 */ VEC_CMP0 (s
, >= );
4726 case 0x63: /* 1100011 EQ */ VEC_CMP (u
, == );
4727 case 0x66: /* 1100110 LE #0 */ VEC_CMP0 (s
, <= );
4737 do_vec_SSHL (sim_cpu
*cpu
)
4740 instr[30] = first part (0)/ second part (1)
4741 instr[29,24] = 00 1110
4742 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4745 instr[15,10] = 0100 01
4749 unsigned full
= INSTR (30, 30);
4750 unsigned vm
= INSTR (20, 16);
4751 unsigned vn
= INSTR (9, 5);
4752 unsigned vd
= INSTR (4, 0);
4756 NYI_assert (29, 24, 0x0E);
4757 NYI_assert (21, 21, 1);
4758 NYI_assert (15, 10, 0x11);
4760 /* FIXME: What is a signed shift left in this context ?. */
4762 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4763 switch (INSTR (23, 22))
4766 for (i
= 0; i
< (full
? 16 : 8); i
++)
4768 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
);
4770 aarch64_set_vec_s8 (cpu
, vd
, i
, aarch64_get_vec_s8 (cpu
, vn
, i
)
4773 aarch64_set_vec_s8 (cpu
, vd
, i
, aarch64_get_vec_s8 (cpu
, vn
, i
)
4779 for (i
= 0; i
< (full
? 8 : 4); i
++)
4781 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
* 2);
4783 aarch64_set_vec_s16 (cpu
, vd
, i
, aarch64_get_vec_s16 (cpu
, vn
, i
)
4786 aarch64_set_vec_s16 (cpu
, vd
, i
, aarch64_get_vec_s16 (cpu
, vn
, i
)
4792 for (i
= 0; i
< (full
? 4 : 2); i
++)
4794 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
* 4);
4796 aarch64_set_vec_s32 (cpu
, vd
, i
, aarch64_get_vec_s32 (cpu
, vn
, i
)
4799 aarch64_set_vec_s32 (cpu
, vd
, i
, aarch64_get_vec_s32 (cpu
, vn
, i
)
4807 for (i
= 0; i
< 2; i
++)
4809 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
* 8);
4811 aarch64_set_vec_s64 (cpu
, vd
, i
, aarch64_get_vec_s64 (cpu
, vn
, i
)
4814 aarch64_set_vec_s64 (cpu
, vd
, i
, aarch64_get_vec_s64 (cpu
, vn
, i
)
4822 do_vec_USHL (sim_cpu
*cpu
)
4825 instr[30] = first part (0)/ second part (1)
4826 instr[29,24] = 10 1110
4827 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4830 instr[15,10] = 0100 01
4834 unsigned full
= INSTR (30, 30);
4835 unsigned vm
= INSTR (20, 16);
4836 unsigned vn
= INSTR (9, 5);
4837 unsigned vd
= INSTR (4, 0);
4841 NYI_assert (29, 24, 0x2E);
4842 NYI_assert (15, 10, 0x11);
4844 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4845 switch (INSTR (23, 22))
4848 for (i
= 0; i
< (full
? 16 : 8); i
++)
4850 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
);
4852 aarch64_set_vec_u8 (cpu
, vd
, i
, aarch64_get_vec_u8 (cpu
, vn
, i
)
4855 aarch64_set_vec_u8 (cpu
, vd
, i
, aarch64_get_vec_u8 (cpu
, vn
, i
)
4861 for (i
= 0; i
< (full
? 8 : 4); i
++)
4863 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
* 2);
4865 aarch64_set_vec_u16 (cpu
, vd
, i
, aarch64_get_vec_u16 (cpu
, vn
, i
)
4868 aarch64_set_vec_u16 (cpu
, vd
, i
, aarch64_get_vec_u16 (cpu
, vn
, i
)
4874 for (i
= 0; i
< (full
? 4 : 2); i
++)
4876 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
* 4);
4878 aarch64_set_vec_u32 (cpu
, vd
, i
, aarch64_get_vec_u32 (cpu
, vn
, i
)
4881 aarch64_set_vec_u32 (cpu
, vd
, i
, aarch64_get_vec_u32 (cpu
, vn
, i
)
4889 for (i
= 0; i
< 2; i
++)
4891 shift
= aarch64_get_vec_s8 (cpu
, vm
, i
* 8);
4893 aarch64_set_vec_u64 (cpu
, vd
, i
, aarch64_get_vec_u64 (cpu
, vn
, i
)
4896 aarch64_set_vec_u64 (cpu
, vd
, i
, aarch64_get_vec_u64 (cpu
, vn
, i
)
4904 do_vec_FMLA (sim_cpu
*cpu
)
4907 instr[30] = full/half selector
4908 instr[29,23] = 0011100
4909 instr[22] = size: 0=>float, 1=>double
4912 instr[15,10] = 1100 11
4916 unsigned vm
= INSTR (20, 16);
4917 unsigned vn
= INSTR (9, 5);
4918 unsigned vd
= INSTR (4, 0);
4920 int full
= INSTR (30, 30);
4922 NYI_assert (29, 23, 0x1C);
4923 NYI_assert (21, 21, 1);
4924 NYI_assert (15, 10, 0x33);
4926 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4931 for (i
= 0; i
< 2; i
++)
4932 aarch64_set_vec_double (cpu
, vd
, i
,
4933 aarch64_get_vec_double (cpu
, vn
, i
) *
4934 aarch64_get_vec_double (cpu
, vm
, i
) +
4935 aarch64_get_vec_double (cpu
, vd
, i
));
4939 for (i
= 0; i
< (full
? 4 : 2); i
++)
4940 aarch64_set_vec_float (cpu
, vd
, i
,
4941 aarch64_get_vec_float (cpu
, vn
, i
) *
4942 aarch64_get_vec_float (cpu
, vm
, i
) +
4943 aarch64_get_vec_float (cpu
, vd
, i
));
4948 do_vec_max (sim_cpu
*cpu
)
4951 instr[30] = full/half selector
4952 instr[29] = SMAX (0) / UMAX (1)
4953 instr[28,24] = 0 1110
4954 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
4957 instr[15,10] = 0110 01
4961 unsigned vm
= INSTR (20, 16);
4962 unsigned vn
= INSTR (9, 5);
4963 unsigned vd
= INSTR (4, 0);
4965 int full
= INSTR (30, 30);
4967 NYI_assert (28, 24, 0x0E);
4968 NYI_assert (21, 21, 1);
4969 NYI_assert (15, 10, 0x19);
4971 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
4974 switch (INSTR (23, 22))
4977 for (i
= 0; i
< (full
? 16 : 8); i
++)
4978 aarch64_set_vec_u8 (cpu
, vd
, i
,
4979 aarch64_get_vec_u8 (cpu
, vn
, i
)
4980 > aarch64_get_vec_u8 (cpu
, vm
, i
)
4981 ? aarch64_get_vec_u8 (cpu
, vn
, i
)
4982 : aarch64_get_vec_u8 (cpu
, vm
, i
));
4986 for (i
= 0; i
< (full
? 8 : 4); i
++)
4987 aarch64_set_vec_u16 (cpu
, vd
, i
,
4988 aarch64_get_vec_u16 (cpu
, vn
, i
)
4989 > aarch64_get_vec_u16 (cpu
, vm
, i
)
4990 ? aarch64_get_vec_u16 (cpu
, vn
, i
)
4991 : aarch64_get_vec_u16 (cpu
, vm
, i
));
4995 for (i
= 0; i
< (full
? 4 : 2); i
++)
4996 aarch64_set_vec_u32 (cpu
, vd
, i
,
4997 aarch64_get_vec_u32 (cpu
, vn
, i
)
4998 > aarch64_get_vec_u32 (cpu
, vm
, i
)
4999 ? aarch64_get_vec_u32 (cpu
, vn
, i
)
5000 : aarch64_get_vec_u32 (cpu
, vm
, i
));
5009 switch (INSTR (23, 22))
5012 for (i
= 0; i
< (full
? 16 : 8); i
++)
5013 aarch64_set_vec_s8 (cpu
, vd
, i
,
5014 aarch64_get_vec_s8 (cpu
, vn
, i
)
5015 > aarch64_get_vec_s8 (cpu
, vm
, i
)
5016 ? aarch64_get_vec_s8 (cpu
, vn
, i
)
5017 : aarch64_get_vec_s8 (cpu
, vm
, i
));
5021 for (i
= 0; i
< (full
? 8 : 4); i
++)
5022 aarch64_set_vec_s16 (cpu
, vd
, i
,
5023 aarch64_get_vec_s16 (cpu
, vn
, i
)
5024 > aarch64_get_vec_s16 (cpu
, vm
, i
)
5025 ? aarch64_get_vec_s16 (cpu
, vn
, i
)
5026 : aarch64_get_vec_s16 (cpu
, vm
, i
));
5030 for (i
= 0; i
< (full
? 4 : 2); i
++)
5031 aarch64_set_vec_s32 (cpu
, vd
, i
,
5032 aarch64_get_vec_s32 (cpu
, vn
, i
)
5033 > aarch64_get_vec_s32 (cpu
, vm
, i
)
5034 ? aarch64_get_vec_s32 (cpu
, vn
, i
)
5035 : aarch64_get_vec_s32 (cpu
, vm
, i
));
5045 do_vec_min (sim_cpu
*cpu
)
5048 instr[30] = full/half selector
5049 instr[29] = SMIN (0) / UMIN (1)
5050 instr[28,24] = 0 1110
5051 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5054 instr[15,10] = 0110 11
5058 unsigned vm
= INSTR (20, 16);
5059 unsigned vn
= INSTR (9, 5);
5060 unsigned vd
= INSTR (4, 0);
5062 int full
= INSTR (30, 30);
5064 NYI_assert (28, 24, 0x0E);
5065 NYI_assert (21, 21, 1);
5066 NYI_assert (15, 10, 0x1B);
5068 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5071 switch (INSTR (23, 22))
5074 for (i
= 0; i
< (full
? 16 : 8); i
++)
5075 aarch64_set_vec_u8 (cpu
, vd
, i
,
5076 aarch64_get_vec_u8 (cpu
, vn
, i
)
5077 < aarch64_get_vec_u8 (cpu
, vm
, i
)
5078 ? aarch64_get_vec_u8 (cpu
, vn
, i
)
5079 : aarch64_get_vec_u8 (cpu
, vm
, i
));
5083 for (i
= 0; i
< (full
? 8 : 4); i
++)
5084 aarch64_set_vec_u16 (cpu
, vd
, i
,
5085 aarch64_get_vec_u16 (cpu
, vn
, i
)
5086 < aarch64_get_vec_u16 (cpu
, vm
, i
)
5087 ? aarch64_get_vec_u16 (cpu
, vn
, i
)
5088 : aarch64_get_vec_u16 (cpu
, vm
, i
));
5092 for (i
= 0; i
< (full
? 4 : 2); i
++)
5093 aarch64_set_vec_u32 (cpu
, vd
, i
,
5094 aarch64_get_vec_u32 (cpu
, vn
, i
)
5095 < aarch64_get_vec_u32 (cpu
, vm
, i
)
5096 ? aarch64_get_vec_u32 (cpu
, vn
, i
)
5097 : aarch64_get_vec_u32 (cpu
, vm
, i
));
5106 switch (INSTR (23, 22))
5109 for (i
= 0; i
< (full
? 16 : 8); i
++)
5110 aarch64_set_vec_s8 (cpu
, vd
, i
,
5111 aarch64_get_vec_s8 (cpu
, vn
, i
)
5112 < aarch64_get_vec_s8 (cpu
, vm
, i
)
5113 ? aarch64_get_vec_s8 (cpu
, vn
, i
)
5114 : aarch64_get_vec_s8 (cpu
, vm
, i
));
5118 for (i
= 0; i
< (full
? 8 : 4); i
++)
5119 aarch64_set_vec_s16 (cpu
, vd
, i
,
5120 aarch64_get_vec_s16 (cpu
, vn
, i
)
5121 < aarch64_get_vec_s16 (cpu
, vm
, i
)
5122 ? aarch64_get_vec_s16 (cpu
, vn
, i
)
5123 : aarch64_get_vec_s16 (cpu
, vm
, i
));
5127 for (i
= 0; i
< (full
? 4 : 2); i
++)
5128 aarch64_set_vec_s32 (cpu
, vd
, i
,
5129 aarch64_get_vec_s32 (cpu
, vn
, i
)
5130 < aarch64_get_vec_s32 (cpu
, vm
, i
)
5131 ? aarch64_get_vec_s32 (cpu
, vn
, i
)
5132 : aarch64_get_vec_s32 (cpu
, vm
, i
));
5142 do_vec_sub_long (sim_cpu
*cpu
)
5145 instr[30] = lower (0) / upper (1)
5146 instr[29] = signed (0) / unsigned (1)
5147 instr[28,24] = 0 1110
5148 instr[23,22] = size: bytes (00), half (01), word (10)
5151 instr[15,10] = 0010 00
5153 instr[4,0] = V dest. */
5155 unsigned size
= INSTR (23, 22);
5156 unsigned vm
= INSTR (20, 16);
5157 unsigned vn
= INSTR (9, 5);
5158 unsigned vd
= INSTR (4, 0);
5162 NYI_assert (28, 24, 0x0E);
5163 NYI_assert (21, 21, 1);
5164 NYI_assert (15, 10, 0x08);
5169 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5170 switch (INSTR (30, 29))
5172 case 2: /* SSUBL2. */
5174 case 0: /* SSUBL. */
5179 for (i
= 0; i
< 8; i
++)
5180 aarch64_set_vec_s16 (cpu
, vd
, i
,
5181 aarch64_get_vec_s8 (cpu
, vn
, i
+ bias
)
5182 - aarch64_get_vec_s8 (cpu
, vm
, i
+ bias
));
5187 for (i
= 0; i
< 4; i
++)
5188 aarch64_set_vec_s32 (cpu
, vd
, i
,
5189 aarch64_get_vec_s16 (cpu
, vn
, i
+ bias
)
5190 - aarch64_get_vec_s16 (cpu
, vm
, i
+ bias
));
5194 for (i
= 0; i
< 2; i
++)
5195 aarch64_set_vec_s64 (cpu
, vd
, i
,
5196 aarch64_get_vec_s32 (cpu
, vn
, i
+ bias
)
5197 - aarch64_get_vec_s32 (cpu
, vm
, i
+ bias
));
5205 case 3: /* USUBL2. */
5207 case 1: /* USUBL. */
5212 for (i
= 0; i
< 8; i
++)
5213 aarch64_set_vec_u16 (cpu
, vd
, i
,
5214 aarch64_get_vec_u8 (cpu
, vn
, i
+ bias
)
5215 - aarch64_get_vec_u8 (cpu
, vm
, i
+ bias
));
5220 for (i
= 0; i
< 4; i
++)
5221 aarch64_set_vec_u32 (cpu
, vd
, i
,
5222 aarch64_get_vec_u16 (cpu
, vn
, i
+ bias
)
5223 - aarch64_get_vec_u16 (cpu
, vm
, i
+ bias
));
5227 for (i
= 0; i
< 2; i
++)
5228 aarch64_set_vec_u64 (cpu
, vd
, i
,
5229 aarch64_get_vec_u32 (cpu
, vn
, i
+ bias
)
5230 - aarch64_get_vec_u32 (cpu
, vm
, i
+ bias
));
5241 do_vec_ADDP (sim_cpu
*cpu
)
5244 instr[30] = half(0)/full(1)
5245 instr[29,24] = 00 1110
5246 instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5249 instr[15,10] = 1011 11
5251 instr[4,0] = V dest. */
5255 unsigned full
= INSTR (30, 30);
5256 unsigned size
= INSTR (23, 22);
5257 unsigned vm
= INSTR (20, 16);
5258 unsigned vn
= INSTR (9, 5);
5259 unsigned vd
= INSTR (4, 0);
5262 NYI_assert (29, 24, 0x0E);
5263 NYI_assert (21, 21, 1);
5264 NYI_assert (15, 10, 0x2F);
5266 /* Make copies of the source registers in case vd == vn/vm. */
5267 copy_vn
= cpu
->fr
[vn
];
5268 copy_vm
= cpu
->fr
[vm
];
5270 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5274 range
= full
? 8 : 4;
5275 for (i
= 0; i
< range
; i
++)
5277 aarch64_set_vec_u8 (cpu
, vd
, i
,
5278 copy_vn
.b
[i
* 2] + copy_vn
.b
[i
* 2 + 1]);
5279 aarch64_set_vec_u8 (cpu
, vd
, i
+ range
,
5280 copy_vm
.b
[i
* 2] + copy_vm
.b
[i
* 2 + 1]);
5285 range
= full
? 4 : 2;
5286 for (i
= 0; i
< range
; i
++)
5288 aarch64_set_vec_u16 (cpu
, vd
, i
,
5289 copy_vn
.h
[i
* 2] + copy_vn
.h
[i
* 2 + 1]);
5290 aarch64_set_vec_u16 (cpu
, vd
, i
+ range
,
5291 copy_vm
.h
[i
* 2] + copy_vm
.h
[i
* 2 + 1]);
5296 range
= full
? 2 : 1;
5297 for (i
= 0; i
< range
; i
++)
5299 aarch64_set_vec_u32 (cpu
, vd
, i
,
5300 copy_vn
.w
[i
* 2] + copy_vn
.w
[i
* 2 + 1]);
5301 aarch64_set_vec_u32 (cpu
, vd
, i
+ range
,
5302 copy_vm
.w
[i
* 2] + copy_vm
.w
[i
* 2 + 1]);
5309 aarch64_set_vec_u64 (cpu
, vd
, 0, copy_vn
.v
[0] + copy_vn
.v
[1]);
5310 aarch64_set_vec_u64 (cpu
, vd
, 1, copy_vm
.v
[0] + copy_vm
.v
[1]);
5316 do_vec_UMOV (sim_cpu
*cpu
)
5319 instr[30] = 32-bit(0)/64-bit(1)
5320 instr[29,21] = 00 1110 000
5321 insrt[20,16] = size & index
5322 instr[15,10] = 0011 11
5323 instr[9,5] = V source
5324 instr[4,0] = R dest. */
5326 unsigned vs
= INSTR (9, 5);
5327 unsigned rd
= INSTR (4, 0);
5330 NYI_assert (29, 21, 0x070);
5331 NYI_assert (15, 10, 0x0F);
5333 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5336 /* Byte transfer. */
5337 index
= INSTR (20, 17);
5338 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
5339 aarch64_get_vec_u8 (cpu
, vs
, index
));
5341 else if (INSTR (17, 17))
5343 index
= INSTR (20, 18);
5344 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
5345 aarch64_get_vec_u16 (cpu
, vs
, index
));
5347 else if (INSTR (18, 18))
5349 index
= INSTR (20, 19);
5350 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
5351 aarch64_get_vec_u32 (cpu
, vs
, index
));
5355 if (INSTR (30, 30) != 1)
5358 index
= INSTR (20, 20);
5359 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
5360 aarch64_get_vec_u64 (cpu
, vs
, index
));
5365 do_vec_FABS (sim_cpu
*cpu
)
5368 instr[30] = half(0)/full(1)
5369 instr[29,23] = 00 1110 1
5370 instr[22] = float(0)/double(1)
5371 instr[21,16] = 10 0000
5372 instr[15,10] = 1111 10
5376 unsigned vn
= INSTR (9, 5);
5377 unsigned vd
= INSTR (4, 0);
5378 unsigned full
= INSTR (30, 30);
5381 NYI_assert (29, 23, 0x1D);
5382 NYI_assert (21, 10, 0x83E);
5384 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5390 for (i
= 0; i
< 2; i
++)
5391 aarch64_set_vec_double (cpu
, vd
, i
,
5392 fabs (aarch64_get_vec_double (cpu
, vn
, i
)));
5396 for (i
= 0; i
< (full
? 4 : 2); i
++)
5397 aarch64_set_vec_float (cpu
, vd
, i
,
5398 fabsf (aarch64_get_vec_float (cpu
, vn
, i
)));
5403 do_vec_FCVTZS (sim_cpu
*cpu
)
5406 instr[30] = half (0) / all (1)
5407 instr[29,23] = 00 1110 1
5408 instr[22] = single (0) / double (1)
5409 instr[21,10] = 10 0001 1011 10
5413 unsigned rn
= INSTR (9, 5);
5414 unsigned rd
= INSTR (4, 0);
5415 unsigned full
= INSTR (30, 30);
5418 NYI_assert (31, 31, 0);
5419 NYI_assert (29, 23, 0x1D);
5420 NYI_assert (21, 10, 0x86E);
5422 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5428 for (i
= 0; i
< 2; i
++)
5429 aarch64_set_vec_s64 (cpu
, rd
, i
,
5430 (int64_t) aarch64_get_vec_double (cpu
, rn
, i
));
5433 for (i
= 0; i
< (full
? 4 : 2); i
++)
5434 aarch64_set_vec_s32 (cpu
, rd
, i
,
5435 (int32_t) aarch64_get_vec_float (cpu
, rn
, i
));
5439 do_vec_REV64 (sim_cpu
*cpu
)
5442 instr[30] = full/half
5443 instr[29,24] = 00 1110
5445 instr[21,10] = 10 0000 0000 10
5449 unsigned rn
= INSTR (9, 5);
5450 unsigned rd
= INSTR (4, 0);
5451 unsigned size
= INSTR (23, 22);
5452 unsigned full
= INSTR (30, 30);
5456 NYI_assert (29, 24, 0x0E);
5457 NYI_assert (21, 10, 0x802);
5459 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5463 for (i
= 0; i
< (full
? 16 : 8); i
++)
5464 val
.b
[i
^ 0x7] = aarch64_get_vec_u8 (cpu
, rn
, i
);
5468 for (i
= 0; i
< (full
? 8 : 4); i
++)
5469 val
.h
[i
^ 0x3] = aarch64_get_vec_u16 (cpu
, rn
, i
);
5473 for (i
= 0; i
< (full
? 4 : 2); i
++)
5474 val
.w
[i
^ 0x1] = aarch64_get_vec_u32 (cpu
, rn
, i
);
5481 aarch64_set_vec_u64 (cpu
, rd
, 0, val
.v
[0]);
5483 aarch64_set_vec_u64 (cpu
, rd
, 1, val
.v
[1]);
5487 do_vec_REV16 (sim_cpu
*cpu
)
5490 instr[30] = full/half
5491 instr[29,24] = 00 1110
5493 instr[21,10] = 10 0000 0001 10
5497 unsigned rn
= INSTR (9, 5);
5498 unsigned rd
= INSTR (4, 0);
5499 unsigned size
= INSTR (23, 22);
5500 unsigned full
= INSTR (30, 30);
5504 NYI_assert (29, 24, 0x0E);
5505 NYI_assert (21, 10, 0x806);
5507 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5511 for (i
= 0; i
< (full
? 16 : 8); i
++)
5512 val
.b
[i
^ 0x1] = aarch64_get_vec_u8 (cpu
, rn
, i
);
5519 aarch64_set_vec_u64 (cpu
, rd
, 0, val
.v
[0]);
5521 aarch64_set_vec_u64 (cpu
, rd
, 1, val
.v
[1]);
5525 do_vec_op1 (sim_cpu
*cpu
)
5528 instr[30] = half/full
5529 instr[29,24] = 00 1110
5532 instr[15,10] = sub-opcode
5535 NYI_assert (29, 24, 0x0E);
5537 if (INSTR (21, 21) == 0)
5539 if (INSTR (23, 22) == 0)
5541 if (INSTR (30, 30) == 1
5542 && INSTR (17, 14) == 0
5543 && INSTR (12, 10) == 7)
5544 return do_vec_ins_2 (cpu
);
5546 switch (INSTR (15, 10))
5548 case 0x01: do_vec_DUP_vector_into_vector (cpu
); return;
5549 case 0x03: do_vec_DUP_scalar_into_vector (cpu
); return;
5550 case 0x07: do_vec_INS (cpu
); return;
5551 case 0x0A: do_vec_TRN (cpu
); return;
5554 if (INSTR (17, 16) == 0)
5556 do_vec_MOV_into_scalar (cpu
);
5565 do_vec_TBL (cpu
); return;
5569 do_vec_UZP (cpu
); return;
5573 do_vec_ZIP (cpu
); return;
5580 switch (INSTR (13, 10))
5582 case 0x6: do_vec_UZP (cpu
); return;
5583 case 0xE: do_vec_ZIP (cpu
); return;
5584 case 0xA: do_vec_TRN (cpu
); return;
5585 case 0xF: do_vec_UMOV (cpu
); return;
5590 switch (INSTR (15, 10))
5592 case 0x02: do_vec_REV64 (cpu
); return;
5593 case 0x06: do_vec_REV16 (cpu
); return;
5596 switch (INSTR (23, 21))
5598 case 1: do_vec_AND (cpu
); return;
5599 case 3: do_vec_BIC (cpu
); return;
5600 case 5: do_vec_ORR (cpu
); return;
5601 case 7: do_vec_ORN (cpu
); return;
5605 case 0x08: do_vec_sub_long (cpu
); return;
5606 case 0x0a: do_vec_XTN (cpu
); return;
5607 case 0x11: do_vec_SSHL (cpu
); return;
5608 case 0x19: do_vec_max (cpu
); return;
5609 case 0x1B: do_vec_min (cpu
); return;
5610 case 0x21: do_vec_add (cpu
); return;
5611 case 0x25: do_vec_MLA (cpu
); return;
5612 case 0x27: do_vec_mul (cpu
); return;
5613 case 0x2F: do_vec_ADDP (cpu
); return;
5614 case 0x30: do_vec_mull (cpu
); return;
5615 case 0x33: do_vec_FMLA (cpu
); return;
5616 case 0x35: do_vec_fadd (cpu
); return;
5619 switch (INSTR (20, 16))
5621 case 0x00: do_vec_ABS (cpu
); return;
5622 case 0x01: do_vec_FCVTZS (cpu
); return;
5623 case 0x11: do_vec_ADDV (cpu
); return;
5629 do_vec_Fminmax (cpu
); return;
5641 do_vec_compare (cpu
); return;
5644 do_vec_FABS (cpu
); return;
5652 do_vec_xtl (sim_cpu
*cpu
)
5655 instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5656 instr[28,22] = 0 1111 00
5657 instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5658 instr[15,10] = 1010 01
5659 instr[9,5] = V source
5660 instr[4,0] = V dest. */
5662 unsigned vs
= INSTR (9, 5);
5663 unsigned vd
= INSTR (4, 0);
5664 unsigned i
, shift
, bias
= 0;
5666 NYI_assert (28, 22, 0x3C);
5667 NYI_assert (15, 10, 0x29);
5669 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5670 switch (INSTR (30, 29))
5672 case 2: /* SXTL2, SSHLL2. */
5674 case 0: /* SXTL, SSHLL. */
5679 shift
= INSTR (20, 16);
5680 /* Get the source values before setting the destination values
5681 in case the source and destination are the same. */
5682 val1
= aarch64_get_vec_s32 (cpu
, vs
, bias
) << shift
;
5683 val2
= aarch64_get_vec_s32 (cpu
, vs
, bias
+ 1) << shift
;
5684 aarch64_set_vec_s64 (cpu
, vd
, 0, val1
);
5685 aarch64_set_vec_s64 (cpu
, vd
, 1, val2
);
5687 else if (INSTR (20, 20))
5690 int32_t v1
,v2
,v3
,v4
;
5692 shift
= INSTR (19, 16);
5694 for (i
= 0; i
< 4; i
++)
5695 v
[i
] = aarch64_get_vec_s16 (cpu
, vs
, bias
+ i
) << shift
;
5696 for (i
= 0; i
< 4; i
++)
5697 aarch64_set_vec_s32 (cpu
, vd
, i
, v
[i
]);
5702 NYI_assert (19, 19, 1);
5704 shift
= INSTR (18, 16);
5706 for (i
= 0; i
< 8; i
++)
5707 v
[i
] = aarch64_get_vec_s8 (cpu
, vs
, i
+ bias
) << shift
;
5708 for (i
= 0; i
< 8; i
++)
5709 aarch64_set_vec_s16 (cpu
, vd
, i
, v
[i
]);
5713 case 3: /* UXTL2, USHLL2. */
5715 case 1: /* UXTL, USHLL. */
5719 shift
= INSTR (20, 16);
5720 v1
= aarch64_get_vec_u32 (cpu
, vs
, bias
) << shift
;
5721 v2
= aarch64_get_vec_u32 (cpu
, vs
, bias
+ 1) << shift
;
5722 aarch64_set_vec_u64 (cpu
, vd
, 0, v1
);
5723 aarch64_set_vec_u64 (cpu
, vd
, 1, v2
);
5725 else if (INSTR (20, 20))
5728 shift
= INSTR (19, 16);
5730 for (i
= 0; i
< 4; i
++)
5731 v
[i
] = aarch64_get_vec_u16 (cpu
, vs
, i
+ bias
) << shift
;
5732 for (i
= 0; i
< 4; i
++)
5733 aarch64_set_vec_u32 (cpu
, vd
, i
, v
[i
]);
5738 NYI_assert (19, 19, 1);
5740 shift
= INSTR (18, 16);
5742 for (i
= 0; i
< 8; i
++)
5743 v
[i
] = aarch64_get_vec_u8 (cpu
, vs
, i
+ bias
) << shift
;
5744 for (i
= 0; i
< 8; i
++)
5745 aarch64_set_vec_u16 (cpu
, vd
, i
, v
[i
]);
5752 do_vec_SHL (sim_cpu
*cpu
)
5755 instr [30] = half(0)/full(1)
5756 instr [29,23] = 001 1110
5757 instr [22,16] = size and shift amount
5758 instr [15,10] = 01 0101
5760 instr [4, 0] = Vd. */
5763 int full
= INSTR (30, 30);
5764 unsigned vs
= INSTR (9, 5);
5765 unsigned vd
= INSTR (4, 0);
5768 NYI_assert (29, 23, 0x1E);
5769 NYI_assert (15, 10, 0x15);
5771 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5774 shift
= INSTR (21, 16);
5779 for (i
= 0; i
< 2; i
++)
5781 uint64_t val
= aarch64_get_vec_u64 (cpu
, vs
, i
);
5782 aarch64_set_vec_u64 (cpu
, vd
, i
, val
<< shift
);
5790 shift
= INSTR (20, 16);
5792 for (i
= 0; i
< (full
? 4 : 2); i
++)
5794 uint32_t val
= aarch64_get_vec_u32 (cpu
, vs
, i
);
5795 aarch64_set_vec_u32 (cpu
, vd
, i
, val
<< shift
);
5803 shift
= INSTR (19, 16);
5805 for (i
= 0; i
< (full
? 8 : 4); i
++)
5807 uint16_t val
= aarch64_get_vec_u16 (cpu
, vs
, i
);
5808 aarch64_set_vec_u16 (cpu
, vd
, i
, val
<< shift
);
5814 if (INSTR (19, 19) == 0)
5817 shift
= INSTR (18, 16);
5819 for (i
= 0; i
< (full
? 16 : 8); i
++)
5821 uint8_t val
= aarch64_get_vec_u8 (cpu
, vs
, i
);
5822 aarch64_set_vec_u8 (cpu
, vd
, i
, val
<< shift
);
5827 do_vec_SSHR_USHR (sim_cpu
*cpu
)
5830 instr [30] = half(0)/full(1)
5831 instr [29] = signed(0)/unsigned(1)
5832 instr [28,23] = 0 1111 0
5833 instr [22,16] = size and shift amount
5834 instr [15,10] = 0000 01
5836 instr [4, 0] = Vd. */
5838 int full
= INSTR (30, 30);
5839 int sign
= ! INSTR (29, 29);
5840 unsigned shift
= INSTR (22, 16);
5841 unsigned vs
= INSTR (9, 5);
5842 unsigned vd
= INSTR (4, 0);
5845 NYI_assert (28, 23, 0x1E);
5846 NYI_assert (15, 10, 0x01);
5848 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5851 shift
= 128 - shift
;
5857 for (i
= 0; i
< 2; i
++)
5859 int64_t val
= aarch64_get_vec_s64 (cpu
, vs
, i
);
5860 aarch64_set_vec_s64 (cpu
, vd
, i
, val
>> shift
);
5863 for (i
= 0; i
< 2; i
++)
5865 uint64_t val
= aarch64_get_vec_u64 (cpu
, vs
, i
);
5866 aarch64_set_vec_u64 (cpu
, vd
, i
, val
>> shift
);
5877 for (i
= 0; i
< (full
? 4 : 2); i
++)
5879 int32_t val
= aarch64_get_vec_s32 (cpu
, vs
, i
);
5880 aarch64_set_vec_s32 (cpu
, vd
, i
, val
>> shift
);
5883 for (i
= 0; i
< (full
? 4 : 2); i
++)
5885 uint32_t val
= aarch64_get_vec_u32 (cpu
, vs
, i
);
5886 aarch64_set_vec_u32 (cpu
, vd
, i
, val
>> shift
);
5897 for (i
= 0; i
< (full
? 8 : 4); i
++)
5899 int16_t val
= aarch64_get_vec_s16 (cpu
, vs
, i
);
5900 aarch64_set_vec_s16 (cpu
, vd
, i
, val
>> shift
);
5903 for (i
= 0; i
< (full
? 8 : 4); i
++)
5905 uint16_t val
= aarch64_get_vec_u16 (cpu
, vs
, i
);
5906 aarch64_set_vec_u16 (cpu
, vd
, i
, val
>> shift
);
5912 if (INSTR (19, 19) == 0)
5918 for (i
= 0; i
< (full
? 16 : 8); i
++)
5920 int8_t val
= aarch64_get_vec_s8 (cpu
, vs
, i
);
5921 aarch64_set_vec_s8 (cpu
, vd
, i
, val
>> shift
);
5924 for (i
= 0; i
< (full
? 16 : 8); i
++)
5926 uint8_t val
= aarch64_get_vec_u8 (cpu
, vs
, i
);
5927 aarch64_set_vec_u8 (cpu
, vd
, i
, val
>> shift
);
5932 do_vec_MUL_by_element (sim_cpu
*cpu
)
5935 instr[30] = half/full
5936 instr[29,24] = 00 1111
5947 unsigned full
= INSTR (30, 30);
5948 unsigned L
= INSTR (21, 21);
5949 unsigned H
= INSTR (11, 11);
5950 unsigned vn
= INSTR (9, 5);
5951 unsigned vd
= INSTR (4, 0);
5952 unsigned size
= INSTR (23, 22);
5957 NYI_assert (29, 24, 0x0F);
5958 NYI_assert (15, 12, 0x8);
5959 NYI_assert (10, 10, 0);
5961 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
5966 /* 16 bit products. */
5971 index
= (H
<< 2) | (L
<< 1) | INSTR (20, 20);
5972 vm
= INSTR (19, 16);
5973 element2
= aarch64_get_vec_u16 (cpu
, vm
, index
);
5975 for (e
= 0; e
< (full
? 8 : 4); e
++)
5977 element1
= aarch64_get_vec_u16 (cpu
, vn
, e
);
5978 product
= element1
* element2
;
5979 aarch64_set_vec_u16 (cpu
, vd
, e
, product
);
5986 /* 32 bit products. */
5991 index
= (H
<< 1) | L
;
5992 vm
= INSTR (20, 16);
5993 element2
= aarch64_get_vec_u32 (cpu
, vm
, index
);
5995 for (e
= 0; e
< (full
? 4 : 2); e
++)
5997 element1
= aarch64_get_vec_u32 (cpu
, vn
, e
);
5998 product
= element1
* element2
;
5999 aarch64_set_vec_u32 (cpu
, vd
, e
, product
);
6010 do_FMLA_by_element (sim_cpu
*cpu
)
6013 instr[30] = half/full
6014 instr[29,23] = 00 1111 1
6024 unsigned full
= INSTR (30, 30);
6025 unsigned size
= INSTR (22, 22);
6026 unsigned L
= INSTR (21, 21);
6027 unsigned vm
= INSTR (20, 16);
6028 unsigned H
= INSTR (11, 11);
6029 unsigned vn
= INSTR (9, 5);
6030 unsigned vd
= INSTR (4, 0);
6033 NYI_assert (29, 23, 0x1F);
6034 NYI_assert (15, 12, 0x1);
6035 NYI_assert (10, 10, 0);
6037 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6040 double element1
, element2
;
6045 element2
= aarch64_get_vec_double (cpu
, vm
, H
);
6047 for (e
= 0; e
< 2; e
++)
6049 element1
= aarch64_get_vec_double (cpu
, vn
, e
);
6050 element1
*= element2
;
6051 element1
+= aarch64_get_vec_double (cpu
, vd
, e
);
6052 aarch64_set_vec_double (cpu
, vd
, e
, element1
);
6058 float element2
= aarch64_get_vec_float (cpu
, vm
, (H
<< 1) | L
);
6060 for (e
= 0; e
< (full
? 4 : 2); e
++)
6062 element1
= aarch64_get_vec_float (cpu
, vn
, e
);
6063 element1
*= element2
;
6064 element1
+= aarch64_get_vec_float (cpu
, vd
, e
);
6065 aarch64_set_vec_float (cpu
, vd
, e
, element1
);
6071 do_vec_op2 (sim_cpu
*cpu
)
6074 instr[30] = half/full
6075 instr[29,24] = 00 1111
6077 instr[22,16] = element size & index
6078 instr[15,10] = sub-opcode
6082 NYI_assert (29, 24, 0x0F);
6084 if (INSTR (23, 23) != 0)
6086 switch (INSTR (15, 10))
6090 do_FMLA_by_element (cpu
);
6095 do_vec_MUL_by_element (cpu
);
6104 switch (INSTR (15, 10))
6106 case 0x01: do_vec_SSHR_USHR (cpu
); return;
6107 case 0x15: do_vec_SHL (cpu
); return;
6109 case 0x22: do_vec_MUL_by_element (cpu
); return;
6110 case 0x29: do_vec_xtl (cpu
); return;
6117 do_vec_neg (sim_cpu
*cpu
)
6120 instr[30] = full(1)/half(0)
6121 instr[29,24] = 10 1110
6122 instr[23,22] = size: byte(00), half (01), word (10), long (11)
6123 instr[21,10] = 1000 0010 1110
6127 int full
= INSTR (30, 30);
6128 unsigned vs
= INSTR (9, 5);
6129 unsigned vd
= INSTR (4, 0);
6132 NYI_assert (29, 24, 0x2E);
6133 NYI_assert (21, 10, 0x82E);
6135 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6136 switch (INSTR (23, 22))
6139 for (i
= 0; i
< (full
? 16 : 8); i
++)
6140 aarch64_set_vec_s8 (cpu
, vd
, i
, - aarch64_get_vec_s8 (cpu
, vs
, i
));
6144 for (i
= 0; i
< (full
? 8 : 4); i
++)
6145 aarch64_set_vec_s16 (cpu
, vd
, i
, - aarch64_get_vec_s16 (cpu
, vs
, i
));
6149 for (i
= 0; i
< (full
? 4 : 2); i
++)
6150 aarch64_set_vec_s32 (cpu
, vd
, i
, - aarch64_get_vec_s32 (cpu
, vs
, i
));
6156 for (i
= 0; i
< 2; i
++)
6157 aarch64_set_vec_s64 (cpu
, vd
, i
, - aarch64_get_vec_s64 (cpu
, vs
, i
));
6163 do_vec_sqrt (sim_cpu
*cpu
)
6166 instr[30] = full(1)/half(0)
6167 instr[29,23] = 101 1101
6168 instr[22] = single(0)/double(1)
6169 instr[21,10] = 1000 0111 1110
6173 int full
= INSTR (30, 30);
6174 unsigned vs
= INSTR (9, 5);
6175 unsigned vd
= INSTR (4, 0);
6178 NYI_assert (29, 23, 0x5B);
6179 NYI_assert (21, 10, 0x87E);
6181 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6182 if (INSTR (22, 22) == 0)
6183 for (i
= 0; i
< (full
? 4 : 2); i
++)
6184 aarch64_set_vec_float (cpu
, vd
, i
,
6185 sqrtf (aarch64_get_vec_float (cpu
, vs
, i
)));
6187 for (i
= 0; i
< 2; i
++)
6188 aarch64_set_vec_double (cpu
, vd
, i
,
6189 sqrt (aarch64_get_vec_double (cpu
, vs
, i
)));
6193 do_vec_mls_indexed (sim_cpu
*cpu
)
6196 instr[30] = half(0)/full(1)
6197 instr[29,24] = 10 1111
6198 instr[23,22] = 16-bit(01)/32-bit(10)
6199 instr[21,20+11] = index (if 16-bit)
6200 instr[21+11] = index (if 32-bit)
6203 instr[11] = part of index
6208 int full
= INSTR (30, 30);
6209 unsigned vs
= INSTR (9, 5);
6210 unsigned vd
= INSTR (4, 0);
6211 unsigned vm
= INSTR (20, 16);
6214 NYI_assert (15, 12, 4);
6215 NYI_assert (10, 10, 0);
6217 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6218 switch (INSTR (23, 22))
6228 elem
= (INSTR (21, 20) << 1) | INSTR (11, 11);
6229 val
= aarch64_get_vec_u16 (cpu
, vm
, elem
);
6231 for (i
= 0; i
< (full
? 8 : 4); i
++)
6232 aarch64_set_vec_u32 (cpu
, vd
, i
,
6233 aarch64_get_vec_u32 (cpu
, vd
, i
) -
6234 (aarch64_get_vec_u32 (cpu
, vs
, i
) * val
));
6240 unsigned elem
= (INSTR (21, 21) << 1) | INSTR (11, 11);
6241 uint64_t val
= aarch64_get_vec_u32 (cpu
, vm
, elem
);
6243 for (i
= 0; i
< (full
? 4 : 2); i
++)
6244 aarch64_set_vec_u64 (cpu
, vd
, i
,
6245 aarch64_get_vec_u64 (cpu
, vd
, i
) -
6246 (aarch64_get_vec_u64 (cpu
, vs
, i
) * val
));
6258 do_vec_SUB (sim_cpu
*cpu
)
6261 instr [30] = half(0)/full(1)
6262 instr [29,24] = 10 1110
6263 instr [23,22] = size: byte(00, half(01), word (10), long (11)
6266 instr [15,10] = 10 0001
6268 instr [4, 0] = Vd. */
6270 unsigned full
= INSTR (30, 30);
6271 unsigned vm
= INSTR (20, 16);
6272 unsigned vn
= INSTR (9, 5);
6273 unsigned vd
= INSTR (4, 0);
6276 NYI_assert (29, 24, 0x2E);
6277 NYI_assert (21, 21, 1);
6278 NYI_assert (15, 10, 0x21);
6280 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6281 switch (INSTR (23, 22))
6284 for (i
= 0; i
< (full
? 16 : 8); i
++)
6285 aarch64_set_vec_s8 (cpu
, vd
, i
,
6286 aarch64_get_vec_s8 (cpu
, vn
, i
)
6287 - aarch64_get_vec_s8 (cpu
, vm
, i
));
6291 for (i
= 0; i
< (full
? 8 : 4); i
++)
6292 aarch64_set_vec_s16 (cpu
, vd
, i
,
6293 aarch64_get_vec_s16 (cpu
, vn
, i
)
6294 - aarch64_get_vec_s16 (cpu
, vm
, i
));
6298 for (i
= 0; i
< (full
? 4 : 2); i
++)
6299 aarch64_set_vec_s32 (cpu
, vd
, i
,
6300 aarch64_get_vec_s32 (cpu
, vn
, i
)
6301 - aarch64_get_vec_s32 (cpu
, vm
, i
));
6308 for (i
= 0; i
< 2; i
++)
6309 aarch64_set_vec_s64 (cpu
, vd
, i
,
6310 aarch64_get_vec_s64 (cpu
, vn
, i
)
6311 - aarch64_get_vec_s64 (cpu
, vm
, i
));
6317 do_vec_MLS (sim_cpu
*cpu
)
6320 instr [30] = half(0)/full(1)
6321 instr [29,24] = 10 1110
6322 instr [23,22] = size: byte(00, half(01), word (10)
6325 instr [15,10] = 10 0101
6327 instr [4, 0] = Vd. */
6329 unsigned full
= INSTR (30, 30);
6330 unsigned vm
= INSTR (20, 16);
6331 unsigned vn
= INSTR (9, 5);
6332 unsigned vd
= INSTR (4, 0);
6335 NYI_assert (29, 24, 0x2E);
6336 NYI_assert (21, 21, 1);
6337 NYI_assert (15, 10, 0x25);
6339 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6340 switch (INSTR (23, 22))
6343 for (i
= 0; i
< (full
? 16 : 8); i
++)
6344 aarch64_set_vec_u8 (cpu
, vd
, i
,
6345 aarch64_get_vec_u8 (cpu
, vd
, i
)
6346 - (aarch64_get_vec_u8 (cpu
, vn
, i
)
6347 * aarch64_get_vec_u8 (cpu
, vm
, i
)));
6351 for (i
= 0; i
< (full
? 8 : 4); i
++)
6352 aarch64_set_vec_u16 (cpu
, vd
, i
,
6353 aarch64_get_vec_u16 (cpu
, vd
, i
)
6354 - (aarch64_get_vec_u16 (cpu
, vn
, i
)
6355 * aarch64_get_vec_u16 (cpu
, vm
, i
)));
6359 for (i
= 0; i
< (full
? 4 : 2); i
++)
6360 aarch64_set_vec_u32 (cpu
, vd
, i
,
6361 aarch64_get_vec_u32 (cpu
, vd
, i
)
6362 - (aarch64_get_vec_u32 (cpu
, vn
, i
)
6363 * aarch64_get_vec_u32 (cpu
, vm
, i
)));
6372 do_vec_FDIV (sim_cpu
*cpu
)
6375 instr [30] = half(0)/full(1)
6376 instr [29,23] = 10 1110 0
6377 instr [22] = float()/double(1)
6380 instr [15,10] = 1111 11
6382 instr [4, 0] = Vd. */
6384 unsigned full
= INSTR (30, 30);
6385 unsigned vm
= INSTR (20, 16);
6386 unsigned vn
= INSTR (9, 5);
6387 unsigned vd
= INSTR (4, 0);
6390 NYI_assert (29, 23, 0x5C);
6391 NYI_assert (21, 21, 1);
6392 NYI_assert (15, 10, 0x3F);
6394 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6400 for (i
= 0; i
< 2; i
++)
6401 aarch64_set_vec_double (cpu
, vd
, i
,
6402 aarch64_get_vec_double (cpu
, vn
, i
)
6403 / aarch64_get_vec_double (cpu
, vm
, i
));
6406 for (i
= 0; i
< (full
? 4 : 2); i
++)
6407 aarch64_set_vec_float (cpu
, vd
, i
,
6408 aarch64_get_vec_float (cpu
, vn
, i
)
6409 / aarch64_get_vec_float (cpu
, vm
, i
));
6413 do_vec_FMUL (sim_cpu
*cpu
)
6416 instr [30] = half(0)/full(1)
6417 instr [29,23] = 10 1110 0
6418 instr [22] = float(0)/double(1)
6421 instr [15,10] = 1101 11
6423 instr [4, 0] = Vd. */
6425 unsigned full
= INSTR (30, 30);
6426 unsigned vm
= INSTR (20, 16);
6427 unsigned vn
= INSTR (9, 5);
6428 unsigned vd
= INSTR (4, 0);
6431 NYI_assert (29, 23, 0x5C);
6432 NYI_assert (21, 21, 1);
6433 NYI_assert (15, 10, 0x37);
6435 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6441 for (i
= 0; i
< 2; i
++)
6442 aarch64_set_vec_double (cpu
, vd
, i
,
6443 aarch64_get_vec_double (cpu
, vn
, i
)
6444 * aarch64_get_vec_double (cpu
, vm
, i
));
6447 for (i
= 0; i
< (full
? 4 : 2); i
++)
6448 aarch64_set_vec_float (cpu
, vd
, i
,
6449 aarch64_get_vec_float (cpu
, vn
, i
)
6450 * aarch64_get_vec_float (cpu
, vm
, i
));
6454 do_vec_FADDP (sim_cpu
*cpu
)
6457 instr [30] = half(0)/full(1)
6458 instr [29,23] = 10 1110 0
6459 instr [22] = float(0)/double(1)
6462 instr [15,10] = 1101 01
6464 instr [4, 0] = Vd. */
6466 unsigned full
= INSTR (30, 30);
6467 unsigned vm
= INSTR (20, 16);
6468 unsigned vn
= INSTR (9, 5);
6469 unsigned vd
= INSTR (4, 0);
6471 NYI_assert (29, 23, 0x5C);
6472 NYI_assert (21, 21, 1);
6473 NYI_assert (15, 10, 0x35);
6475 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6478 /* Extract values before adding them incase vd == vn/vm. */
6479 double tmp1
= aarch64_get_vec_double (cpu
, vn
, 0);
6480 double tmp2
= aarch64_get_vec_double (cpu
, vn
, 1);
6481 double tmp3
= aarch64_get_vec_double (cpu
, vm
, 0);
6482 double tmp4
= aarch64_get_vec_double (cpu
, vm
, 1);
6487 aarch64_set_vec_double (cpu
, vd
, 0, tmp1
+ tmp2
);
6488 aarch64_set_vec_double (cpu
, vd
, 1, tmp3
+ tmp4
);
6492 /* Extract values before adding them incase vd == vn/vm. */
6493 float tmp1
= aarch64_get_vec_float (cpu
, vn
, 0);
6494 float tmp2
= aarch64_get_vec_float (cpu
, vn
, 1);
6495 float tmp5
= aarch64_get_vec_float (cpu
, vm
, 0);
6496 float tmp6
= aarch64_get_vec_float (cpu
, vm
, 1);
6500 float tmp3
= aarch64_get_vec_float (cpu
, vn
, 2);
6501 float tmp4
= aarch64_get_vec_float (cpu
, vn
, 3);
6502 float tmp7
= aarch64_get_vec_float (cpu
, vm
, 2);
6503 float tmp8
= aarch64_get_vec_float (cpu
, vm
, 3);
6505 aarch64_set_vec_float (cpu
, vd
, 0, tmp1
+ tmp2
);
6506 aarch64_set_vec_float (cpu
, vd
, 1, tmp3
+ tmp4
);
6507 aarch64_set_vec_float (cpu
, vd
, 2, tmp5
+ tmp6
);
6508 aarch64_set_vec_float (cpu
, vd
, 3, tmp7
+ tmp8
);
6512 aarch64_set_vec_float (cpu
, vd
, 0, tmp1
+ tmp2
);
6513 aarch64_set_vec_float (cpu
, vd
, 1, tmp5
+ tmp6
);
6519 do_vec_FSQRT (sim_cpu
*cpu
)
6522 instr[30] = half(0)/full(1)
6523 instr[29,23] = 10 1110 1
6524 instr[22] = single(0)/double(1)
6525 instr[21,10] = 10 0001 1111 10
6527 instr[4,0] = Vdest. */
6529 unsigned vn
= INSTR (9, 5);
6530 unsigned vd
= INSTR (4, 0);
6531 unsigned full
= INSTR (30, 30);
6534 NYI_assert (29, 23, 0x5D);
6535 NYI_assert (21, 10, 0x87E);
6537 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6543 for (i
= 0; i
< 2; i
++)
6544 aarch64_set_vec_double (cpu
, vd
, i
,
6545 sqrt (aarch64_get_vec_double (cpu
, vn
, i
)));
6549 for (i
= 0; i
< (full
? 4 : 2); i
++)
6550 aarch64_set_vec_float (cpu
, vd
, i
,
6551 sqrtf (aarch64_get_vec_float (cpu
, vn
, i
)));
6556 do_vec_FNEG (sim_cpu
*cpu
)
6559 instr[30] = half (0)/full (1)
6560 instr[29,23] = 10 1110 1
6561 instr[22] = single (0)/double (1)
6562 instr[21,10] = 10 0000 1111 10
6564 instr[4,0] = Vdest. */
6566 unsigned vn
= INSTR (9, 5);
6567 unsigned vd
= INSTR (4, 0);
6568 unsigned full
= INSTR (30, 30);
6571 NYI_assert (29, 23, 0x5D);
6572 NYI_assert (21, 10, 0x83E);
6574 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6580 for (i
= 0; i
< 2; i
++)
6581 aarch64_set_vec_double (cpu
, vd
, i
,
6582 - aarch64_get_vec_double (cpu
, vn
, i
));
6586 for (i
= 0; i
< (full
? 4 : 2); i
++)
6587 aarch64_set_vec_float (cpu
, vd
, i
,
6588 - aarch64_get_vec_float (cpu
, vn
, i
));
6593 do_vec_NOT (sim_cpu
*cpu
)
6596 instr[30] = half (0)/full (1)
6597 instr[29,10] = 10 1110 0010 0000 0101 10
6601 unsigned vn
= INSTR (9, 5);
6602 unsigned vd
= INSTR (4, 0);
6604 int full
= INSTR (30, 30);
6606 NYI_assert (29, 10, 0xB8816);
6608 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6609 for (i
= 0; i
< (full
? 16 : 8); i
++)
6610 aarch64_set_vec_u8 (cpu
, vd
, i
, ~ aarch64_get_vec_u8 (cpu
, vn
, i
));
/* Count the leading zero bits of VAL, treating it as a SIZE-bit value.
   Returns SIZE when VAL is zero.  */
static unsigned int
clz (uint64_t val, unsigned size)
{
  uint64_t mask = 1;
  unsigned result = 0;

  /* Start with a mask for the most significant bit of a SIZE-bit value.  */
  mask <<= (size - 1);

  while (result < size && (val & mask) == 0)
    {
      mask >>= 1;
      result++;
    }

  return result;
}
6634 do_vec_CLZ (sim_cpu
*cpu
)
6637 instr[30] = half (0)/full (1)
6638 instr[29,24] = 10 1110
6640 instr[21,10] = 10 0000 0100 10
6644 unsigned vn
= INSTR (9, 5);
6645 unsigned vd
= INSTR (4, 0);
6647 int full
= INSTR (30,30);
6649 NYI_assert (29, 24, 0x2E);
6650 NYI_assert (21, 10, 0x812);
6652 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6653 switch (INSTR (23, 22))
6656 for (i
= 0; i
< (full
? 16 : 8); i
++)
6657 aarch64_set_vec_u8 (cpu
, vd
, i
, clz (aarch64_get_vec_u8 (cpu
, vn
, i
), 8));
6660 for (i
= 0; i
< (full
? 8 : 4); i
++)
6661 aarch64_set_vec_u16 (cpu
, vd
, i
, clz (aarch64_get_vec_u16 (cpu
, vn
, i
), 16));
6664 for (i
= 0; i
< (full
? 4 : 2); i
++)
6665 aarch64_set_vec_u32 (cpu
, vd
, i
, clz (aarch64_get_vec_u32 (cpu
, vn
, i
), 32));
6670 aarch64_set_vec_u64 (cpu
, vd
, 0, clz (aarch64_get_vec_u64 (cpu
, vn
, 0), 64));
6671 aarch64_set_vec_u64 (cpu
, vd
, 1, clz (aarch64_get_vec_u64 (cpu
, vn
, 1), 64));
6677 do_vec_MOV_element (sim_cpu
*cpu
)
6679 /* instr[31,21] = 0110 1110 000
6680 instr[20,16] = size & dest index
6682 instr[14,11] = source index
6687 unsigned vs
= INSTR (9, 5);
6688 unsigned vd
= INSTR (4, 0);
6692 NYI_assert (31, 21, 0x370);
6693 NYI_assert (15, 15, 0);
6694 NYI_assert (10, 10, 1);
6696 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6700 src_index
= INSTR (14, 11);
6701 dst_index
= INSTR (20, 17);
6702 aarch64_set_vec_u8 (cpu
, vd
, dst_index
,
6703 aarch64_get_vec_u8 (cpu
, vs
, src_index
));
6705 else if (INSTR (17, 17))
6708 NYI_assert (11, 11, 0);
6709 src_index
= INSTR (14, 12);
6710 dst_index
= INSTR (20, 18);
6711 aarch64_set_vec_u16 (cpu
, vd
, dst_index
,
6712 aarch64_get_vec_u16 (cpu
, vs
, src_index
));
6714 else if (INSTR (18, 18))
6717 NYI_assert (12, 11, 0);
6718 src_index
= INSTR (14, 13);
6719 dst_index
= INSTR (20, 19);
6720 aarch64_set_vec_u32 (cpu
, vd
, dst_index
,
6721 aarch64_get_vec_u32 (cpu
, vs
, src_index
));
6725 NYI_assert (19, 19, 1);
6726 NYI_assert (13, 11, 0);
6727 src_index
= INSTR (14, 14);
6728 dst_index
= INSTR (20, 20);
6729 aarch64_set_vec_u64 (cpu
, vd
, dst_index
,
6730 aarch64_get_vec_u64 (cpu
, vs
, src_index
));
6735 do_vec_REV32 (sim_cpu
*cpu
)
6738 instr[30] = full/half
6739 instr[29,24] = 10 1110
6741 instr[21,10] = 10 0000 0000 10
6745 unsigned rn
= INSTR (9, 5);
6746 unsigned rd
= INSTR (4, 0);
6747 unsigned size
= INSTR (23, 22);
6748 unsigned full
= INSTR (30, 30);
6752 NYI_assert (29, 24, 0x2E);
6753 NYI_assert (21, 10, 0x802);
6755 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6759 for (i
= 0; i
< (full
? 16 : 8); i
++)
6760 val
.b
[i
^ 0x3] = aarch64_get_vec_u8 (cpu
, rn
, i
);
6764 for (i
= 0; i
< (full
? 8 : 4); i
++)
6765 val
.h
[i
^ 0x1] = aarch64_get_vec_u16 (cpu
, rn
, i
);
6772 aarch64_set_vec_u64 (cpu
, rd
, 0, val
.v
[0]);
6774 aarch64_set_vec_u64 (cpu
, rd
, 1, val
.v
[1]);
6778 do_vec_EXT (sim_cpu
*cpu
)
6781 instr[30] = full/half
6782 instr[29,21] = 10 1110 000
6785 instr[14,11] = source index
6790 unsigned vm
= INSTR (20, 16);
6791 unsigned vn
= INSTR (9, 5);
6792 unsigned vd
= INSTR (4, 0);
6793 unsigned src_index
= INSTR (14, 11);
6794 unsigned full
= INSTR (30, 30);
6799 NYI_assert (31, 21, 0x370);
6800 NYI_assert (15, 15, 0);
6801 NYI_assert (10, 10, 0);
6803 if (!full
&& (src_index
& 0x8))
6808 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6809 for (i
= src_index
; i
< (full
? 16 : 8); i
++)
6810 val
.b
[j
++] = aarch64_get_vec_u8 (cpu
, vn
, i
);
6811 for (i
= 0; i
< src_index
; i
++)
6812 val
.b
[j
++] = aarch64_get_vec_u8 (cpu
, vm
, i
);
6814 aarch64_set_vec_u64 (cpu
, vd
, 0, val
.v
[0]);
6816 aarch64_set_vec_u64 (cpu
, vd
, 1, val
.v
[1]);
6820 dexAdvSIMD0 (sim_cpu
*cpu
)
6822 /* instr [28,25] = 0 111. */
6823 if ( INSTR (15, 10) == 0x07
6827 if (INSTR (31, 21) == 0x075
6828 || INSTR (31, 21) == 0x275)
6830 do_vec_MOV_whole_vector (cpu
);
6835 if (INSTR (29, 19) == 0x1E0)
6837 do_vec_MOV_immediate (cpu
);
6841 if (INSTR (29, 19) == 0x5E0)
6847 if (INSTR (29, 19) == 0x1C0
6848 || INSTR (29, 19) == 0x1C1)
6850 if (INSTR (15, 10) == 0x03)
6852 do_vec_DUP_scalar_into_vector (cpu
);
6857 switch (INSTR (29, 24))
6859 case 0x0E: do_vec_op1 (cpu
); return;
6860 case 0x0F: do_vec_op2 (cpu
); return;
6863 if (INSTR (21, 21) == 1)
6865 switch (INSTR (15, 10))
6872 switch (INSTR (23, 22))
6874 case 0: do_vec_EOR (cpu
); return;
6875 case 1: do_vec_BSL (cpu
); return;
6877 case 3: do_vec_bit (cpu
); return;
6881 case 0x08: do_vec_sub_long (cpu
); return;
6882 case 0x11: do_vec_USHL (cpu
); return;
6883 case 0x12: do_vec_CLZ (cpu
); return;
6884 case 0x16: do_vec_NOT (cpu
); return;
6885 case 0x19: do_vec_max (cpu
); return;
6886 case 0x1B: do_vec_min (cpu
); return;
6887 case 0x21: do_vec_SUB (cpu
); return;
6888 case 0x25: do_vec_MLS (cpu
); return;
6889 case 0x31: do_vec_FminmaxNMP (cpu
); return;
6890 case 0x35: do_vec_FADDP (cpu
); return;
6891 case 0x37: do_vec_FMUL (cpu
); return;
6892 case 0x3F: do_vec_FDIV (cpu
); return;
6895 switch (INSTR (20, 16))
6897 case 0x00: do_vec_FNEG (cpu
); return;
6898 case 0x01: do_vec_FSQRT (cpu
); return;
6912 do_vec_compare (cpu
); return;
6919 if (INSTR (31, 21) == 0x370)
6922 do_vec_MOV_element (cpu
);
6928 switch (INSTR (21, 10))
6930 case 0x82E: do_vec_neg (cpu
); return;
6931 case 0x87E: do_vec_sqrt (cpu
); return;
6933 if (INSTR (15, 10) == 0x30)
6943 switch (INSTR (15, 10))
6945 case 0x01: do_vec_SSHR_USHR (cpu
); return;
6947 case 0x12: do_vec_mls_indexed (cpu
); return;
6948 case 0x29: do_vec_xtl (cpu
); return;
6962 /* Float multiply add. */
6964 fmadds (sim_cpu
*cpu
)
6966 unsigned sa
= INSTR (14, 10);
6967 unsigned sm
= INSTR (20, 16);
6968 unsigned sn
= INSTR ( 9, 5);
6969 unsigned sd
= INSTR ( 4, 0);
6971 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6972 aarch64_set_FP_float (cpu
, sd
, aarch64_get_FP_float (cpu
, sa
)
6973 + aarch64_get_FP_float (cpu
, sn
)
6974 * aarch64_get_FP_float (cpu
, sm
));
6977 /* Double multiply add. */
6979 fmaddd (sim_cpu
*cpu
)
6981 unsigned sa
= INSTR (14, 10);
6982 unsigned sm
= INSTR (20, 16);
6983 unsigned sn
= INSTR ( 9, 5);
6984 unsigned sd
= INSTR ( 4, 0);
6986 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
6987 aarch64_set_FP_double (cpu
, sd
, aarch64_get_FP_double (cpu
, sa
)
6988 + aarch64_get_FP_double (cpu
, sn
)
6989 * aarch64_get_FP_double (cpu
, sm
));
6992 /* Float multiply subtract. */
6994 fmsubs (sim_cpu
*cpu
)
6996 unsigned sa
= INSTR (14, 10);
6997 unsigned sm
= INSTR (20, 16);
6998 unsigned sn
= INSTR ( 9, 5);
6999 unsigned sd
= INSTR ( 4, 0);
7001 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7002 aarch64_set_FP_float (cpu
, sd
, aarch64_get_FP_float (cpu
, sa
)
7003 - aarch64_get_FP_float (cpu
, sn
)
7004 * aarch64_get_FP_float (cpu
, sm
));
7007 /* Double multiply subtract. */
7009 fmsubd (sim_cpu
*cpu
)
7011 unsigned sa
= INSTR (14, 10);
7012 unsigned sm
= INSTR (20, 16);
7013 unsigned sn
= INSTR ( 9, 5);
7014 unsigned sd
= INSTR ( 4, 0);
7016 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7017 aarch64_set_FP_double (cpu
, sd
, aarch64_get_FP_double (cpu
, sa
)
7018 - aarch64_get_FP_double (cpu
, sn
)
7019 * aarch64_get_FP_double (cpu
, sm
));
7022 /* Float negative multiply add. */
7024 fnmadds (sim_cpu
*cpu
)
7026 unsigned sa
= INSTR (14, 10);
7027 unsigned sm
= INSTR (20, 16);
7028 unsigned sn
= INSTR ( 9, 5);
7029 unsigned sd
= INSTR ( 4, 0);
7031 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7032 aarch64_set_FP_float (cpu
, sd
, - aarch64_get_FP_float (cpu
, sa
)
7033 + (- aarch64_get_FP_float (cpu
, sn
))
7034 * aarch64_get_FP_float (cpu
, sm
));
7037 /* Double negative multiply add. */
7039 fnmaddd (sim_cpu
*cpu
)
7041 unsigned sa
= INSTR (14, 10);
7042 unsigned sm
= INSTR (20, 16);
7043 unsigned sn
= INSTR ( 9, 5);
7044 unsigned sd
= INSTR ( 4, 0);
7046 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7047 aarch64_set_FP_double (cpu
, sd
, - aarch64_get_FP_double (cpu
, sa
)
7048 + (- aarch64_get_FP_double (cpu
, sn
))
7049 * aarch64_get_FP_double (cpu
, sm
));
7052 /* Float negative multiply subtract. */
7054 fnmsubs (sim_cpu
*cpu
)
7056 unsigned sa
= INSTR (14, 10);
7057 unsigned sm
= INSTR (20, 16);
7058 unsigned sn
= INSTR ( 9, 5);
7059 unsigned sd
= INSTR ( 4, 0);
7061 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7062 aarch64_set_FP_float (cpu
, sd
, - aarch64_get_FP_float (cpu
, sa
)
7063 + aarch64_get_FP_float (cpu
, sn
)
7064 * aarch64_get_FP_float (cpu
, sm
));
7067 /* Double negative multiply subtract. */
7069 fnmsubd (sim_cpu
*cpu
)
7071 unsigned sa
= INSTR (14, 10);
7072 unsigned sm
= INSTR (20, 16);
7073 unsigned sn
= INSTR ( 9, 5);
7074 unsigned sd
= INSTR ( 4, 0);
7076 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7077 aarch64_set_FP_double (cpu
, sd
, - aarch64_get_FP_double (cpu
, sa
)
7078 + aarch64_get_FP_double (cpu
, sn
)
7079 * aarch64_get_FP_double (cpu
, sm
));
7083 dexSimpleFPDataProc3Source (sim_cpu
*cpu
)
7085 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7087 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7090 instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
7091 instr[21] ==> o1 : 0 ==> unnegated, 1 ==> negated
7092 instr[15] ==> o2 : 0 ==> ADD, 1 ==> SUB */
7094 uint32_t M_S
= (INSTR (31, 31) << 1) | INSTR (29, 29);
7095 /* dispatch on combined type:o1:o2. */
7096 uint32_t dispatch
= (INSTR (23, 21) << 1) | INSTR (15, 15);
7103 case 0: fmadds (cpu
); return;
7104 case 1: fmsubs (cpu
); return;
7105 case 2: fnmadds (cpu
); return;
7106 case 3: fnmsubs (cpu
); return;
7107 case 4: fmaddd (cpu
); return;
7108 case 5: fmsubd (cpu
); return;
7109 case 6: fnmaddd (cpu
); return;
7110 case 7: fnmsubd (cpu
); return;
7112 /* type > 1 is currently unallocated. */
7118 dexSimpleFPFixedConvert (sim_cpu
*cpu
)
7124 dexSimpleFPCondCompare (sim_cpu
*cpu
)
7126 /* instr [31,23] = 0001 1110 0
7130 instr [15,12] = condition
7134 instr [3,0] = nzcv */
7136 unsigned rm
= INSTR (20, 16);
7137 unsigned rn
= INSTR (9, 5);
7139 NYI_assert (31, 23, 0x3C);
7140 NYI_assert (11, 10, 0x1);
7141 NYI_assert (4, 4, 0);
7143 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7144 if (! testConditionCode (cpu
, INSTR (15, 12)))
7146 aarch64_set_CPSR (cpu
, INSTR (3, 0));
7152 /* Double precision. */
7153 double val1
= aarch64_get_vec_double (cpu
, rn
, 0);
7154 double val2
= aarch64_get_vec_double (cpu
, rm
, 0);
7156 /* FIXME: Check for NaNs. */
7158 aarch64_set_CPSR (cpu
, (Z
| C
));
7159 else if (val1
< val2
)
7160 aarch64_set_CPSR (cpu
, N
);
7161 else /* val1 > val2 */
7162 aarch64_set_CPSR (cpu
, C
);
7166 /* Single precision. */
7167 float val1
= aarch64_get_vec_float (cpu
, rn
, 0);
7168 float val2
= aarch64_get_vec_float (cpu
, rm
, 0);
7170 /* FIXME: Check for NaNs. */
7172 aarch64_set_CPSR (cpu
, (Z
| C
));
7173 else if (val1
< val2
)
7174 aarch64_set_CPSR (cpu
, N
);
7175 else /* val1 > val2 */
7176 aarch64_set_CPSR (cpu
, C
);
7184 fadds (sim_cpu
*cpu
)
7186 unsigned sm
= INSTR (20, 16);
7187 unsigned sn
= INSTR ( 9, 5);
7188 unsigned sd
= INSTR ( 4, 0);
7190 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7191 aarch64_set_FP_float (cpu
, sd
, aarch64_get_FP_float (cpu
, sn
)
7192 + aarch64_get_FP_float (cpu
, sm
));
7197 faddd (sim_cpu
*cpu
)
7199 unsigned sm
= INSTR (20, 16);
7200 unsigned sn
= INSTR ( 9, 5);
7201 unsigned sd
= INSTR ( 4, 0);
7203 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7204 aarch64_set_FP_double (cpu
, sd
, aarch64_get_FP_double (cpu
, sn
)
7205 + aarch64_get_FP_double (cpu
, sm
));
7210 fdivs (sim_cpu
*cpu
)
7212 unsigned sm
= INSTR (20, 16);
7213 unsigned sn
= INSTR ( 9, 5);
7214 unsigned sd
= INSTR ( 4, 0);
7216 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7217 aarch64_set_FP_float (cpu
, sd
, aarch64_get_FP_float (cpu
, sn
)
7218 / aarch64_get_FP_float (cpu
, sm
));
7221 /* Double divide. */
7223 fdivd (sim_cpu
*cpu
)
7225 unsigned sm
= INSTR (20, 16);
7226 unsigned sn
= INSTR ( 9, 5);
7227 unsigned sd
= INSTR ( 4, 0);
7229 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7230 aarch64_set_FP_double (cpu
, sd
, aarch64_get_FP_double (cpu
, sn
)
7231 / aarch64_get_FP_double (cpu
, sm
));
7234 /* Float multiply. */
7236 fmuls (sim_cpu
*cpu
)
7238 unsigned sm
= INSTR (20, 16);
7239 unsigned sn
= INSTR ( 9, 5);
7240 unsigned sd
= INSTR ( 4, 0);
7242 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7243 aarch64_set_FP_float (cpu
, sd
, aarch64_get_FP_float (cpu
, sn
)
7244 * aarch64_get_FP_float (cpu
, sm
));
7247 /* Double multiply. */
7249 fmuld (sim_cpu
*cpu
)
7251 unsigned sm
= INSTR (20, 16);
7252 unsigned sn
= INSTR ( 9, 5);
7253 unsigned sd
= INSTR ( 4, 0);
7255 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7256 aarch64_set_FP_double (cpu
, sd
, aarch64_get_FP_double (cpu
, sn
)
7257 * aarch64_get_FP_double (cpu
, sm
));
7260 /* Float negate and multiply. */
7262 fnmuls (sim_cpu
*cpu
)
7264 unsigned sm
= INSTR (20, 16);
7265 unsigned sn
= INSTR ( 9, 5);
7266 unsigned sd
= INSTR ( 4, 0);
7268 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7269 aarch64_set_FP_float (cpu
, sd
, - (aarch64_get_FP_float (cpu
, sn
)
7270 * aarch64_get_FP_float (cpu
, sm
)));
7273 /* Double negate and multiply. */
7275 fnmuld (sim_cpu
*cpu
)
7277 unsigned sm
= INSTR (20, 16);
7278 unsigned sn
= INSTR ( 9, 5);
7279 unsigned sd
= INSTR ( 4, 0);
7281 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7282 aarch64_set_FP_double (cpu
, sd
, - (aarch64_get_FP_double (cpu
, sn
)
7283 * aarch64_get_FP_double (cpu
, sm
)));
7286 /* Float subtract. */
7288 fsubs (sim_cpu
*cpu
)
7290 unsigned sm
= INSTR (20, 16);
7291 unsigned sn
= INSTR ( 9, 5);
7292 unsigned sd
= INSTR ( 4, 0);
7294 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7295 aarch64_set_FP_float (cpu
, sd
, aarch64_get_FP_float (cpu
, sn
)
7296 - aarch64_get_FP_float (cpu
, sm
));
7299 /* Double subtract. */
7301 fsubd (sim_cpu
*cpu
)
7303 unsigned sm
= INSTR (20, 16);
7304 unsigned sn
= INSTR ( 9, 5);
7305 unsigned sd
= INSTR ( 4, 0);
7307 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7308 aarch64_set_FP_double (cpu
, sd
, aarch64_get_FP_double (cpu
, sn
)
7309 - aarch64_get_FP_double (cpu
, sm
));
7313 do_FMINNM (sim_cpu
*cpu
)
7315 /* instr[31,23] = 0 0011 1100
7316 instr[22] = float(0)/double(1)
7319 instr[15,10] = 01 1110
7323 unsigned sm
= INSTR (20, 16);
7324 unsigned sn
= INSTR ( 9, 5);
7325 unsigned sd
= INSTR ( 4, 0);
7327 NYI_assert (31, 23, 0x03C);
7328 NYI_assert (15, 10, 0x1E);
7330 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7332 aarch64_set_FP_double (cpu
, sd
,
7333 dminnm (aarch64_get_FP_double (cpu
, sn
),
7334 aarch64_get_FP_double (cpu
, sm
)));
7336 aarch64_set_FP_float (cpu
, sd
,
7337 fminnm (aarch64_get_FP_float (cpu
, sn
),
7338 aarch64_get_FP_float (cpu
, sm
)));
7342 do_FMAXNM (sim_cpu
*cpu
)
7344 /* instr[31,23] = 0 0011 1100
7345 instr[22] = float(0)/double(1)
7348 instr[15,10] = 01 1010
7352 unsigned sm
= INSTR (20, 16);
7353 unsigned sn
= INSTR ( 9, 5);
7354 unsigned sd
= INSTR ( 4, 0);
7356 NYI_assert (31, 23, 0x03C);
7357 NYI_assert (15, 10, 0x1A);
7359 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7361 aarch64_set_FP_double (cpu
, sd
,
7362 dmaxnm (aarch64_get_FP_double (cpu
, sn
),
7363 aarch64_get_FP_double (cpu
, sm
)));
7365 aarch64_set_FP_float (cpu
, sd
,
7366 fmaxnm (aarch64_get_FP_float (cpu
, sn
),
7367 aarch64_get_FP_float (cpu
, sm
)));
7371 dexSimpleFPDataProc2Source (sim_cpu
*cpu
)
7373 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7375 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7378 instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
7381 instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7382 0010 ==> FADD, 0011 ==> FSUB,
7383 0100 ==> FMAX, 0101 ==> FMIN
7384 0110 ==> FMAXNM, 0111 ==> FMINNM
7385 1000 ==> FNMUL, ow ==> UNALLOC
7390 uint32_t M_S
= (INSTR (31, 31) << 1) | INSTR (29, 29);
7391 uint32_t type
= INSTR (23, 22);
7392 /* Dispatch on opcode. */
7393 uint32_t dispatch
= INSTR (15, 12);
7404 case 0: fmuld (cpu
); return;
7405 case 1: fdivd (cpu
); return;
7406 case 2: faddd (cpu
); return;
7407 case 3: fsubd (cpu
); return;
7408 case 6: do_FMAXNM (cpu
); return;
7409 case 7: do_FMINNM (cpu
); return;
7410 case 8: fnmuld (cpu
); return;
7412 /* Have not yet implemented fmax and fmin. */
7420 else /* type == 0 => floats. */
7423 case 0: fmuls (cpu
); return;
7424 case 1: fdivs (cpu
); return;
7425 case 2: fadds (cpu
); return;
7426 case 3: fsubs (cpu
); return;
7427 case 6: do_FMAXNM (cpu
); return;
7428 case 7: do_FMINNM (cpu
); return;
7429 case 8: fnmuls (cpu
); return;
7441 dexSimpleFPCondSelect (sim_cpu
*cpu
)
7444 instr[31,23] = 0 0011 1100
7445 instr[22] = 0=>single 1=>double
7452 unsigned sm
= INSTR (20, 16);
7453 unsigned sn
= INSTR ( 9, 5);
7454 unsigned sd
= INSTR ( 4, 0);
7455 uint32_t set
= testConditionCode (cpu
, INSTR (15, 12));
7457 NYI_assert (31, 23, 0x03C);
7458 NYI_assert (11, 10, 0x3);
7460 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7462 aarch64_set_FP_double (cpu
, sd
, (set
? aarch64_get_FP_double (cpu
, sn
)
7463 : aarch64_get_FP_double (cpu
, sm
)));
7465 aarch64_set_FP_float (cpu
, sd
, (set
? aarch64_get_FP_float (cpu
, sn
)
7466 : aarch64_get_FP_float (cpu
, sm
)));
7469 /* Store 32 bit unscaled signed 9 bit. */
7471 fsturs (sim_cpu
*cpu
, int32_t offset
)
7473 unsigned int rn
= INSTR (9, 5);
7474 unsigned int st
= INSTR (4, 0);
7476 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7477 aarch64_set_mem_u32 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, 1) + offset
,
7478 aarch64_get_vec_u32 (cpu
, st
, 0));
7481 /* Store 64 bit unscaled signed 9 bit. */
7483 fsturd (sim_cpu
*cpu
, int32_t offset
)
7485 unsigned int rn
= INSTR (9, 5);
7486 unsigned int st
= INSTR (4, 0);
7488 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7489 aarch64_set_mem_u64 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, 1) + offset
,
7490 aarch64_get_vec_u64 (cpu
, st
, 0));
7493 /* Store 128 bit unscaled signed 9 bit. */
7495 fsturq (sim_cpu
*cpu
, int32_t offset
)
7497 unsigned int rn
= INSTR (9, 5);
7498 unsigned int st
= INSTR (4, 0);
7501 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7502 aarch64_get_FP_long_double (cpu
, st
, & a
);
7503 aarch64_set_mem_long_double (cpu
,
7504 aarch64_get_reg_u64 (cpu
, rn
, 1)
7508 /* TODO FP move register. */
7510 /* 32 bit fp to fp move register. */
7512 ffmovs (sim_cpu
*cpu
)
7514 unsigned int rn
= INSTR (9, 5);
7515 unsigned int st
= INSTR (4, 0);
7517 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7518 aarch64_set_FP_float (cpu
, st
, aarch64_get_FP_float (cpu
, rn
));
7521 /* 64 bit fp to fp move register. */
7523 ffmovd (sim_cpu
*cpu
)
7525 unsigned int rn
= INSTR (9, 5);
7526 unsigned int st
= INSTR (4, 0);
7528 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7529 aarch64_set_FP_double (cpu
, st
, aarch64_get_FP_double (cpu
, rn
));
7532 /* 32 bit GReg to Vec move register. */
7534 fgmovs (sim_cpu
*cpu
)
7536 unsigned int rn
= INSTR (9, 5);
7537 unsigned int st
= INSTR (4, 0);
7539 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7540 aarch64_set_vec_u32 (cpu
, st
, 0, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
));
7543 /* 64 bit g to fp move register. */
7545 fgmovd (sim_cpu
*cpu
)
7547 unsigned int rn
= INSTR (9, 5);
7548 unsigned int st
= INSTR (4, 0);
7550 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7551 aarch64_set_vec_u64 (cpu
, st
, 0, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
));
7554 /* 32 bit fp to g move register. */
7556 gfmovs (sim_cpu
*cpu
)
7558 unsigned int rn
= INSTR (9, 5);
7559 unsigned int st
= INSTR (4, 0);
7561 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7562 aarch64_set_reg_u64 (cpu
, st
, NO_SP
, aarch64_get_vec_u32 (cpu
, rn
, 0));
7565 /* 64 bit fp to g move register. */
7567 gfmovd (sim_cpu
*cpu
)
7569 unsigned int rn
= INSTR (9, 5);
7570 unsigned int st
= INSTR (4, 0);
7572 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7573 aarch64_set_reg_u64 (cpu
, st
, NO_SP
, aarch64_get_vec_u64 (cpu
, rn
, 0));
7576 /* FP move immediate
7578 These install an immediate 8 bit value in the target register
7579 where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7583 fmovs (sim_cpu
*cpu
)
7585 unsigned int sd
= INSTR (4, 0);
7586 uint32_t imm
= INSTR (20, 13);
7587 float f
= fp_immediate_for_encoding_32 (imm
);
7589 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7590 aarch64_set_FP_float (cpu
, sd
, f
);
7594 fmovd (sim_cpu
*cpu
)
7596 unsigned int sd
= INSTR (4, 0);
7597 uint32_t imm
= INSTR (20, 13);
7598 double d
= fp_immediate_for_encoding_64 (imm
);
7600 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7601 aarch64_set_FP_double (cpu
, sd
, d
);
7605 dexSimpleFPImmediate (sim_cpu
*cpu
)
7607 /* instr[31,23] == 00111100
7608 instr[22] == type : single(0)/double(1)
7610 instr[20,13] == imm8
7612 instr[9,5] == imm5 : 00000 ==> PK, ow ==> UNALLOC
7614 uint32_t imm5
= INSTR (9, 5);
7616 NYI_assert (31, 23, 0x3C);
7627 /* TODO specific decode and execute for group Load Store. */
7629 /* TODO FP load/store single register (unscaled offset). */
7631 /* TODO load 8 bit unscaled signed 9 bit. */
7632 /* TODO load 16 bit unscaled signed 9 bit. */
7634 /* Load 32 bit unscaled signed 9 bit. */
7636 fldurs (sim_cpu
*cpu
, int32_t offset
)
7638 unsigned int rn
= INSTR (9, 5);
7639 unsigned int st
= INSTR (4, 0);
7641 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7642 aarch64_set_vec_u32 (cpu
, st
, 0, aarch64_get_mem_u32
7643 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
));
7646 /* Load 64 bit unscaled signed 9 bit. */
7648 fldurd (sim_cpu
*cpu
, int32_t offset
)
7650 unsigned int rn
= INSTR (9, 5);
7651 unsigned int st
= INSTR (4, 0);
7653 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7654 aarch64_set_vec_u64 (cpu
, st
, 0, aarch64_get_mem_u64
7655 (cpu
, aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
));
7658 /* Load 128 bit unscaled signed 9 bit. */
7660 fldurq (sim_cpu
*cpu
, int32_t offset
)
7662 unsigned int rn
= INSTR (9, 5);
7663 unsigned int st
= INSTR (4, 0);
7665 uint64_t addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
;
7667 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7668 aarch64_get_mem_long_double (cpu
, addr
, & a
);
7669 aarch64_set_FP_long_double (cpu
, st
, a
);
7672 /* TODO store 8 bit unscaled signed 9 bit. */
7673 /* TODO store 16 bit unscaled signed 9 bit. */
7678 /* Float absolute value. */
7680 fabss (sim_cpu
*cpu
)
7682 unsigned sn
= INSTR (9, 5);
7683 unsigned sd
= INSTR (4, 0);
7684 float value
= aarch64_get_FP_float (cpu
, sn
);
7686 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7687 aarch64_set_FP_float (cpu
, sd
, fabsf (value
));
7690 /* Double absolute value. */
7692 fabcpu (sim_cpu
*cpu
)
7694 unsigned sn
= INSTR (9, 5);
7695 unsigned sd
= INSTR (4, 0);
7696 double value
= aarch64_get_FP_double (cpu
, sn
);
7698 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7699 aarch64_set_FP_double (cpu
, sd
, fabs (value
));
7702 /* Float negative value. */
7704 fnegs (sim_cpu
*cpu
)
7706 unsigned sn
= INSTR (9, 5);
7707 unsigned sd
= INSTR (4, 0);
7709 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7710 aarch64_set_FP_float (cpu
, sd
, - aarch64_get_FP_float (cpu
, sn
));
7713 /* Double negative value. */
7715 fnegd (sim_cpu
*cpu
)
7717 unsigned sn
= INSTR (9, 5);
7718 unsigned sd
= INSTR (4, 0);
7720 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7721 aarch64_set_FP_double (cpu
, sd
, - aarch64_get_FP_double (cpu
, sn
));
7724 /* Float square root. */
7726 fsqrts (sim_cpu
*cpu
)
7728 unsigned sn
= INSTR (9, 5);
7729 unsigned sd
= INSTR (4, 0);
7731 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7732 aarch64_set_FP_float (cpu
, sd
, sqrtf (aarch64_get_FP_float (cpu
, sn
)));
7735 /* Double square root. */
7737 fsqrtd (sim_cpu
*cpu
)
7739 unsigned sn
= INSTR (9, 5);
7740 unsigned sd
= INSTR (4, 0);
7742 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7743 aarch64_set_FP_double (cpu
, sd
,
7744 sqrt (aarch64_get_FP_double (cpu
, sn
)));
7747 /* Convert double to float. */
7749 fcvtds (sim_cpu
*cpu
)
7751 unsigned sn
= INSTR (9, 5);
7752 unsigned sd
= INSTR (4, 0);
7754 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7755 aarch64_set_FP_float (cpu
, sd
, (float) aarch64_get_FP_double (cpu
, sn
));
7758 /* Convert float to double. */
7760 fcvtcpu (sim_cpu
*cpu
)
7762 unsigned sn
= INSTR (9, 5);
7763 unsigned sd
= INSTR (4, 0);
7765 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7766 aarch64_set_FP_double (cpu
, sd
, (double) aarch64_get_FP_float (cpu
, sn
));
7770 do_FRINT (sim_cpu
*cpu
)
7772 /* instr[31,23] = 0001 1110 0
7773 instr[22] = single(0)/double(1)
7775 instr[17,15] = rounding mode
7776 instr[14,10] = 10000
7778 instr[4,0] = dest */
7781 unsigned rs
= INSTR (9, 5);
7782 unsigned rd
= INSTR (4, 0);
7783 unsigned int rmode
= INSTR (17, 15);
7785 NYI_assert (31, 23, 0x03C);
7786 NYI_assert (21, 18, 0x9);
7787 NYI_assert (14, 10, 0x10);
7789 if (rmode
== 6 || rmode
== 7)
7790 /* FIXME: Add support for rmode == 6 exactness check. */
7791 rmode
= uimm (aarch64_get_FPSR (cpu
), 23, 22);
7793 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7796 double val
= aarch64_get_FP_double (cpu
, rs
);
7800 case 0: /* mode N: nearest or even. */
7802 double rval
= round (val
);
7804 if (val
- rval
== 0.5)
7806 if (((rval
/ 2.0) * 2.0) != rval
)
7810 aarch64_set_FP_double (cpu
, rd
, round (val
));
7814 case 1: /* mode P: towards +inf. */
7816 aarch64_set_FP_double (cpu
, rd
, trunc (val
));
7818 aarch64_set_FP_double (cpu
, rd
, round (val
));
7821 case 2: /* mode M: towards -inf. */
7823 aarch64_set_FP_double (cpu
, rd
, round (val
));
7825 aarch64_set_FP_double (cpu
, rd
, trunc (val
));
7828 case 3: /* mode Z: towards 0. */
7829 aarch64_set_FP_double (cpu
, rd
, trunc (val
));
7832 case 4: /* mode A: away from 0. */
7833 aarch64_set_FP_double (cpu
, rd
, round (val
));
7836 case 6: /* mode X: use FPCR with exactness check. */
7837 case 7: /* mode I: use FPCR mode. */
7845 val
= aarch64_get_FP_float (cpu
, rs
);
7849 case 0: /* mode N: nearest or even. */
7851 float rval
= roundf (val
);
7853 if (val
- rval
== 0.5)
7855 if (((rval
/ 2.0) * 2.0) != rval
)
7859 aarch64_set_FP_float (cpu
, rd
, rval
);
7863 case 1: /* mode P: towards +inf. */
7865 aarch64_set_FP_float (cpu
, rd
, truncf (val
));
7867 aarch64_set_FP_float (cpu
, rd
, roundf (val
));
7870 case 2: /* mode M: towards -inf. */
7872 aarch64_set_FP_float (cpu
, rd
, truncf (val
));
7874 aarch64_set_FP_float (cpu
, rd
, roundf (val
));
7877 case 3: /* mode Z: towards 0. */
7878 aarch64_set_FP_float (cpu
, rd
, truncf (val
));
7881 case 4: /* mode A: away from 0. */
7882 aarch64_set_FP_float (cpu
, rd
, roundf (val
));
7885 case 6: /* mode X: use FPCR with exactness check. */
7886 case 7: /* mode I: use FPCR mode. */
7894 /* Convert half to float. */
7896 do_FCVT_half_to_single (sim_cpu
*cpu
)
7898 unsigned rn
= INSTR (9, 5);
7899 unsigned rd
= INSTR (4, 0);
7901 NYI_assert (31, 10, 0x7B890);
7903 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7904 aarch64_set_FP_float (cpu
, rd
, (float) aarch64_get_FP_half (cpu
, rn
));
7907 /* Convert half to double. */
7909 do_FCVT_half_to_double (sim_cpu
*cpu
)
7911 unsigned rn
= INSTR (9, 5);
7912 unsigned rd
= INSTR (4, 0);
7914 NYI_assert (31, 10, 0x7B8B0);
7916 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7917 aarch64_set_FP_double (cpu
, rd
, (double) aarch64_get_FP_half (cpu
, rn
));
7921 do_FCVT_single_to_half (sim_cpu
*cpu
)
7923 unsigned rn
= INSTR (9, 5);
7924 unsigned rd
= INSTR (4, 0);
7926 NYI_assert (31, 10, 0x788F0);
7928 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7929 aarch64_set_FP_half (cpu
, rd
, aarch64_get_FP_float (cpu
, rn
));
7932 /* Convert double to half. */
7934 do_FCVT_double_to_half (sim_cpu
*cpu
)
7936 unsigned rn
= INSTR (9, 5);
7937 unsigned rd
= INSTR (4, 0);
7939 NYI_assert (31, 10, 0x798F0);
7941 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
7942 aarch64_set_FP_half (cpu
, rd
, (float) aarch64_get_FP_double (cpu
, rn
));
7946 dexSimpleFPDataProc1Source (sim_cpu
*cpu
)
7948 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7950 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7953 instr[23,22] ==> type : 00 ==> source is single,
7954 01 ==> source is double
7956 11 ==> UNALLOC or source is half
7958 instr[20,15] ==> opcode : with type 00 or 01
7959 000000 ==> FMOV, 000001 ==> FABS,
7960 000010 ==> FNEG, 000011 ==> FSQRT,
7961 000100 ==> UNALLOC, 000101 ==> FCVT,(to single/double)
7962 000110 ==> UNALLOC, 000111 ==> FCVT (to half)
7963 001000 ==> FRINTN, 001001 ==> FRINTP,
7964 001010 ==> FRINTM, 001011 ==> FRINTZ,
7965 001100 ==> FRINTA, 001101 ==> UNALLOC
7966 001110 ==> FRINTX, 001111 ==> FRINTI
7968 000100 ==> FCVT (half-to-single)
7969 000101 ==> FCVT (half-to-double)
7970 instr[14,10] = 10000. */
7972 uint32_t M_S
= (INSTR (31, 31) << 1) | INSTR (29, 29);
7973 uint32_t type
= INSTR (23, 22);
7974 uint32_t opcode
= INSTR (20, 15);
7982 do_FCVT_half_to_single (cpu
);
7983 else if (opcode
== 5)
7984 do_FCVT_half_to_double (cpu
);
8036 case 8: /* FRINTN etc. */
8048 do_FCVT_double_to_half (cpu
);
8050 do_FCVT_single_to_half (cpu
);
8061 /* 32 bit signed int to float. */
8063 scvtf32 (sim_cpu
*cpu
)
8065 unsigned rn
= INSTR (9, 5);
8066 unsigned sd
= INSTR (4, 0);
8068 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8069 aarch64_set_FP_float
8070 (cpu
, sd
, (float) aarch64_get_reg_s32 (cpu
, rn
, NO_SP
));
8073 /* signed int to float. */
8075 scvtf (sim_cpu
*cpu
)
8077 unsigned rn
= INSTR (9, 5);
8078 unsigned sd
= INSTR (4, 0);
8080 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8081 aarch64_set_FP_float
8082 (cpu
, sd
, (float) aarch64_get_reg_s64 (cpu
, rn
, NO_SP
));
8085 /* 32 bit signed int to double. */
8087 scvtd32 (sim_cpu
*cpu
)
8089 unsigned rn
= INSTR (9, 5);
8090 unsigned sd
= INSTR (4, 0);
8092 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8093 aarch64_set_FP_double
8094 (cpu
, sd
, (double) aarch64_get_reg_s32 (cpu
, rn
, NO_SP
));
8097 /* signed int to double. */
8099 scvtd (sim_cpu
*cpu
)
8101 unsigned rn
= INSTR (9, 5);
8102 unsigned sd
= INSTR (4, 0);
8104 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8105 aarch64_set_FP_double
8106 (cpu
, sd
, (double) aarch64_get_reg_s64 (cpu
, rn
, NO_SP
));
/* Saturation bounds used by the float/double -> integer conversion
   helpers below (via RAISE_EXCEPTIONS).  */
static const float  FLOAT_INT_MAX    = (float)  INT_MAX;
static const float  FLOAT_INT_MIN    = (float)  INT_MIN;
static const double DOUBLE_INT_MAX   = (double) INT_MAX;
static const double DOUBLE_INT_MIN   = (double) INT_MIN;
static const float  FLOAT_LONG_MAX   = (float)  LONG_MAX;
static const float  FLOAT_LONG_MIN   = (float)  LONG_MIN;
static const double DOUBLE_LONG_MAX  = (double) LONG_MAX;
static const double DOUBLE_LONG_MIN  = (double) LONG_MIN;

/* <limits.h> does not define minima for the unsigned types; zero by
   definition.  NOTE(review): these #defines were lost in the mangled
   source (UINT_MIN/ULONG_MIN are referenced just below) and have been
   restored.  */
#define UINT_MIN 0
#define ULONG_MIN 0
static const float  FLOAT_UINT_MAX   = (float)  UINT_MAX;
static const float  FLOAT_UINT_MIN   = (float)  UINT_MIN;
static const double DOUBLE_UINT_MAX  = (double) UINT_MAX;
static const double DOUBLE_UINT_MIN  = (double) UINT_MIN;
static const float  FLOAT_ULONG_MAX  = (float)  ULONG_MAX;
static const float  FLOAT_ULONG_MIN  = (float)  ULONG_MIN;
static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
/* Check for FP exception conditions:
     NaN raises IO
     Infinity raises IO
     Out of Range raises IO and IX and saturates value
     Denormal raises ID and IX and sets to zero.
   NOTE(review): the do/while wrapper, case labels and sign test were lost
   in the mangled source; reconstructed — verify the MAX/MIN selection for
   NaN/Inf inputs against upstream.  */
#define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)			\
  do									\
    {									\
      switch (fpclassify (F))						\
	{								\
	case FP_INFINITE:						\
	case FP_NAN:							\
	  aarch64_set_FPSR (cpu, IO);					\
	  if (signbit (F))						\
	    VALUE = ITYPE##_MAX;					\
	  else								\
	    VALUE = ITYPE##_MIN;					\
	  break;							\
									\
	case FP_NORMAL:							\
	  if (F >= FTYPE##_##ITYPE##_MAX)				\
	    {								\
	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);		\
	      VALUE = ITYPE##_MAX;					\
	    }								\
	  else if (F <= FTYPE##_##ITYPE##_MIN)				\
	    {								\
	      aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);		\
	      VALUE = ITYPE##_MIN;					\
	    }								\
	  break;							\
									\
	case FP_SUBNORMAL:						\
	  aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);		\
	  VALUE = 0;							\
	  break;							\
									\
	default:							\
	  break;							\
	}								\
    }									\
  while (0)
8174 /* 32 bit convert float to signed int truncate towards zero. */
8176 fcvtszs32 (sim_cpu
*cpu
)
8178 unsigned sn
= INSTR (9, 5);
8179 unsigned rd
= INSTR (4, 0);
8180 /* TODO : check that this rounds toward zero. */
8181 float f
= aarch64_get_FP_float (cpu
, sn
);
8182 int32_t value
= (int32_t) f
;
8184 RAISE_EXCEPTIONS (f
, value
, FLOAT
, INT
);
8186 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8187 /* Avoid sign extension to 64 bit. */
8188 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (uint32_t) value
);
8191 /* 64 bit convert float to signed int truncate towards zero. */
8193 fcvtszs (sim_cpu
*cpu
)
8195 unsigned sn
= INSTR (9, 5);
8196 unsigned rd
= INSTR (4, 0);
8197 float f
= aarch64_get_FP_float (cpu
, sn
);
8198 int64_t value
= (int64_t) f
;
8200 RAISE_EXCEPTIONS (f
, value
, FLOAT
, LONG
);
8202 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8203 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
, value
);
8206 /* 32 bit convert double to signed int truncate towards zero. */
8208 fcvtszd32 (sim_cpu
*cpu
)
8210 unsigned sn
= INSTR (9, 5);
8211 unsigned rd
= INSTR (4, 0);
8212 /* TODO : check that this rounds toward zero. */
8213 double d
= aarch64_get_FP_double (cpu
, sn
);
8214 int32_t value
= (int32_t) d
;
8216 RAISE_EXCEPTIONS (d
, value
, DOUBLE
, INT
);
8218 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8219 /* Avoid sign extension to 64 bit. */
8220 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (uint32_t) value
);
8223 /* 64 bit convert double to signed int truncate towards zero. */
8225 fcvtszd (sim_cpu
*cpu
)
8227 unsigned sn
= INSTR (9, 5);
8228 unsigned rd
= INSTR (4, 0);
8229 /* TODO : check that this rounds toward zero. */
8230 double d
= aarch64_get_FP_double (cpu
, sn
);
8233 value
= (int64_t) d
;
8235 RAISE_EXCEPTIONS (d
, value
, DOUBLE
, LONG
);
8237 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8238 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
, value
);
8242 do_fcvtzu (sim_cpu
*cpu
)
8244 /* instr[31] = size: 32-bit (0), 64-bit (1)
8245 instr[30,23] = 00111100
8246 instr[22] = type: single (0)/ double (1)
8247 instr[21] = enable (0)/disable(1) precision
8248 instr[20,16] = 11001
8249 instr[15,10] = precision
8253 unsigned rs
= INSTR (9, 5);
8254 unsigned rd
= INSTR (4, 0);
8256 NYI_assert (30, 23, 0x3C);
8257 NYI_assert (20, 16, 0x19);
8259 if (INSTR (21, 21) != 1)
8260 /* Convert to fixed point. */
8263 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8266 /* Convert to unsigned 64-bit integer. */
8269 double d
= aarch64_get_FP_double (cpu
, rs
);
8270 uint64_t value
= (uint64_t) d
;
8272 /* Do not raise an exception if we have reached ULONG_MAX. */
8273 if (value
!= (1UL << 63))
8274 RAISE_EXCEPTIONS (d
, value
, DOUBLE
, ULONG
);
8276 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value
);
8280 float f
= aarch64_get_FP_float (cpu
, rs
);
8281 uint64_t value
= (uint64_t) f
;
8283 /* Do not raise an exception if we have reached ULONG_MAX. */
8284 if (value
!= (1UL << 63))
8285 RAISE_EXCEPTIONS (f
, value
, FLOAT
, ULONG
);
8287 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value
);
8294 /* Convert to unsigned 32-bit integer. */
8297 double d
= aarch64_get_FP_double (cpu
, rs
);
8299 value
= (uint32_t) d
;
8300 /* Do not raise an exception if we have reached UINT_MAX. */
8301 if (value
!= (1UL << 31))
8302 RAISE_EXCEPTIONS (d
, value
, DOUBLE
, UINT
);
8306 float f
= aarch64_get_FP_float (cpu
, rs
);
8308 value
= (uint32_t) f
;
8309 /* Do not raise an exception if we have reached UINT_MAX. */
8310 if (value
!= (1UL << 31))
8311 RAISE_EXCEPTIONS (f
, value
, FLOAT
, UINT
);
8314 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value
);
8319 do_UCVTF (sim_cpu
*cpu
)
8321 /* instr[31] = size: 32-bit (0), 64-bit (1)
8322 instr[30,23] = 001 1110 0
8323 instr[22] = type: single (0)/ double (1)
8324 instr[21] = enable (0)/disable(1) precision
8325 instr[20,16] = 0 0011
8326 instr[15,10] = precision
8330 unsigned rs
= INSTR (9, 5);
8331 unsigned rd
= INSTR (4, 0);
8333 NYI_assert (30, 23, 0x3C);
8334 NYI_assert (20, 16, 0x03);
8336 if (INSTR (21, 21) != 1)
8339 /* FIXME: Add exception raising. */
8340 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8343 uint64_t value
= aarch64_get_reg_u64 (cpu
, rs
, NO_SP
);
8346 aarch64_set_FP_double (cpu
, rd
, (double) value
);
8348 aarch64_set_FP_float (cpu
, rd
, (float) value
);
8352 uint32_t value
= aarch64_get_reg_u32 (cpu
, rs
, NO_SP
);
8355 aarch64_set_FP_double (cpu
, rd
, (double) value
);
8357 aarch64_set_FP_float (cpu
, rd
, (float) value
);
8362 float_vector_move (sim_cpu
*cpu
)
8364 /* instr[31,17] == 100 1111 0101 0111
8365 instr[16] ==> direction 0=> to GR, 1=> from GR
8367 instr[9,5] ==> source
8368 instr[4,0] ==> dest. */
8370 unsigned rn
= INSTR (9, 5);
8371 unsigned rd
= INSTR (4, 0);
8373 NYI_assert (31, 17, 0x4F57);
8375 if (INSTR (15, 10) != 0)
8378 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8380 aarch64_set_vec_u64 (cpu
, rd
, 1, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
));
8382 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, aarch64_get_vec_u64 (cpu
, rn
, 1));
8386 dexSimpleFPIntegerConvert (sim_cpu
*cpu
)
8388 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
8390 instr[29] = S : 0 ==> OK, 1 ==> UNALLOC
8393 instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8395 instr[20,19] = rmode
8396 instr[18,16] = opcode
8397 instr[15,10] = 10 0000 */
8399 uint32_t rmode_opcode
;
8405 if (INSTR (31, 17) == 0x4F57)
8407 float_vector_move (cpu
);
8411 size
= INSTR (31, 31);
8416 type
= INSTR (23, 22);
8420 rmode_opcode
= INSTR (20, 16);
8421 size_type
= (size
<< 1) | type
; /* 0==32f, 1==32d, 2==64f, 3==64d. */
8423 switch (rmode_opcode
)
8425 case 2: /* SCVTF. */
8428 case 0: scvtf32 (cpu
); return;
8429 case 1: scvtd32 (cpu
); return;
8430 case 2: scvtf (cpu
); return;
8431 case 3: scvtd (cpu
); return;
8434 case 6: /* FMOV GR, Vec. */
8437 case 0: gfmovs (cpu
); return;
8438 case 3: gfmovd (cpu
); return;
8439 default: HALT_UNALLOC
;
8442 case 7: /* FMOV vec, GR. */
8445 case 0: fgmovs (cpu
); return;
8446 case 3: fgmovd (cpu
); return;
8447 default: HALT_UNALLOC
;
8450 case 24: /* FCVTZS. */
8453 case 0: fcvtszs32 (cpu
); return;
8454 case 1: fcvtszd32 (cpu
); return;
8455 case 2: fcvtszs (cpu
); return;
8456 case 3: fcvtszd (cpu
); return;
8459 case 25: do_fcvtzu (cpu
); return;
8460 case 3: do_UCVTF (cpu
); return;
8462 case 0: /* FCVTNS. */
8463 case 1: /* FCVTNU. */
8464 case 4: /* FCVTAS. */
8465 case 5: /* FCVTAU. */
8466 case 8: /* FCVPTS. */
8467 case 9: /* FCVTPU. */
8468 case 16: /* FCVTMS. */
8469 case 17: /* FCVTMU. */
8476 set_flags_for_float_compare (sim_cpu
*cpu
, float fvalue1
, float fvalue2
)
8480 /* FIXME: Add exception raising. */
8481 if (isnan (fvalue1
) || isnan (fvalue2
))
8483 else if (isinf (fvalue1
) && isinf (fvalue2
))
8485 /* Subtracting two infinities may give a NaN. We only need to compare
8486 the signs, which we can get from isinf. */
8487 int result
= isinf (fvalue1
) - isinf (fvalue2
);
8491 else if (result
< 0)
8493 else /* (result > 0). */
8498 float result
= fvalue1
- fvalue2
;
8502 else if (result
< 0)
8504 else /* (result > 0). */
8508 aarch64_set_CPSR (cpu
, flags
);
8512 fcmps (sim_cpu
*cpu
)
8514 unsigned sm
= INSTR (20, 16);
8515 unsigned sn
= INSTR ( 9, 5);
8517 float fvalue1
= aarch64_get_FP_float (cpu
, sn
);
8518 float fvalue2
= aarch64_get_FP_float (cpu
, sm
);
8520 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8521 set_flags_for_float_compare (cpu
, fvalue1
, fvalue2
);
8524 /* Float compare to zero -- Invalid Operation exception
8525 only on signaling NaNs. */
8527 fcmpzs (sim_cpu
*cpu
)
8529 unsigned sn
= INSTR ( 9, 5);
8530 float fvalue1
= aarch64_get_FP_float (cpu
, sn
);
8532 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8533 set_flags_for_float_compare (cpu
, fvalue1
, 0.0f
);
8536 /* Float compare -- Invalid Operation exception on all NaNs. */
8538 fcmpes (sim_cpu
*cpu
)
8540 unsigned sm
= INSTR (20, 16);
8541 unsigned sn
= INSTR ( 9, 5);
8543 float fvalue1
= aarch64_get_FP_float (cpu
, sn
);
8544 float fvalue2
= aarch64_get_FP_float (cpu
, sm
);
8546 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8547 set_flags_for_float_compare (cpu
, fvalue1
, fvalue2
);
8550 /* Float compare to zero -- Invalid Operation exception on all NaNs. */
8552 fcmpzes (sim_cpu
*cpu
)
8554 unsigned sn
= INSTR ( 9, 5);
8555 float fvalue1
= aarch64_get_FP_float (cpu
, sn
);
8557 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8558 set_flags_for_float_compare (cpu
, fvalue1
, 0.0f
);
8562 set_flags_for_double_compare (sim_cpu
*cpu
, double dval1
, double dval2
)
8566 /* FIXME: Add exception raising. */
8567 if (isnan (dval1
) || isnan (dval2
))
8569 else if (isinf (dval1
) && isinf (dval2
))
8571 /* Subtracting two infinities may give a NaN. We only need to compare
8572 the signs, which we can get from isinf. */
8573 int result
= isinf (dval1
) - isinf (dval2
);
8577 else if (result
< 0)
8579 else /* (result > 0). */
8584 double result
= dval1
- dval2
;
8588 else if (result
< 0)
8590 else /* (result > 0). */
8594 aarch64_set_CPSR (cpu
, flags
);
8597 /* Double compare -- Invalid Operation exception only on signaling NaNs. */
8599 fcmpd (sim_cpu
*cpu
)
8601 unsigned sm
= INSTR (20, 16);
8602 unsigned sn
= INSTR ( 9, 5);
8604 double dvalue1
= aarch64_get_FP_double (cpu
, sn
);
8605 double dvalue2
= aarch64_get_FP_double (cpu
, sm
);
8607 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8608 set_flags_for_double_compare (cpu
, dvalue1
, dvalue2
);
8611 /* Double compare to zero -- Invalid Operation exception
8612 only on signaling NaNs. */
8614 fcmpzd (sim_cpu
*cpu
)
8616 unsigned sn
= INSTR ( 9, 5);
8617 double dvalue1
= aarch64_get_FP_double (cpu
, sn
);
8619 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8620 set_flags_for_double_compare (cpu
, dvalue1
, 0.0);
8623 /* Double compare -- Invalid Operation exception on all NaNs. */
8625 fcmped (sim_cpu
*cpu
)
8627 unsigned sm
= INSTR (20, 16);
8628 unsigned sn
= INSTR ( 9, 5);
8630 double dvalue1
= aarch64_get_FP_double (cpu
, sn
);
8631 double dvalue2
= aarch64_get_FP_double (cpu
, sm
);
8633 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8634 set_flags_for_double_compare (cpu
, dvalue1
, dvalue2
);
8637 /* Double compare to zero -- Invalid Operation exception on all NaNs. */
8639 fcmpzed (sim_cpu
*cpu
)
8641 unsigned sn
= INSTR ( 9, 5);
8642 double dvalue1
= aarch64_get_FP_double (cpu
, sn
);
8644 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8645 set_flags_for_double_compare (cpu
, dvalue1
, 0.0);
8649 dexSimpleFPCompare (sim_cpu
*cpu
)
8651 /* assert instr[28,25] == 1111
8652 instr[30:24:21:13,10] = 0011000
8653 instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8654 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
8655 instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
8656 instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8657 instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8658 01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8661 uint32_t M_S
= (INSTR (31, 31) << 1) | INSTR (29, 29);
8662 uint32_t type
= INSTR (23, 22);
8663 uint32_t op
= INSTR (15, 14);
8664 uint32_t op2_2_0
= INSTR (2, 0);
8678 /* dispatch on type and top 2 bits of opcode. */
8679 dispatch
= (type
<< 2) | INSTR (4, 3);
8683 case 0: fcmps (cpu
); return;
8684 case 1: fcmpzs (cpu
); return;
8685 case 2: fcmpes (cpu
); return;
8686 case 3: fcmpzes (cpu
); return;
8687 case 4: fcmpd (cpu
); return;
8688 case 5: fcmpzd (cpu
); return;
8689 case 6: fcmped (cpu
); return;
8690 case 7: fcmpzed (cpu
); return;
8695 do_scalar_FADDP (sim_cpu
*cpu
)
8697 /* instr [31,23] = 0111 1110 0
8698 instr [22] = single(0)/double(1)
8699 instr [21,10] = 11 0000 1101 10
8701 instr [4,0] = Fd. */
8703 unsigned Fn
= INSTR (9, 5);
8704 unsigned Fd
= INSTR (4, 0);
8706 NYI_assert (31, 23, 0x0FC);
8707 NYI_assert (21, 10, 0xC36);
8709 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8712 double val1
= aarch64_get_vec_double (cpu
, Fn
, 0);
8713 double val2
= aarch64_get_vec_double (cpu
, Fn
, 1);
8715 aarch64_set_FP_double (cpu
, Fd
, val1
+ val2
);
8719 float val1
= aarch64_get_vec_float (cpu
, Fn
, 0);
8720 float val2
= aarch64_get_vec_float (cpu
, Fn
, 1);
8722 aarch64_set_FP_float (cpu
, Fd
, val1
+ val2
);
8726 /* Floating point absolute difference. */
8729 do_scalar_FABD (sim_cpu
*cpu
)
8731 /* instr [31,23] = 0111 1110 1
8732 instr [22] = float(0)/double(1)
8735 instr [15,10] = 1101 01
8737 instr [4, 0] = Rd. */
8739 unsigned rm
= INSTR (20, 16);
8740 unsigned rn
= INSTR (9, 5);
8741 unsigned rd
= INSTR (4, 0);
8743 NYI_assert (31, 23, 0x0FD);
8744 NYI_assert (21, 21, 1);
8745 NYI_assert (15, 10, 0x35);
8747 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8749 aarch64_set_FP_double (cpu
, rd
,
8750 fabs (aarch64_get_FP_double (cpu
, rn
)
8751 - aarch64_get_FP_double (cpu
, rm
)));
8753 aarch64_set_FP_float (cpu
, rd
,
8754 fabsf (aarch64_get_FP_float (cpu
, rn
)
8755 - aarch64_get_FP_float (cpu
, rm
)));
8759 do_scalar_CMGT (sim_cpu
*cpu
)
8761 /* instr [31,21] = 0101 1110 111
8763 instr [15,10] = 00 1101
8765 instr [4, 0] = Rd. */
8767 unsigned rm
= INSTR (20, 16);
8768 unsigned rn
= INSTR (9, 5);
8769 unsigned rd
= INSTR (4, 0);
8771 NYI_assert (31, 21, 0x2F7);
8772 NYI_assert (15, 10, 0x0D);
8774 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8775 aarch64_set_vec_u64 (cpu
, rd
, 0,
8776 aarch64_get_vec_u64 (cpu
, rn
, 0) >
8777 aarch64_get_vec_u64 (cpu
, rm
, 0) ? -1L : 0L);
8781 do_scalar_USHR (sim_cpu
*cpu
)
8783 /* instr [31,23] = 0111 1111 0
8784 instr [22,16] = shift amount
8785 instr [15,10] = 0000 01
8787 instr [4, 0] = Rd. */
8789 unsigned amount
= 128 - INSTR (22, 16);
8790 unsigned rn
= INSTR (9, 5);
8791 unsigned rd
= INSTR (4, 0);
8793 NYI_assert (31, 23, 0x0FE);
8794 NYI_assert (15, 10, 0x01);
8796 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8797 aarch64_set_vec_u64 (cpu
, rd
, 0,
8798 aarch64_get_vec_u64 (cpu
, rn
, 0) >> amount
);
8802 do_scalar_SSHL (sim_cpu
*cpu
)
8804 /* instr [31,21] = 0101 1110 111
8806 instr [15,10] = 0100 01
8808 instr [4, 0] = Rd. */
8810 unsigned rm
= INSTR (20, 16);
8811 unsigned rn
= INSTR (9, 5);
8812 unsigned rd
= INSTR (4, 0);
8813 signed int shift
= aarch64_get_vec_s8 (cpu
, rm
, 0);
8815 NYI_assert (31, 21, 0x2F7);
8816 NYI_assert (15, 10, 0x11);
8818 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8820 aarch64_set_vec_s64 (cpu
, rd
, 0,
8821 aarch64_get_vec_s64 (cpu
, rn
, 0) << shift
);
8823 aarch64_set_vec_s64 (cpu
, rd
, 0,
8824 aarch64_get_vec_s64 (cpu
, rn
, 0) >> - shift
);
8828 do_scalar_shift (sim_cpu
*cpu
)
8830 /* instr [31,23] = 0101 1111 0
8831 instr [22,16] = shift amount
8832 instr [15,10] = 0101 01 [SHL]
8833 instr [15,10] = 0000 01 [SSHR]
8835 instr [4, 0] = Rd. */
8837 unsigned rn
= INSTR (9, 5);
8838 unsigned rd
= INSTR (4, 0);
8841 NYI_assert (31, 23, 0x0BE);
8843 if (INSTR (22, 22) == 0)
8846 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8847 switch (INSTR (15, 10))
8849 case 0x01: /* SSHR */
8850 amount
= 128 - INSTR (22, 16);
8851 aarch64_set_vec_s64 (cpu
, rd
, 0,
8852 aarch64_get_vec_s64 (cpu
, rn
, 0) >> amount
);
8854 case 0x15: /* SHL */
8855 amount
= INSTR (22, 16) - 64;
8856 aarch64_set_vec_u64 (cpu
, rd
, 0,
8857 aarch64_get_vec_u64 (cpu
, rn
, 0) << amount
);
8864 /* FCMEQ FCMGT FCMGE. */
8866 do_scalar_FCM (sim_cpu
*cpu
)
8868 /* instr [31,30] = 01
8870 instr [28,24] = 1 1110
8875 instr [15,12] = 1110
8879 instr [4, 0] = Rd. */
8881 unsigned rm
= INSTR (20, 16);
8882 unsigned rn
= INSTR (9, 5);
8883 unsigned rd
= INSTR (4, 0);
8884 unsigned EUac
= (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
8889 NYI_assert (31, 30, 1);
8890 NYI_assert (28, 24, 0x1E);
8891 NYI_assert (21, 21, 1);
8892 NYI_assert (15, 12, 0xE);
8893 NYI_assert (10, 10, 1);
8895 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8898 double val1
= aarch64_get_FP_double (cpu
, rn
);
8899 double val2
= aarch64_get_FP_double (cpu
, rm
);
8904 result
= val1
== val2
;
8912 result
= val1
>= val2
;
8920 result
= val1
> val2
;
8927 aarch64_set_vec_u32 (cpu
, rd
, 0, result
? -1 : 0);
8931 val1
= aarch64_get_FP_float (cpu
, rn
);
8932 val2
= aarch64_get_FP_float (cpu
, rm
);
8937 result
= val1
== val2
;
8941 val1
= fabsf (val1
);
8942 val2
= fabsf (val2
);
8945 result
= val1
>= val2
;
8949 val1
= fabsf (val1
);
8950 val2
= fabsf (val2
);
8953 result
= val1
> val2
;
8960 aarch64_set_vec_u32 (cpu
, rd
, 0, result
? -1 : 0);
8963 /* An alias of DUP. */
8965 do_scalar_MOV (sim_cpu
*cpu
)
8967 /* instr [31,21] = 0101 1110 000
8968 instr [20,16] = imm5
8969 instr [15,10] = 0000 01
8971 instr [4, 0] = Rd. */
8973 unsigned rn
= INSTR (9, 5);
8974 unsigned rd
= INSTR (4, 0);
8977 NYI_assert (31, 21, 0x2F0);
8978 NYI_assert (15, 10, 0x01);
8980 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
8984 index
= INSTR (20, 17);
8986 (cpu
, rd
, 0, aarch64_get_vec_u8 (cpu
, rn
, index
));
8988 else if (INSTR (17, 17))
8991 index
= INSTR (20, 18);
8993 (cpu
, rd
, 0, aarch64_get_vec_u16 (cpu
, rn
, index
));
8995 else if (INSTR (18, 18))
8998 index
= INSTR (20, 19);
9000 (cpu
, rd
, 0, aarch64_get_vec_u32 (cpu
, rn
, index
));
9002 else if (INSTR (19, 19))
9005 index
= INSTR (20, 20);
9007 (cpu
, rd
, 0, aarch64_get_vec_u64 (cpu
, rn
, index
));
9014 do_scalar_NEG (sim_cpu
*cpu
)
9016 /* instr [31,10] = 0111 1110 1110 0000 1011 10
9018 instr [4, 0] = Rd. */
9020 unsigned rn
= INSTR (9, 5);
9021 unsigned rd
= INSTR (4, 0);
9023 NYI_assert (31, 10, 0x1FB82E);
9025 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9026 aarch64_set_vec_u64 (cpu
, rd
, 0, - aarch64_get_vec_u64 (cpu
, rn
, 0));
9030 do_scalar_USHL (sim_cpu
*cpu
)
9032 /* instr [31,21] = 0111 1110 111
9034 instr [15,10] = 0100 01
9036 instr [4, 0] = Rd. */
9038 unsigned rm
= INSTR (20, 16);
9039 unsigned rn
= INSTR (9, 5);
9040 unsigned rd
= INSTR (4, 0);
9041 signed int shift
= aarch64_get_vec_s8 (cpu
, rm
, 0);
9043 NYI_assert (31, 21, 0x3F7);
9044 NYI_assert (15, 10, 0x11);
9046 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9048 aarch64_set_vec_u64 (cpu
, rd
, 0, aarch64_get_vec_u64 (cpu
, rn
, 0) << shift
);
9050 aarch64_set_vec_u64 (cpu
, rd
, 0, aarch64_get_vec_u64 (cpu
, rn
, 0) >> - shift
);
9054 do_double_add (sim_cpu
*cpu
)
9056 /* instr [31,21] = 0101 1110 111
9058 instr [15,10] = 1000 01
9060 instr [4,0] = Fd. */
9067 NYI_assert (31, 21, 0x2F7);
9068 NYI_assert (15, 10, 0x21);
9072 Fn
= INSTR (20, 16);
9074 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9075 val1
= aarch64_get_FP_double (cpu
, Fm
);
9076 val2
= aarch64_get_FP_double (cpu
, Fn
);
9078 aarch64_set_FP_double (cpu
, Fd
, val1
+ val2
);
9082 do_scalar_UCVTF (sim_cpu
*cpu
)
9084 /* instr [31,23] = 0111 1110 0
9085 instr [22] = single(0)/double(1)
9086 instr [21,10] = 10 0001 1101 10
9088 instr [4,0] = rd. */
9090 unsigned rn
= INSTR (9, 5);
9091 unsigned rd
= INSTR (4, 0);
9093 NYI_assert (31, 23, 0x0FC);
9094 NYI_assert (21, 10, 0x876);
9096 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9099 uint64_t val
= aarch64_get_vec_u64 (cpu
, rn
, 0);
9101 aarch64_set_vec_double (cpu
, rd
, 0, (double) val
);
9105 uint32_t val
= aarch64_get_vec_u32 (cpu
, rn
, 0);
9107 aarch64_set_vec_float (cpu
, rd
, 0, (float) val
);
9112 do_scalar_vec (sim_cpu
*cpu
)
9114 /* instr [30] = 1. */
9115 /* instr [28,25] = 1111. */
9116 switch (INSTR (31, 23))
9119 switch (INSTR (15, 10))
9121 case 0x01: do_scalar_MOV (cpu
); return;
9122 case 0x39: do_scalar_FCM (cpu
); return;
9123 case 0x3B: do_scalar_FCM (cpu
); return;
9127 case 0xBE: do_scalar_shift (cpu
); return;
9130 switch (INSTR (15, 10))
9133 switch (INSTR (21, 16))
9135 case 0x30: do_scalar_FADDP (cpu
); return;
9136 case 0x21: do_scalar_UCVTF (cpu
); return;
9139 case 0x39: do_scalar_FCM (cpu
); return;
9140 case 0x3B: do_scalar_FCM (cpu
); return;
9145 switch (INSTR (15, 10))
9147 case 0x0D: do_scalar_CMGT (cpu
); return;
9148 case 0x11: do_scalar_USHL (cpu
); return;
9149 case 0x2E: do_scalar_NEG (cpu
); return;
9150 case 0x35: do_scalar_FABD (cpu
); return;
9151 case 0x39: do_scalar_FCM (cpu
); return;
9152 case 0x3B: do_scalar_FCM (cpu
); return;
9157 case 0xFE: do_scalar_USHR (cpu
); return;
9160 switch (INSTR (15, 10))
9162 case 0x21: do_double_add (cpu
); return;
9163 case 0x11: do_scalar_SSHL (cpu
); return;
9174 dexAdvSIMD1 (sim_cpu
*cpu
)
9176 /* instr [28,25] = 1 111. */
9178 /* We are currently only interested in the basic
9179 scalar fp routines which all have bit 30 = 0. */
9181 do_scalar_vec (cpu
);
9183 /* instr[24] is set for FP data processing 3-source and clear for
9184 all other basic scalar fp instruction groups. */
9185 else if (INSTR (24, 24))
9186 dexSimpleFPDataProc3Source (cpu
);
9188 /* instr[21] is clear for floating <-> fixed conversions and set for
9189 all other basic scalar fp instruction groups. */
9190 else if (!INSTR (21, 21))
9191 dexSimpleFPFixedConvert (cpu
);
9193 /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9194 11 ==> cond select, 00 ==> other. */
9196 switch (INSTR (11, 10))
9198 case 1: dexSimpleFPCondCompare (cpu
); return;
9199 case 2: dexSimpleFPDataProc2Source (cpu
); return;
9200 case 3: dexSimpleFPCondSelect (cpu
); return;
9203 /* Now an ordered cascade of tests.
9204 FP immediate has instr [12] == 1.
9205 FP compare has instr [13] == 1.
9206 FP Data Proc 1 Source has instr [14] == 1.
9207 FP floating <--> integer conversions has instr [15] == 0. */
9209 dexSimpleFPImmediate (cpu
);
9211 else if (INSTR (13, 13))
9212 dexSimpleFPCompare (cpu
);
9214 else if (INSTR (14, 14))
9215 dexSimpleFPDataProc1Source (cpu
);
9217 else if (!INSTR (15, 15))
9218 dexSimpleFPIntegerConvert (cpu
);
9221 /* If we get here then instr[15] == 1 which means UNALLOC. */
9226 /* PC relative addressing. */
9229 pcadr (sim_cpu
*cpu
)
9231 /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9232 instr[30,29] = immlo
9233 instr[23,5] = immhi. */
9235 unsigned rd
= INSTR (4, 0);
9236 uint32_t isPage
= INSTR (31, 31);
9237 union { int64_t u64
; uint64_t s64
; } imm
;
9240 imm
.s64
= simm64 (aarch64_get_instr (cpu
), 23, 5);
9242 offset
= (offset
<< 2) | INSTR (30, 29);
9244 address
= aarch64_get_PC (cpu
);
9252 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9253 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, address
+ offset
);
9256 /* Specific decode and execute for group Data Processing Immediate. */
9259 dexPCRelAddressing (sim_cpu
*cpu
)
9261 /* assert instr[28,24] = 10000. */
/* Immediate logical.
   The bimm32/64 argument is constructed by replicating a 2, 4, 8,
   16, 32 or 64 bit sequence pulled out at decode and possibly
   inverting it.

   N.B. the output register (dest) can normally be Xn or SP;
   the exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  The input register can
   never be SP.  */
9275 /* 32 bit and immediate. */
9277 and32 (sim_cpu
*cpu
, uint32_t bimm
)
9279 unsigned rn
= INSTR (9, 5);
9280 unsigned rd
= INSTR (4, 0);
9282 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9283 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
9284 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
) & bimm
);
9287 /* 64 bit and immediate. */
9289 and64 (sim_cpu
*cpu
, uint64_t bimm
)
9291 unsigned rn
= INSTR (9, 5);
9292 unsigned rd
= INSTR (4, 0);
9294 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9295 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
9296 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
) & bimm
);
9299 /* 32 bit and immediate set flags. */
9301 ands32 (sim_cpu
*cpu
, uint32_t bimm
)
9303 unsigned rn
= INSTR (9, 5);
9304 unsigned rd
= INSTR (4, 0);
9306 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
9307 uint32_t value2
= bimm
;
9309 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9310 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
& value2
);
9311 set_flags_for_binop32 (cpu
, value1
& value2
);
9314 /* 64 bit and immediate set flags. */
9316 ands64 (sim_cpu
*cpu
, uint64_t bimm
)
9318 unsigned rn
= INSTR (9, 5);
9319 unsigned rd
= INSTR (4, 0);
9321 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
9322 uint64_t value2
= bimm
;
9324 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9325 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
& value2
);
9326 set_flags_for_binop64 (cpu
, value1
& value2
);
9329 /* 32 bit exclusive or immediate. */
9331 eor32 (sim_cpu
*cpu
, uint32_t bimm
)
9333 unsigned rn
= INSTR (9, 5);
9334 unsigned rd
= INSTR (4, 0);
9336 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9337 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
9338 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
) ^ bimm
);
9341 /* 64 bit exclusive or immediate. */
9343 eor64 (sim_cpu
*cpu
, uint64_t bimm
)
9345 unsigned rn
= INSTR (9, 5);
9346 unsigned rd
= INSTR (4, 0);
9348 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9349 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
9350 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
) ^ bimm
);
9353 /* 32 bit or immediate. */
9355 orr32 (sim_cpu
*cpu
, uint32_t bimm
)
9357 unsigned rn
= INSTR (9, 5);
9358 unsigned rd
= INSTR (4, 0);
9360 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9361 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
9362 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
) | bimm
);
9365 /* 64 bit or immediate. */
9367 orr64 (sim_cpu
*cpu
, uint64_t bimm
)
9369 unsigned rn
= INSTR (9, 5);
9370 unsigned rd
= INSTR (4, 0);
9372 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9373 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
,
9374 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
) | bimm
);
9377 /* Logical shifted register.
9378 These allow an optional LSL, ASR, LSR or ROR to the second source
9379 register with a count up to the register bit count.
9380 N.B register args may not be SP. */
9382 /* 32 bit AND shifted register. */
9384 and32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9386 unsigned rm
= INSTR (20, 16);
9387 unsigned rn
= INSTR (9, 5);
9388 unsigned rd
= INSTR (4, 0);
9390 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9392 (cpu
, rd
, NO_SP
, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
9393 & shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), shift
, count
));
9396 /* 64 bit AND shifted register. */
9398 and64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9400 unsigned rm
= INSTR (20, 16);
9401 unsigned rn
= INSTR (9, 5);
9402 unsigned rd
= INSTR (4, 0);
9404 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9406 (cpu
, rd
, NO_SP
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
9407 & shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
), shift
, count
));
9410 /* 32 bit AND shifted register setting flags. */
9412 ands32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9414 unsigned rm
= INSTR (20, 16);
9415 unsigned rn
= INSTR (9, 5);
9416 unsigned rd
= INSTR (4, 0);
9418 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
9419 uint32_t value2
= shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
9422 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9423 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
& value2
);
9424 set_flags_for_binop32 (cpu
, value1
& value2
);
9427 /* 64 bit AND shifted register setting flags. */
9429 ands64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9431 unsigned rm
= INSTR (20, 16);
9432 unsigned rn
= INSTR (9, 5);
9433 unsigned rd
= INSTR (4, 0);
9435 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
9436 uint64_t value2
= shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
),
9439 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9440 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
& value2
);
9441 set_flags_for_binop64 (cpu
, value1
& value2
);
9444 /* 32 bit BIC shifted register. */
9446 bic32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9448 unsigned rm
= INSTR (20, 16);
9449 unsigned rn
= INSTR (9, 5);
9450 unsigned rd
= INSTR (4, 0);
9452 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9454 (cpu
, rd
, NO_SP
, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
9455 & ~ shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), shift
, count
));
9458 /* 64 bit BIC shifted register. */
9460 bic64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9462 unsigned rm
= INSTR (20, 16);
9463 unsigned rn
= INSTR (9, 5);
9464 unsigned rd
= INSTR (4, 0);
9466 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9468 (cpu
, rd
, NO_SP
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
9469 & ~ shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
), shift
, count
));
9472 /* 32 bit BIC shifted register setting flags. */
9474 bics32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9476 unsigned rm
= INSTR (20, 16);
9477 unsigned rn
= INSTR (9, 5);
9478 unsigned rd
= INSTR (4, 0);
9480 uint32_t value1
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
9481 uint32_t value2
= ~ shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
9484 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9485 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
& value2
);
9486 set_flags_for_binop32 (cpu
, value1
& value2
);
9489 /* 64 bit BIC shifted register setting flags. */
9491 bics64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9493 unsigned rm
= INSTR (20, 16);
9494 unsigned rn
= INSTR (9, 5);
9495 unsigned rd
= INSTR (4, 0);
9497 uint64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
9498 uint64_t value2
= ~ shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
),
9501 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9502 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value1
& value2
);
9503 set_flags_for_binop64 (cpu
, value1
& value2
);
9506 /* 32 bit EON shifted register. */
9508 eon32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9510 unsigned rm
= INSTR (20, 16);
9511 unsigned rn
= INSTR (9, 5);
9512 unsigned rd
= INSTR (4, 0);
9514 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9516 (cpu
, rd
, NO_SP
, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
9517 ^ ~ shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), shift
, count
));
9520 /* 64 bit EON shifted register. */
9522 eon64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9524 unsigned rm
= INSTR (20, 16);
9525 unsigned rn
= INSTR (9, 5);
9526 unsigned rd
= INSTR (4, 0);
9528 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9530 (cpu
, rd
, NO_SP
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
9531 ^ ~ shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
), shift
, count
));
9534 /* 32 bit EOR shifted register. */
9536 eor32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9538 unsigned rm
= INSTR (20, 16);
9539 unsigned rn
= INSTR (9, 5);
9540 unsigned rd
= INSTR (4, 0);
9542 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9544 (cpu
, rd
, NO_SP
, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
9545 ^ shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), shift
, count
));
9548 /* 64 bit EOR shifted register. */
9550 eor64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9552 unsigned rm
= INSTR (20, 16);
9553 unsigned rn
= INSTR (9, 5);
9554 unsigned rd
= INSTR (4, 0);
9556 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9558 (cpu
, rd
, NO_SP
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
9559 ^ shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
), shift
, count
));
9562 /* 32 bit ORR shifted register. */
9564 orr32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9566 unsigned rm
= INSTR (20, 16);
9567 unsigned rn
= INSTR (9, 5);
9568 unsigned rd
= INSTR (4, 0);
9570 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9572 (cpu
, rd
, NO_SP
, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
9573 | shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), shift
, count
));
9576 /* 64 bit ORR shifted register. */
9578 orr64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9580 unsigned rm
= INSTR (20, 16);
9581 unsigned rn
= INSTR (9, 5);
9582 unsigned rd
= INSTR (4, 0);
9584 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9586 (cpu
, rd
, NO_SP
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
9587 | shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
), shift
, count
));
9590 /* 32 bit ORN shifted register. */
9592 orn32_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9594 unsigned rm
= INSTR (20, 16);
9595 unsigned rn
= INSTR (9, 5);
9596 unsigned rd
= INSTR (4, 0);
9598 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9600 (cpu
, rd
, NO_SP
, aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
9601 | ~ shifted32 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
), shift
, count
));
9604 /* 64 bit ORN shifted register. */
9606 orn64_shift (sim_cpu
*cpu
, Shift shift
, uint32_t count
)
9608 unsigned rm
= INSTR (20, 16);
9609 unsigned rn
= INSTR (9, 5);
9610 unsigned rd
= INSTR (4, 0);
9612 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9614 (cpu
, rd
, NO_SP
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
9615 | ~ shifted64 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
), shift
, count
));
9619 dexLogicalImmediate (sim_cpu
*cpu
)
9621 /* assert instr[28,23] = 1001000
9622 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9623 instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9624 instr[22] = N : used to construct immediate mask
9630 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
9631 uint32_t size
= INSTR (31, 31);
9632 uint32_t N
= INSTR (22, 22);
9633 /* uint32_t immr = INSTR (21, 16);. */
9634 /* uint32_t imms = INSTR (15, 10);. */
9635 uint32_t index
= INSTR (22, 10);
9636 uint64_t bimm64
= LITable
[index
];
9637 uint32_t dispatch
= INSTR (30, 29);
9647 uint32_t bimm
= (uint32_t) bimm64
;
9651 case 0: and32 (cpu
, bimm
); return;
9652 case 1: orr32 (cpu
, bimm
); return;
9653 case 2: eor32 (cpu
, bimm
); return;
9654 case 3: ands32 (cpu
, bimm
); return;
9661 case 0: and64 (cpu
, bimm64
); return;
9662 case 1: orr64 (cpu
, bimm64
); return;
9663 case 2: eor64 (cpu
, bimm64
); return;
9664 case 3: ands64 (cpu
, bimm64
); return;
The uimm argument is a 16 bit value to be inserted into the
   target register; the pos argument locates the 16 bit word in the
   dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2, 3}
   for 64 bit.
   N.B. the register arg may not be SP, so it should be
   accessed using the setGZRegisterXXX accessors.  */
9678 /* 32 bit move 16 bit immediate zero remaining shorts. */
9680 movz32 (sim_cpu
*cpu
, uint32_t val
, uint32_t pos
)
9682 unsigned rd
= INSTR (4, 0);
9684 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9685 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, val
<< (pos
* 16));
9688 /* 64 bit move 16 bit immediate zero remaining shorts. */
9690 movz64 (sim_cpu
*cpu
, uint32_t val
, uint32_t pos
)
9692 unsigned rd
= INSTR (4, 0);
9694 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9695 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, ((uint64_t) val
) << (pos
* 16));
9698 /* 32 bit move 16 bit immediate negated. */
9700 movn32 (sim_cpu
*cpu
, uint32_t val
, uint32_t pos
)
9702 unsigned rd
= INSTR (4, 0);
9704 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9705 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, ((val
<< (pos
* 16)) ^ 0xffffffffU
));
9708 /* 64 bit move 16 bit immediate negated. */
9710 movn64 (sim_cpu
*cpu
, uint32_t val
, uint32_t pos
)
9712 unsigned rd
= INSTR (4, 0);
9714 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9716 (cpu
, rd
, NO_SP
, ((((uint64_t) val
) << (pos
* 16))
9717 ^ 0xffffffffffffffffULL
));
9720 /* 32 bit move 16 bit immediate keep remaining shorts. */
9722 movk32 (sim_cpu
*cpu
, uint32_t val
, uint32_t pos
)
9724 unsigned rd
= INSTR (4, 0);
9725 uint32_t current
= aarch64_get_reg_u32 (cpu
, rd
, NO_SP
);
9726 uint32_t value
= val
<< (pos
* 16);
9727 uint32_t mask
= ~(0xffffU
<< (pos
* 16));
9729 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9730 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (value
| (current
& mask
)));
9733 /* 64 bit move 16 it immediate keep remaining shorts. */
9735 movk64 (sim_cpu
*cpu
, uint32_t val
, uint32_t pos
)
9737 unsigned rd
= INSTR (4, 0);
9738 uint64_t current
= aarch64_get_reg_u64 (cpu
, rd
, NO_SP
);
9739 uint64_t value
= (uint64_t) val
<< (pos
* 16);
9740 uint64_t mask
= ~(0xffffULL
<< (pos
* 16));
9742 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9743 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (value
| (current
& mask
)));
9747 dexMoveWideImmediate (sim_cpu
*cpu
)
9749 /* assert instr[28:23] = 100101
9750 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9751 instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
9752 instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
9753 instr[20,5] = uimm16
9756 /* N.B. the (multiple of 16) shift is applied by the called routine,
9757 we just pass the multiplier. */
9760 uint32_t size
= INSTR (31, 31);
9761 uint32_t op
= INSTR (30, 29);
9762 uint32_t shift
= INSTR (22, 21);
9764 /* 32 bit can only shift 0 or 1 lot of 16.
9765 anything else is an unallocated instruction. */
9766 if (size
== 0 && (shift
> 1))
9772 imm
= INSTR (20, 5);
9777 movn32 (cpu
, imm
, shift
);
9779 movz32 (cpu
, imm
, shift
);
9781 movk32 (cpu
, imm
, shift
);
9786 movn64 (cpu
, imm
, shift
);
9788 movz64 (cpu
, imm
, shift
);
9790 movk64 (cpu
, imm
, shift
);
9794 /* Bitfield operations.
9795 These take a pair of bit positions r and s which are in {0..31}
9796 or {0..63} depending on the instruction word size.
9797 N.B register args may not be SP. */
9799 /* OK, we start with ubfm which just needs to pick
9800 some bits out of source zero the rest and write
9801 the result to dest. Just need two logical shifts. */
9803 /* 32 bit bitfield move, left and right of affected zeroed
9804 if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9806 ubfm32 (sim_cpu
*cpu
, uint32_t r
, uint32_t s
)
9809 unsigned rn
= INSTR (9, 5);
9810 uint32_t value
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
9812 /* Pick either s+1-r or s+1 consecutive bits out of the original word. */
9815 /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9816 We want only bits s:xxx:r at the bottom of the word
9817 so we LSL bit s up to bit 31 i.e. by 31 - s
9818 and then we LSR to bring bit 31 down to bit s - r
9819 i.e. by 31 + r - s. */
9821 value
>>= 31 + r
- s
;
9825 /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
9826 We want only bits s:xxx:0 starting at it 31-(r-1)
9827 so we LSL bit s up to bit 31 i.e. by 31 - s
9828 and then we LSL to bring bit 31 down to 31-(r-1)+s
9829 i.e. by r - (s + 1). */
9831 value
>>= r
- (s
+ 1);
9834 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9836 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value
);
9839 /* 64 bit bitfield move, left and right of affected zeroed
9840 if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9842 ubfm (sim_cpu
*cpu
, uint32_t r
, uint32_t s
)
9845 unsigned rn
= INSTR (9, 5);
9846 uint64_t value
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
9850 /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9851 We want only bits s:xxx:r at the bottom of the word.
9852 So we LSL bit s up to bit 63 i.e. by 63 - s
9853 and then we LSR to bring bit 63 down to bit s - r
9854 i.e. by 63 + r - s. */
9856 value
>>= 63 + r
- s
;
9860 /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
9861 We want only bits s:xxx:0 starting at it 63-(r-1).
9862 So we LSL bit s up to bit 63 i.e. by 63 - s
9863 and then we LSL to bring bit 63 down to 63-(r-1)+s
9864 i.e. by r - (s + 1). */
9866 value
>>= r
- (s
+ 1);
9869 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9871 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, value
);
9874 /* The signed versions need to insert sign bits
9875 on the left of the inserted bit field. so we do
9876 much the same as the unsigned version except we
9877 use an arithmetic shift right -- this just means
9878 we need to operate on signed values. */
9880 /* 32 bit bitfield move, left of affected sign-extended, right zeroed. */
9881 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9883 sbfm32 (sim_cpu
*cpu
, uint32_t r
, uint32_t s
)
9886 unsigned rn
= INSTR (9, 5);
9887 /* as per ubfm32 but use an ASR instead of an LSR. */
9888 int32_t value
= aarch64_get_reg_s32 (cpu
, rn
, NO_SP
);
9893 value
>>= 31 + r
- s
;
9898 value
>>= r
- (s
+ 1);
9901 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9903 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, (uint32_t) value
);
9906 /* 64 bit bitfield move, left of affected sign-extended, right zeroed. */
9907 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9909 sbfm (sim_cpu
*cpu
, uint32_t r
, uint32_t s
)
9912 unsigned rn
= INSTR (9, 5);
9913 /* acpu per ubfm but use an ASR instead of an LSR. */
9914 int64_t value
= aarch64_get_reg_s64 (cpu
, rn
, NO_SP
);
9919 value
>>= 63 + r
- s
;
9924 value
>>= r
- (s
+ 1);
9927 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9929 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
, value
);
9932 /* Finally, these versions leave non-affected bits
9933 as is. so we need to generate the bits as per
9934 ubfm and also generate a mask to pick the
9935 bits from the original and computed values. */
9937 /* 32 bit bitfield move, non-affected bits left as is.
9938 If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9940 bfm32 (sim_cpu
*cpu
, uint32_t r
, uint32_t s
)
9942 unsigned rn
= INSTR (9, 5);
9943 uint32_t value
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
9948 /* Pick either s+1-r or s+1 consecutive bits out of the original word. */
9951 /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9952 We want only bits s:xxx:r at the bottom of the word
9953 so we LSL bit s up to bit 31 i.e. by 31 - s
9954 and then we LSR to bring bit 31 down to bit s - r
9955 i.e. by 31 + r - s. */
9957 value
>>= 31 + r
- s
;
9958 /* the mask must include the same bits. */
9960 mask
>>= 31 + r
- s
;
9964 /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
9965 We want only bits s:xxx:0 starting at it 31-(r-1)
9966 so we LSL bit s up to bit 31 i.e. by 31 - s
9967 and then we LSL to bring bit 31 down to 31-(r-1)+s
9968 i.e. by r - (s + 1). */
9970 value
>>= r
- (s
+ 1);
9971 /* The mask must include the same bits. */
9973 mask
>>= r
- (s
+ 1);
9977 value2
= aarch64_get_reg_u32 (cpu
, rd
, NO_SP
);
9982 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
9984 (cpu
, rd
, NO_SP
, (aarch64_get_reg_u32 (cpu
, rd
, NO_SP
) & ~mask
) | value
);
9987 /* 64 bit bitfield move, non-affected bits left as is.
9988 If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9990 bfm (sim_cpu
*cpu
, uint32_t r
, uint32_t s
)
9993 unsigned rn
= INSTR (9, 5);
9994 uint64_t value
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
9995 uint64_t mask
= 0xffffffffffffffffULL
;
9999 /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10000 We want only bits s:xxx:r at the bottom of the word
10001 so we LSL bit s up to bit 63 i.e. by 63 - s
10002 and then we LSR to bring bit 63 down to bit s - r
10003 i.e. by 63 + r - s. */
10005 value
>>= 63 + r
- s
;
10006 /* The mask must include the same bits. */
10008 mask
>>= 63 + r
- s
;
10012 /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10013 We want only bits s:xxx:0 starting at it 63-(r-1)
10014 so we LSL bit s up to bit 63 i.e. by 63 - s
10015 and then we LSL to bring bit 63 down to 63-(r-1)+s
10016 i.e. by r - (s + 1). */
10018 value
>>= r
- (s
+ 1);
10019 /* The mask must include the same bits. */
10021 mask
>>= r
- (s
+ 1);
10024 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
10026 aarch64_set_reg_u64
10027 (cpu
, rd
, NO_SP
, (aarch64_get_reg_u64 (cpu
, rd
, NO_SP
) & ~mask
) | value
);
10031 dexBitfieldImmediate (sim_cpu
*cpu
)
10033 /* assert instr[28:23] = 100110
10034 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10035 instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10036 instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10037 instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10038 instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
10042 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
10045 uint32_t size
= INSTR (31, 31);
10046 uint32_t N
= INSTR (22, 22);
10047 /* 32 bit operations must have immr[5] = 0 and imms[5] = 0. */
10048 /* or else we have an UNALLOC. */
10049 uint32_t immr
= INSTR (21, 16);
10054 if (!size
&& uimm (immr
, 5, 5))
10057 imms
= INSTR (15, 10);
10058 if (!size
&& uimm (imms
, 5, 5))
10061 /* Switch on combined size and op. */
10062 dispatch
= INSTR (31, 29);
10065 case 0: sbfm32 (cpu
, immr
, imms
); return;
10066 case 1: bfm32 (cpu
, immr
, imms
); return;
10067 case 2: ubfm32 (cpu
, immr
, imms
); return;
10068 case 4: sbfm (cpu
, immr
, imms
); return;
10069 case 5: bfm (cpu
, immr
, imms
); return;
10070 case 6: ubfm (cpu
, immr
, imms
); return;
10071 default: HALT_UNALLOC
;
10076 do_EXTR_32 (sim_cpu
*cpu
)
10078 /* instr[31:21] = 00010011100
10080 instr[15,10] = imms : 0xxxxx for 32 bit
10083 unsigned rm
= INSTR (20, 16);
10084 unsigned imms
= INSTR (15, 10) & 31;
10085 unsigned rn
= INSTR ( 9, 5);
10086 unsigned rd
= INSTR ( 4, 0);
10090 val1
= aarch64_get_reg_u32 (cpu
, rm
, NO_SP
);
10092 val2
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
10093 val2
<<= (32 - imms
);
10095 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
10096 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, val1
| val2
);
10100 do_EXTR_64 (sim_cpu
*cpu
)
10102 /* instr[31:21] = 10010011100
10104 instr[15,10] = imms
10107 unsigned rm
= INSTR (20, 16);
10108 unsigned imms
= INSTR (15, 10) & 63;
10109 unsigned rn
= INSTR ( 9, 5);
10110 unsigned rd
= INSTR ( 4, 0);
10113 val
= aarch64_get_reg_u64 (cpu
, rm
, NO_SP
);
10115 val
|= (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
) << (64 - imms
));
10117 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, val
);
10121 dexExtractImmediate (sim_cpu
*cpu
)
10123 /* assert instr[28:23] = 100111
10124 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10125 instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10126 instr[22] = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10127 instr[21] = op0 : must be 0 or UNALLOC
10129 instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
10133 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
10134 /* 64 bit operations must have N = 1 or else we have an UNALLOC. */
10136 uint32_t size
= INSTR (31, 31);
10137 uint32_t N
= INSTR (22, 22);
10138 /* 32 bit operations must have imms[5] = 0
10139 or else we have an UNALLOC. */
10140 uint32_t imms
= INSTR (15, 10);
10145 if (!size
&& uimm (imms
, 5, 5))
10148 /* Switch on combined size and op. */
10149 dispatch
= INSTR (31, 29);
10154 else if (dispatch
== 4)
10157 else if (dispatch
== 1)
10164 dexDPImm (sim_cpu
*cpu
)
10166 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10167 assert group == GROUP_DPIMM_1000 || grpoup == GROUP_DPIMM_1001
10168 bits [25,23] of a DPImm are the secondary dispatch vector. */
10169 uint32_t group2
= dispatchDPImm (aarch64_get_instr (cpu
));
10173 case DPIMM_PCADR_000
:
10174 case DPIMM_PCADR_001
:
10175 dexPCRelAddressing (cpu
);
10178 case DPIMM_ADDSUB_010
:
10179 case DPIMM_ADDSUB_011
:
10180 dexAddSubtractImmediate (cpu
);
10183 case DPIMM_LOG_100
:
10184 dexLogicalImmediate (cpu
);
10187 case DPIMM_MOV_101
:
10188 dexMoveWideImmediate (cpu
);
10191 case DPIMM_BITF_110
:
10192 dexBitfieldImmediate (cpu
);
10195 case DPIMM_EXTR_111
:
10196 dexExtractImmediate (cpu
);
10200 /* Should never reach here. */
10206 dexLoadUnscaledImmediate (sim_cpu
*cpu
)
10208 /* instr[29,24] == 111_00
10211 instr[31,30] = size
10214 instr[20,12] = simm9
10215 instr[9,5] = rn may be SP. */
10216 /* unsigned rt = INSTR (4, 0); */
10217 uint32_t V
= INSTR (26, 26);
10218 uint32_t dispatch
= ((INSTR (31, 30) << 2) | INSTR (23, 22));
10219 int32_t imm
= simm32 (aarch64_get_instr (cpu
), 20, 12);
10223 /* GReg operations. */
10226 case 0: sturb (cpu
, imm
); return;
10227 case 1: ldurb32 (cpu
, imm
); return;
10228 case 2: ldursb64 (cpu
, imm
); return;
10229 case 3: ldursb32 (cpu
, imm
); return;
10230 case 4: sturh (cpu
, imm
); return;
10231 case 5: ldurh32 (cpu
, imm
); return;
10232 case 6: ldursh64 (cpu
, imm
); return;
10233 case 7: ldursh32 (cpu
, imm
); return;
10234 case 8: stur32 (cpu
, imm
); return;
10235 case 9: ldur32 (cpu
, imm
); return;
10236 case 10: ldursw (cpu
, imm
); return;
10237 case 12: stur64 (cpu
, imm
); return;
10238 case 13: ldur64 (cpu
, imm
); return;
10251 /* FReg operations. */
10254 case 2: fsturq (cpu
, imm
); return;
10255 case 3: fldurq (cpu
, imm
); return;
10256 case 8: fsturs (cpu
, imm
); return;
10257 case 9: fldurs (cpu
, imm
); return;
10258 case 12: fsturd (cpu
, imm
); return;
10259 case 13: fldurd (cpu
, imm
); return;
10261 case 0: /* STUR 8 bit FP. */
10262 case 1: /* LDUR 8 bit FP. */
10263 case 4: /* STUR 16 bit FP. */
10264 case 5: /* LDUR 8 bit FP. */
/* N.B. A preliminary note regarding all the ldrs<x>32 instructions.

   The signed value loaded by these instructions is cast to unsigned
   before being assigned to aarch64_get_reg_u64 (cpu, N), i.e. to the
   64 bit element of the GReg union.  This performs a 32 bit sign extension
   (as required) but avoids 64 bit sign extension, thus ensuring that the
   top half of the register word is zero.  This is what the spec demands
   when a 32 bit load occurs.  */
10288 /* 32 bit load sign-extended byte scaled unsigned 12 bit. */
10290 ldrsb32_abs (sim_cpu
*cpu
, uint32_t offset
)
10292 unsigned int rn
= INSTR (9, 5);
10293 unsigned int rt
= INSTR (4, 0);
10295 /* The target register may not be SP but the source may be
10296 there is no scaling required for a byte load. */
10297 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
;
10298 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
10299 (int64_t) aarch64_get_mem_s8 (cpu
, address
));
10302 /* 32 bit load sign-extended byte scaled or unscaled zero-
10303 or sign-extended 32-bit register offset. */
10305 ldrsb32_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
10307 unsigned int rm
= INSTR (20, 16);
10308 unsigned int rn
= INSTR (9, 5);
10309 unsigned int rt
= INSTR (4, 0);
10311 /* rn may reference SP, rm and rt must reference ZR. */
10313 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10314 int64_t displacement
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
10317 /* There is no scaling required for a byte load. */
10318 aarch64_set_reg_u64
10319 (cpu
, rt
, NO_SP
, (int64_t) aarch64_get_mem_s8 (cpu
, address
10323 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10324 pre- or post-writeback. */
10326 ldrsb32_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10329 unsigned int rn
= INSTR (9, 5);
10330 unsigned int rt
= INSTR (4, 0);
10332 if (rn
== rt
&& wb
!= NoWriteBack
)
10335 address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10340 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
10341 (int64_t) aarch64_get_mem_s8 (cpu
, address
));
10346 if (wb
!= NoWriteBack
)
10347 aarch64_set_reg_u64 (cpu
, rn
, NO_SP
, address
);
10350 /* 8 bit store scaled. */
10352 fstrb_abs (sim_cpu
*cpu
, uint32_t offset
)
10354 unsigned st
= INSTR (4, 0);
10355 unsigned rn
= INSTR (9, 5);
10357 aarch64_set_mem_u8 (cpu
,
10358 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + offset
,
10359 aarch64_get_vec_u8 (cpu
, st
, 0));
10362 /* 8 bit store scaled or unscaled zero- or
10363 sign-extended 8-bit register offset. */
10365 fstrb_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
10367 unsigned rm
= INSTR (20, 16);
10368 unsigned rn
= INSTR (9, 5);
10369 unsigned st
= INSTR (4, 0);
10371 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10372 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
10374 uint64_t displacement
= scaling
== Scaled
? extended
: 0;
10377 (cpu
, address
+ displacement
, aarch64_get_vec_u8 (cpu
, st
, 0));
10380 /* 16 bit store scaled. */
10382 fstrh_abs (sim_cpu
*cpu
, uint32_t offset
)
10384 unsigned st
= INSTR (4, 0);
10385 unsigned rn
= INSTR (9, 5);
10387 aarch64_set_mem_u16
10389 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 16),
10390 aarch64_get_vec_u16 (cpu
, st
, 0));
10393 /* 16 bit store scaled or unscaled zero-
10394 or sign-extended 16-bit register offset. */
10396 fstrh_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
10398 unsigned rm
= INSTR (20, 16);
10399 unsigned rn
= INSTR (9, 5);
10400 unsigned st
= INSTR (4, 0);
10402 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10403 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
10405 uint64_t displacement
= OPT_SCALE (extended
, 16, scaling
);
10407 aarch64_set_mem_u16
10408 (cpu
, address
+ displacement
, aarch64_get_vec_u16 (cpu
, st
, 0));
10411 /* 32 bit store scaled unsigned 12 bit. */
10413 fstrs_abs (sim_cpu
*cpu
, uint32_t offset
)
10415 unsigned st
= INSTR (4, 0);
10416 unsigned rn
= INSTR (9, 5);
10418 aarch64_set_mem_u32
10420 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 32),
10421 aarch64_get_vec_u32 (cpu
, st
, 0));
10424 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
10426 fstrs_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10428 unsigned rn
= INSTR (9, 5);
10429 unsigned st
= INSTR (4, 0);
10431 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10436 aarch64_set_mem_u32 (cpu
, address
, aarch64_get_vec_u32 (cpu
, st
, 0));
10441 if (wb
!= NoWriteBack
)
10442 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
10445 /* 32 bit store scaled or unscaled zero-
10446 or sign-extended 32-bit register offset. */
10448 fstrs_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
10450 unsigned rm
= INSTR (20, 16);
10451 unsigned rn
= INSTR (9, 5);
10452 unsigned st
= INSTR (4, 0);
10454 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10455 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
10457 uint64_t displacement
= OPT_SCALE (extended
, 32, scaling
);
10459 aarch64_set_mem_u32
10460 (cpu
, address
+ displacement
, aarch64_get_vec_u32 (cpu
, st
, 0));
10463 /* 64 bit store scaled unsigned 12 bit. */
10465 fstrd_abs (sim_cpu
*cpu
, uint32_t offset
)
10467 unsigned st
= INSTR (4, 0);
10468 unsigned rn
= INSTR (9, 5);
10470 aarch64_set_mem_u64
10472 aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 64),
10473 aarch64_get_vec_u64 (cpu
, st
, 0));
10476 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
10478 fstrd_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10480 unsigned rn
= INSTR (9, 5);
10481 unsigned st
= INSTR (4, 0);
10483 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10488 aarch64_set_mem_u64 (cpu
, address
, aarch64_get_vec_u64 (cpu
, st
, 0));
10493 if (wb
!= NoWriteBack
)
10494 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
10497 /* 64 bit store scaled or unscaled zero-
10498 or sign-extended 32-bit register offset. */
10500 fstrd_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
10502 unsigned rm
= INSTR (20, 16);
10503 unsigned rn
= INSTR (9, 5);
10504 unsigned st
= INSTR (4, 0);
10506 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10507 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
10509 uint64_t displacement
= OPT_SCALE (extended
, 64, scaling
);
10511 aarch64_set_mem_u64
10512 (cpu
, address
+ displacement
, aarch64_get_vec_u64 (cpu
, st
, 0));
10515 /* 128 bit store scaled unsigned 12 bit. */
10517 fstrq_abs (sim_cpu
*cpu
, uint32_t offset
)
10520 unsigned st
= INSTR (4, 0);
10521 unsigned rn
= INSTR (9, 5);
10524 aarch64_get_FP_long_double (cpu
, st
, & a
);
10526 addr
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
) + SCALE (offset
, 128);
10527 aarch64_set_mem_long_double (cpu
, addr
, a
);
10530 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback. */
10532 fstrq_wb (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10535 unsigned rn
= INSTR (9, 5);
10536 unsigned st
= INSTR (4, 0);
10537 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10542 aarch64_get_FP_long_double (cpu
, st
, & a
);
10543 aarch64_set_mem_long_double (cpu
, address
, a
);
10548 if (wb
!= NoWriteBack
)
10549 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, address
);
10552 /* 128 bit store scaled or unscaled zero-
10553 or sign-extended 32-bit register offset. */
10555 fstrq_scale_ext (sim_cpu
*cpu
, Scaling scaling
, Extension extension
)
10557 unsigned rm
= INSTR (20, 16);
10558 unsigned rn
= INSTR (9, 5);
10559 unsigned st
= INSTR (4, 0);
10561 uint64_t address
= aarch64_get_reg_u64 (cpu
, rn
, SP_OK
);
10562 int64_t extended
= extend (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
),
10564 uint64_t displacement
= OPT_SCALE (extended
, 128, scaling
);
10568 aarch64_get_FP_long_double (cpu
, st
, & a
);
10569 aarch64_set_mem_long_double (cpu
, address
+ displacement
, a
);
10573 dexLoadImmediatePrePost (sim_cpu
*cpu
)
10575 /* instr[31,30] = size
10581 instr[20,12] = simm9
10582 instr[11] = wb : 0 ==> Post, 1 ==> Pre
10584 instr[9,5] = Rn may be SP.
10587 uint32_t V
= INSTR (26, 26);
10588 uint32_t dispatch
= ((INSTR (31, 30) << 2) | INSTR (23, 22));
10589 int32_t imm
= simm32 (aarch64_get_instr (cpu
), 20, 12);
10590 WriteBack wb
= INSTR (11, 11);
10594 /* GReg operations. */
10597 case 0: strb_wb (cpu
, imm
, wb
); return;
10598 case 1: ldrb32_wb (cpu
, imm
, wb
); return;
10599 case 2: ldrsb_wb (cpu
, imm
, wb
); return;
10600 case 3: ldrsb32_wb (cpu
, imm
, wb
); return;
10601 case 4: strh_wb (cpu
, imm
, wb
); return;
10602 case 5: ldrh32_wb (cpu
, imm
, wb
); return;
10603 case 6: ldrsh64_wb (cpu
, imm
, wb
); return;
10604 case 7: ldrsh32_wb (cpu
, imm
, wb
); return;
10605 case 8: str32_wb (cpu
, imm
, wb
); return;
10606 case 9: ldr32_wb (cpu
, imm
, wb
); return;
10607 case 10: ldrsw_wb (cpu
, imm
, wb
); return;
10608 case 12: str_wb (cpu
, imm
, wb
); return;
10609 case 13: ldr_wb (cpu
, imm
, wb
); return;
10619 /* FReg operations. */
10622 case 2: fstrq_wb (cpu
, imm
, wb
); return;
10623 case 3: fldrq_wb (cpu
, imm
, wb
); return;
10624 case 8: fstrs_wb (cpu
, imm
, wb
); return;
10625 case 9: fldrs_wb (cpu
, imm
, wb
); return;
10626 case 12: fstrd_wb (cpu
, imm
, wb
); return;
10627 case 13: fldrd_wb (cpu
, imm
, wb
); return;
10629 case 0: /* STUR 8 bit FP. */
10630 case 1: /* LDUR 8 bit FP. */
10631 case 4: /* STUR 16 bit FP. */
10632 case 5: /* LDUR 8 bit FP. */
10647 dexLoadRegisterOffset (sim_cpu
*cpu
)
10649 /* instr[31,30] = size
10656 instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10657 110 ==> SXTW, 111 ==> SXTX,
10662 instr[4,0] = rt. */
10664 uint32_t V
= INSTR (26, 26);
10665 uint32_t dispatch
= ((INSTR (31, 30) << 2) | INSTR (23, 22));
10666 Scaling scale
= INSTR (12, 12);
10667 Extension extensionType
= INSTR (15, 13);
10669 /* Check for illegal extension types. */
10670 if (uimm (extensionType
, 1, 1) == 0)
10673 if (extensionType
== UXTX
|| extensionType
== SXTX
)
10674 extensionType
= NoExtension
;
10678 /* GReg operations. */
10681 case 0: strb_scale_ext (cpu
, scale
, extensionType
); return;
10682 case 1: ldrb32_scale_ext (cpu
, scale
, extensionType
); return;
10683 case 2: ldrsb_scale_ext (cpu
, scale
, extensionType
); return;
10684 case 3: ldrsb32_scale_ext (cpu
, scale
, extensionType
); return;
10685 case 4: strh_scale_ext (cpu
, scale
, extensionType
); return;
10686 case 5: ldrh32_scale_ext (cpu
, scale
, extensionType
); return;
10687 case 6: ldrsh_scale_ext (cpu
, scale
, extensionType
); return;
10688 case 7: ldrsh32_scale_ext (cpu
, scale
, extensionType
); return;
10689 case 8: str32_scale_ext (cpu
, scale
, extensionType
); return;
10690 case 9: ldr32_scale_ext (cpu
, scale
, extensionType
); return;
10691 case 10: ldrsw_scale_ext (cpu
, scale
, extensionType
); return;
10692 case 12: str_scale_ext (cpu
, scale
, extensionType
); return;
10693 case 13: ldr_scale_ext (cpu
, scale
, extensionType
); return;
10694 case 14: prfm_scale_ext (cpu
, scale
, extensionType
); return;
10703 /* FReg operations. */
10706 case 1: /* LDUR 8 bit FP. */
10708 case 3: fldrq_scale_ext (cpu
, scale
, extensionType
); return;
10709 case 5: /* LDUR 8 bit FP. */
10711 case 9: fldrs_scale_ext (cpu
, scale
, extensionType
); return;
10712 case 13: fldrd_scale_ext (cpu
, scale
, extensionType
); return;
10714 case 0: fstrb_scale_ext (cpu
, scale
, extensionType
); return;
10715 case 2: fstrq_scale_ext (cpu
, scale
, extensionType
); return;
10716 case 4: fstrh_scale_ext (cpu
, scale
, extensionType
); return;
10717 case 8: fstrs_scale_ext (cpu
, scale
, extensionType
); return;
10718 case 12: fstrd_scale_ext (cpu
, scale
, extensionType
); return;
10732 dexLoadUnsignedImmediate (sim_cpu
*cpu
)
10734 /* instr[29,24] == 111_01
10735 instr[31,30] = size
10738 instr[21,10] = uimm12 : unsigned immediate offset
10739 instr[9,5] = rn may be SP.
10740 instr[4,0] = rt. */
10742 uint32_t V
= INSTR (26,26);
10743 uint32_t dispatch
= ((INSTR (31, 30) << 2) | INSTR (23, 22));
10744 uint32_t imm
= INSTR (21, 10);
10748 /* GReg operations. */
10751 case 0: strb_abs (cpu
, imm
); return;
10752 case 1: ldrb32_abs (cpu
, imm
); return;
10753 case 2: ldrsb_abs (cpu
, imm
); return;
10754 case 3: ldrsb32_abs (cpu
, imm
); return;
10755 case 4: strh_abs (cpu
, imm
); return;
10756 case 5: ldrh32_abs (cpu
, imm
); return;
10757 case 6: ldrsh_abs (cpu
, imm
); return;
10758 case 7: ldrsh32_abs (cpu
, imm
); return;
10759 case 8: str32_abs (cpu
, imm
); return;
10760 case 9: ldr32_abs (cpu
, imm
); return;
10761 case 10: ldrsw_abs (cpu
, imm
); return;
10762 case 12: str_abs (cpu
, imm
); return;
10763 case 13: ldr_abs (cpu
, imm
); return;
10764 case 14: prfm_abs (cpu
, imm
); return;
10773 /* FReg operations. */
10776 case 0: fstrb_abs (cpu
, imm
); return;
10777 case 4: fstrh_abs (cpu
, imm
); return;
10778 case 8: fstrs_abs (cpu
, imm
); return;
10779 case 12: fstrd_abs (cpu
, imm
); return;
10780 case 2: fstrq_abs (cpu
, imm
); return;
10782 case 1: fldrb_abs (cpu
, imm
); return;
10783 case 5: fldrh_abs (cpu
, imm
); return;
10784 case 9: fldrs_abs (cpu
, imm
); return;
10785 case 13: fldrd_abs (cpu
, imm
); return;
10786 case 3: fldrq_abs (cpu
, imm
); return;
10800 dexLoadExclusive (sim_cpu
*cpu
)
10802 /* assert instr[29:24] = 001000;
10803 instr[31,30] = size
10804 instr[23] = 0 if exclusive
10805 instr[22] = L : 1 if load, 0 if store
10806 instr[21] = 1 if pair
10808 instr[15] = o0 : 1 if ordered
10811 instr[4.0] = Rt. */
10813 switch (INSTR (22, 21))
10815 case 2: ldxr (cpu
); return;
10816 case 0: stxr (cpu
); return;
10822 dexLoadOther (sim_cpu
*cpu
)
10826 /* instr[29,25] = 111_0
10827 instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
10828 instr[21:11,10] is the secondary dispatch. */
10829 if (INSTR (24, 24))
10831 dexLoadUnsignedImmediate (cpu
);
10835 dispatch
= ((INSTR (21, 21) << 2) | INSTR (11, 10));
10838 case 0: dexLoadUnscaledImmediate (cpu
); return;
10839 case 1: dexLoadImmediatePrePost (cpu
); return;
10840 case 3: dexLoadImmediatePrePost (cpu
); return;
10841 case 6: dexLoadRegisterOffset (cpu
); return;
10853 store_pair_u32 (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10855 unsigned rn
= INSTR (14, 10);
10856 unsigned rd
= INSTR (9, 5);
10857 unsigned rm
= INSTR (4, 0);
10858 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
10860 if ((rn
== rd
|| rm
== rd
) && wb
!= NoWriteBack
)
10861 HALT_UNALLOC
; /* ??? */
10868 aarch64_set_mem_u32 (cpu
, address
,
10869 aarch64_get_reg_u32 (cpu
, rm
, NO_SP
));
10870 aarch64_set_mem_u32 (cpu
, address
+ 4,
10871 aarch64_get_reg_u32 (cpu
, rn
, NO_SP
));
10876 if (wb
!= NoWriteBack
)
10877 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
10881 store_pair_u64 (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10883 unsigned rn
= INSTR (14, 10);
10884 unsigned rd
= INSTR (9, 5);
10885 unsigned rm
= INSTR (4, 0);
10886 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
10888 if ((rn
== rd
|| rm
== rd
) && wb
!= NoWriteBack
)
10889 HALT_UNALLOC
; /* ??? */
10896 aarch64_set_mem_u64 (cpu
, address
,
10897 aarch64_get_reg_u64 (cpu
, rm
, NO_SP
));
10898 aarch64_set_mem_u64 (cpu
, address
+ 8,
10899 aarch64_get_reg_u64 (cpu
, rn
, NO_SP
));
10904 if (wb
!= NoWriteBack
)
10905 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
10909 load_pair_u32 (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10911 unsigned rn
= INSTR (14, 10);
10912 unsigned rd
= INSTR (9, 5);
10913 unsigned rm
= INSTR (4, 0);
10914 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
10916 /* Treat this as unalloc to make sure we don't do it. */
10925 aarch64_set_reg_u64 (cpu
, rm
, SP_OK
, aarch64_get_mem_u32 (cpu
, address
));
10926 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, aarch64_get_mem_u32 (cpu
, address
+ 4));
10931 if (wb
!= NoWriteBack
)
10932 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
10936 load_pair_s32 (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10938 unsigned rn
= INSTR (14, 10);
10939 unsigned rd
= INSTR (9, 5);
10940 unsigned rm
= INSTR (4, 0);
10941 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
10943 /* Treat this as unalloc to make sure we don't do it. */
10952 aarch64_set_reg_s64 (cpu
, rm
, SP_OK
, aarch64_get_mem_s32 (cpu
, address
));
10953 aarch64_set_reg_s64 (cpu
, rn
, SP_OK
, aarch64_get_mem_s32 (cpu
, address
+ 4));
10958 if (wb
!= NoWriteBack
)
10959 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
10963 load_pair_u64 (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
10965 unsigned rn
= INSTR (14, 10);
10966 unsigned rd
= INSTR (9, 5);
10967 unsigned rm
= INSTR (4, 0);
10968 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
10970 /* Treat this as unalloc to make sure we don't do it. */
10979 aarch64_set_reg_u64 (cpu
, rm
, SP_OK
, aarch64_get_mem_u64 (cpu
, address
));
10980 aarch64_set_reg_u64 (cpu
, rn
, SP_OK
, aarch64_get_mem_u64 (cpu
, address
+ 8));
10985 if (wb
!= NoWriteBack
)
10986 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
10990 dex_load_store_pair_gr (sim_cpu
*cpu
)
10992 /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
10993 instr[29,25] = instruction encoding: 101_0
10994 instr[26] = V : 1 if fp 0 if gp
10995 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
10996 instr[22] = load/store (1=> load)
10997 instr[21,15] = signed, scaled, offset
11000 instr[ 4, 0] = Rm. */
11002 uint32_t dispatch
= ((INSTR (31, 30) << 3) | INSTR (24, 22));
11003 int32_t offset
= simm32 (aarch64_get_instr (cpu
), 21, 15);
11007 case 2: store_pair_u32 (cpu
, offset
, Post
); return;
11008 case 3: load_pair_u32 (cpu
, offset
, Post
); return;
11009 case 4: store_pair_u32 (cpu
, offset
, NoWriteBack
); return;
11010 case 5: load_pair_u32 (cpu
, offset
, NoWriteBack
); return;
11011 case 6: store_pair_u32 (cpu
, offset
, Pre
); return;
11012 case 7: load_pair_u32 (cpu
, offset
, Pre
); return;
11014 case 11: load_pair_s32 (cpu
, offset
, Post
); return;
11015 case 13: load_pair_s32 (cpu
, offset
, NoWriteBack
); return;
11016 case 15: load_pair_s32 (cpu
, offset
, Pre
); return;
11018 case 18: store_pair_u64 (cpu
, offset
, Post
); return;
11019 case 19: load_pair_u64 (cpu
, offset
, Post
); return;
11020 case 20: store_pair_u64 (cpu
, offset
, NoWriteBack
); return;
11021 case 21: load_pair_u64 (cpu
, offset
, NoWriteBack
); return;
11022 case 22: store_pair_u64 (cpu
, offset
, Pre
); return;
11023 case 23: load_pair_u64 (cpu
, offset
, Pre
); return;
11031 store_pair_float (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
11033 unsigned rn
= INSTR (14, 10);
11034 unsigned rd
= INSTR (9, 5);
11035 unsigned rm
= INSTR (4, 0);
11036 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
11043 aarch64_set_mem_u32 (cpu
, address
, aarch64_get_vec_u32 (cpu
, rm
, 0));
11044 aarch64_set_mem_u32 (cpu
, address
+ 4, aarch64_get_vec_u32 (cpu
, rn
, 0));
11049 if (wb
!= NoWriteBack
)
11050 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
11054 store_pair_double (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
11056 unsigned rn
= INSTR (14, 10);
11057 unsigned rd
= INSTR (9, 5);
11058 unsigned rm
= INSTR (4, 0);
11059 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
11066 aarch64_set_mem_u64 (cpu
, address
, aarch64_get_vec_u64 (cpu
, rm
, 0));
11067 aarch64_set_mem_u64 (cpu
, address
+ 8, aarch64_get_vec_u64 (cpu
, rn
, 0));
11072 if (wb
!= NoWriteBack
)
11073 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
11077 store_pair_long_double (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
11080 unsigned rn
= INSTR (14, 10);
11081 unsigned rd
= INSTR (9, 5);
11082 unsigned rm
= INSTR (4, 0);
11083 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
11090 aarch64_get_FP_long_double (cpu
, rm
, & a
);
11091 aarch64_set_mem_long_double (cpu
, address
, a
);
11092 aarch64_get_FP_long_double (cpu
, rn
, & a
);
11093 aarch64_set_mem_long_double (cpu
, address
+ 16, a
);
11098 if (wb
!= NoWriteBack
)
11099 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
11103 load_pair_float (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
11105 unsigned rn
= INSTR (14, 10);
11106 unsigned rd
= INSTR (9, 5);
11107 unsigned rm
= INSTR (4, 0);
11108 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
11118 aarch64_set_vec_u32 (cpu
, rm
, 0, aarch64_get_mem_u32 (cpu
, address
));
11119 aarch64_set_vec_u32 (cpu
, rn
, 0, aarch64_get_mem_u32 (cpu
, address
+ 4));
11124 if (wb
!= NoWriteBack
)
11125 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
11129 load_pair_double (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
11131 unsigned rn
= INSTR (14, 10);
11132 unsigned rd
= INSTR (9, 5);
11133 unsigned rm
= INSTR (4, 0);
11134 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
11144 aarch64_set_vec_u64 (cpu
, rm
, 0, aarch64_get_mem_u64 (cpu
, address
));
11145 aarch64_set_vec_u64 (cpu
, rn
, 0, aarch64_get_mem_u64 (cpu
, address
+ 8));
11150 if (wb
!= NoWriteBack
)
11151 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
11155 load_pair_long_double (sim_cpu
*cpu
, int32_t offset
, WriteBack wb
)
11158 unsigned rn
= INSTR (14, 10);
11159 unsigned rd
= INSTR (9, 5);
11160 unsigned rm
= INSTR (4, 0);
11161 uint64_t address
= aarch64_get_reg_u64 (cpu
, rd
, SP_OK
);
11171 aarch64_get_mem_long_double (cpu
, address
, & a
);
11172 aarch64_set_FP_long_double (cpu
, rm
, a
);
11173 aarch64_get_mem_long_double (cpu
, address
+ 16, & a
);
11174 aarch64_set_FP_long_double (cpu
, rn
, a
);
11179 if (wb
!= NoWriteBack
)
11180 aarch64_set_reg_u64 (cpu
, rd
, SP_OK
, address
);
11184 dex_load_store_pair_fp (sim_cpu
*cpu
)
11186 /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11187 instr[29,25] = instruction encoding
11188 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11189 instr[22] = load/store (1=> load)
11190 instr[21,15] = signed, scaled, offset
11193 instr[ 4, 0] = Rm */
11195 uint32_t dispatch
= ((INSTR (31, 30) << 3) | INSTR (24, 22));
11196 int32_t offset
= simm32 (aarch64_get_instr (cpu
), 21, 15);
11200 case 2: store_pair_float (cpu
, offset
, Post
); return;
11201 case 3: load_pair_float (cpu
, offset
, Post
); return;
11202 case 4: store_pair_float (cpu
, offset
, NoWriteBack
); return;
11203 case 5: load_pair_float (cpu
, offset
, NoWriteBack
); return;
11204 case 6: store_pair_float (cpu
, offset
, Pre
); return;
11205 case 7: load_pair_float (cpu
, offset
, Pre
); return;
11207 case 10: store_pair_double (cpu
, offset
, Post
); return;
11208 case 11: load_pair_double (cpu
, offset
, Post
); return;
11209 case 12: store_pair_double (cpu
, offset
, NoWriteBack
); return;
11210 case 13: load_pair_double (cpu
, offset
, NoWriteBack
); return;
11211 case 14: store_pair_double (cpu
, offset
, Pre
); return;
11212 case 15: load_pair_double (cpu
, offset
, Pre
); return;
11214 case 18: store_pair_long_double (cpu
, offset
, Post
); return;
11215 case 19: load_pair_long_double (cpu
, offset
, Post
); return;
11216 case 20: store_pair_long_double (cpu
, offset
, NoWriteBack
); return;
11217 case 21: load_pair_long_double (cpu
, offset
, NoWriteBack
); return;
11218 case 22: store_pair_long_double (cpu
, offset
, Pre
); return;
11219 case 23: load_pair_long_double (cpu
, offset
, Pre
); return;
/* Return the register number holding element O of the multi-register
   sequence starting at vector register V.  There are only 32 vector
   registers, and per the ARM ARM consecutive registers in LDn/STn wrap
   modulo 32 ("t2 = (t+1) MOD 32"), so mask with 0x1F.  The previous
   mask of 0x3F let register numbers 32..63 escape, indexing past the
   32-entry vector register file.  */
static inline unsigned
vec_reg (unsigned v, unsigned o)
{
  return (v + o) & 0x1F;
}
11232 /* Load multiple N-element structures to N consecutive registers. */
11234 vec_load (sim_cpu
*cpu
, uint64_t address
, unsigned N
)
11236 int all
= INSTR (30, 30);
11237 unsigned size
= INSTR (11, 10);
11238 unsigned vd
= INSTR (4, 0);
11243 case 0: /* 8-bit operations. */
11245 for (i
= 0; i
< (16 * N
); i
++)
11246 aarch64_set_vec_u8 (cpu
, vec_reg (vd
, i
>> 4), i
& 15,
11247 aarch64_get_mem_u8 (cpu
, address
+ i
));
11249 for (i
= 0; i
< (8 * N
); i
++)
11250 aarch64_set_vec_u8 (cpu
, vec_reg (vd
, i
>> 3), i
& 7,
11251 aarch64_get_mem_u8 (cpu
, address
+ i
));
11254 case 1: /* 16-bit operations. */
11256 for (i
= 0; i
< (8 * N
); i
++)
11257 aarch64_set_vec_u16 (cpu
, vec_reg (vd
, i
>> 3), i
& 7,
11258 aarch64_get_mem_u16 (cpu
, address
+ i
* 2));
11260 for (i
= 0; i
< (4 * N
); i
++)
11261 aarch64_set_vec_u16 (cpu
, vec_reg (vd
, i
>> 2), i
& 3,
11262 aarch64_get_mem_u16 (cpu
, address
+ i
* 2));
11265 case 2: /* 32-bit operations. */
11267 for (i
= 0; i
< (4 * N
); i
++)
11268 aarch64_set_vec_u32 (cpu
, vec_reg (vd
, i
>> 2), i
& 3,
11269 aarch64_get_mem_u32 (cpu
, address
+ i
* 4));
11271 for (i
= 0; i
< (2 * N
); i
++)
11272 aarch64_set_vec_u32 (cpu
, vec_reg (vd
, i
>> 1), i
& 1,
11273 aarch64_get_mem_u32 (cpu
, address
+ i
* 4));
11276 case 3: /* 64-bit operations. */
11278 for (i
= 0; i
< (2 * N
); i
++)
11279 aarch64_set_vec_u64 (cpu
, vec_reg (vd
, i
>> 1), i
& 1,
11280 aarch64_get_mem_u64 (cpu
, address
+ i
* 8));
11282 for (i
= 0; i
< N
; i
++)
11283 aarch64_set_vec_u64 (cpu
, vec_reg (vd
, i
), 0,
11284 aarch64_get_mem_u64 (cpu
, address
+ i
* 8));
11289 /* LD4: load multiple 4-element to four consecutive registers. */
11291 LD4 (sim_cpu
*cpu
, uint64_t address
)
11293 vec_load (cpu
, address
, 4);
11296 /* LD3: load multiple 3-element structures to three consecutive registers. */
11298 LD3 (sim_cpu
*cpu
, uint64_t address
)
11300 vec_load (cpu
, address
, 3);
11303 /* LD2: load multiple 2-element structures to two consecutive registers. */
11305 LD2 (sim_cpu
*cpu
, uint64_t address
)
11307 vec_load (cpu
, address
, 2);
11310 /* Load multiple 1-element structures into one register. */
11312 LD1_1 (sim_cpu
*cpu
, uint64_t address
)
11314 int all
= INSTR (30, 30);
11315 unsigned size
= INSTR (11, 10);
11316 unsigned vd
= INSTR (4, 0);
11322 /* LD1 {Vd.16b}, addr, #16 */
11323 /* LD1 {Vd.8b}, addr, #8 */
11324 for (i
= 0; i
< (all
? 16 : 8); i
++)
11325 aarch64_set_vec_u8 (cpu
, vd
, i
,
11326 aarch64_get_mem_u8 (cpu
, address
+ i
));
11330 /* LD1 {Vd.8h}, addr, #16 */
11331 /* LD1 {Vd.4h}, addr, #8 */
11332 for (i
= 0; i
< (all
? 8 : 4); i
++)
11333 aarch64_set_vec_u16 (cpu
, vd
, i
,
11334 aarch64_get_mem_u16 (cpu
, address
+ i
* 2));
11338 /* LD1 {Vd.4s}, addr, #16 */
11339 /* LD1 {Vd.2s}, addr, #8 */
11340 for (i
= 0; i
< (all
? 4 : 2); i
++)
11341 aarch64_set_vec_u32 (cpu
, vd
, i
,
11342 aarch64_get_mem_u32 (cpu
, address
+ i
* 4));
11346 /* LD1 {Vd.2d}, addr, #16 */
11347 /* LD1 {Vd.1d}, addr, #8 */
11348 for (i
= 0; i
< (all
? 2 : 1); i
++)
11349 aarch64_set_vec_u64 (cpu
, vd
, i
,
11350 aarch64_get_mem_u64 (cpu
, address
+ i
* 8));
11355 /* Load multiple 1-element structures into two registers. */
11357 LD1_2 (sim_cpu
*cpu
, uint64_t address
)
11359 /* FIXME: This algorithm is *exactly* the same as the LD2 version.
11360 So why have two different instructions ? There must be something
11361 wrong somewhere. */
11362 vec_load (cpu
, address
, 2);
11365 /* Load multiple 1-element structures into three registers. */
11367 LD1_3 (sim_cpu
*cpu
, uint64_t address
)
11369 /* FIXME: This algorithm is *exactly* the same as the LD3 version.
11370 So why have two different instructions ? There must be something
11371 wrong somewhere. */
11372 vec_load (cpu
, address
, 3);
11375 /* Load multiple 1-element structures into four registers. */
11377 LD1_4 (sim_cpu
*cpu
, uint64_t address
)
11379 /* FIXME: This algorithm is *exactly* the same as the LD4 version.
11380 So why have two different instructions ? There must be something
11381 wrong somewhere. */
11382 vec_load (cpu
, address
, 4);
11385 /* Store multiple N-element structures to N consecutive registers. */
11387 vec_store (sim_cpu
*cpu
, uint64_t address
, unsigned N
)
11389 int all
= INSTR (30, 30);
11390 unsigned size
= INSTR (11, 10);
11391 unsigned vd
= INSTR (4, 0);
11396 case 0: /* 8-bit operations. */
11398 for (i
= 0; i
< (16 * N
); i
++)
11401 aarch64_get_vec_u8 (cpu
, vec_reg (vd
, i
>> 4), i
& 15));
11403 for (i
= 0; i
< (8 * N
); i
++)
11406 aarch64_get_vec_u8 (cpu
, vec_reg (vd
, i
>> 3), i
& 7));
11409 case 1: /* 16-bit operations. */
11411 for (i
= 0; i
< (8 * N
); i
++)
11412 aarch64_set_mem_u16
11413 (cpu
, address
+ i
* 2,
11414 aarch64_get_vec_u16 (cpu
, vec_reg (vd
, i
>> 3), i
& 7));
11416 for (i
= 0; i
< (4 * N
); i
++)
11417 aarch64_set_mem_u16
11418 (cpu
, address
+ i
* 2,
11419 aarch64_get_vec_u16 (cpu
, vec_reg (vd
, i
>> 2), i
& 3));
11422 case 2: /* 32-bit operations. */
11424 for (i
= 0; i
< (4 * N
); i
++)
11425 aarch64_set_mem_u32
11426 (cpu
, address
+ i
* 4,
11427 aarch64_get_vec_u32 (cpu
, vec_reg (vd
, i
>> 2), i
& 3));
11429 for (i
= 0; i
< (2 * N
); i
++)
11430 aarch64_set_mem_u32
11431 (cpu
, address
+ i
* 4,
11432 aarch64_get_vec_u32 (cpu
, vec_reg (vd
, i
>> 1), i
& 1));
11435 case 3: /* 64-bit operations. */
11437 for (i
= 0; i
< (2 * N
); i
++)
11438 aarch64_set_mem_u64
11439 (cpu
, address
+ i
* 8,
11440 aarch64_get_vec_u64 (cpu
, vec_reg (vd
, i
>> 1), i
& 1));
11442 for (i
= 0; i
< N
; i
++)
11443 aarch64_set_mem_u64
11444 (cpu
, address
+ i
* 8,
11445 aarch64_get_vec_u64 (cpu
, vec_reg (vd
, i
), 0));
11450 /* Store multiple 4-element structure to four consecutive registers. */
11452 ST4 (sim_cpu
*cpu
, uint64_t address
)
11454 vec_store (cpu
, address
, 4);
11457 /* Store multiple 3-element structures to three consecutive registers. */
11459 ST3 (sim_cpu
*cpu
, uint64_t address
)
11461 vec_store (cpu
, address
, 3);
11464 /* Store multiple 2-element structures to two consecutive registers. */
11466 ST2 (sim_cpu
*cpu
, uint64_t address
)
11468 vec_store (cpu
, address
, 2);
11471 /* Store multiple 1-element structures into one register. */
11473 ST1_1 (sim_cpu
*cpu
, uint64_t address
)
11475 int all
= INSTR (30, 30);
11476 unsigned size
= INSTR (11, 10);
11477 unsigned vd
= INSTR (4, 0);
11483 for (i
= 0; i
< (all
? 16 : 8); i
++)
11484 aarch64_set_mem_u8 (cpu
, address
+ i
,
11485 aarch64_get_vec_u8 (cpu
, vd
, i
));
11489 for (i
= 0; i
< (all
? 8 : 4); i
++)
11490 aarch64_set_mem_u16 (cpu
, address
+ i
* 2,
11491 aarch64_get_vec_u16 (cpu
, vd
, i
));
11495 for (i
= 0; i
< (all
? 4 : 2); i
++)
11496 aarch64_set_mem_u32 (cpu
, address
+ i
* 4,
11497 aarch64_get_vec_u32 (cpu
, vd
, i
));
11501 for (i
= 0; i
< (all
? 2 : 1); i
++)
11502 aarch64_set_mem_u64 (cpu
, address
+ i
* 8,
11503 aarch64_get_vec_u64 (cpu
, vd
, i
));
11508 /* Store multiple 1-element structures into two registers. */
11510 ST1_2 (sim_cpu
*cpu
, uint64_t address
)
11512 /* FIXME: This algorithm is *exactly* the same as the ST2 version.
11513 So why have two different instructions ? There must be
11514 something wrong somewhere. */
11515 vec_store (cpu
, address
, 2);
11518 /* Store multiple 1-element structures into three registers. */
11520 ST1_3 (sim_cpu
*cpu
, uint64_t address
)
11522 /* FIXME: This algorithm is *exactly* the same as the ST3 version.
11523 So why have two different instructions ? There must be
11524 something wrong somewhere. */
11525 vec_store (cpu
, address
, 3);
11528 /* Store multiple 1-element structures into four registers. */
11530 ST1_4 (sim_cpu
*cpu
, uint64_t address
)
11532 /* FIXME: This algorithm is *exactly* the same as the ST4 version.
11533 So why have two different instructions ? There must be
11534 something wrong somewhere. */
11535 vec_store (cpu
, address
, 4);
/* Decode the lane number and (re)compute the element size for the
   LDn/STn single-structure forms.  Expects locals FULL, S, SIZE and
   LANE in the expanding function; on exit SIZE is the log2 element
   size (0..3) and LANE the selected lane.  Halts on unallocated
   encodings.  */
#define LDn_STn_SINGLE_LANE_AND_SIZE()				\
  do								\
    {								\
      switch (INSTR (15, 14))					\
	{							\
	case 0:							\
	  /* Byte elements: lane is full:s:size.  */		\
	  lane = (full << 3) | (s << 2) | size;			\
	  size = 0;						\
	  break;						\
								\
	case 1:							\
	  /* Half-word elements: low size bit must be clear.  */\
	  if ((size & 1) == 1)					\
	    HALT_UNALLOC;					\
	  lane = (full << 2) | (s << 1) | (size >> 1);		\
	  size = 1;						\
	  break;						\
								\
	case 2:							\
	  if ((size & 2) == 2)					\
	    HALT_UNALLOC;					\
								\
	  if ((size & 1) == 0)					\
	    {							\
	      /* Word elements.  */				\
	      lane = (full << 1) | s;				\
	      size = 2;						\
	    }							\
	  else							\
	    {							\
	      /* Double-word elements: S must be zero.  */	\
	      if (s)						\
		HALT_UNALLOC;					\
	      lane = full;					\
	      size = 3;						\
	    }							\
	  break;						\
								\
	default:						\
	  HALT_UNALLOC;						\
	}							\
    }								\
  while (0)
11579 /* Load single structure into one lane of N registers. */
11581 do_vec_LDn_single (sim_cpu
*cpu
, uint64_t address
)
11584 instr[30] = element selector 0=>half, 1=>all elements
11585 instr[29,24] = 00 1101
11586 instr[23] = 0=>simple, 1=>post
11588 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11589 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11590 11111 (immediate post inc)
11591 instr[15,13] = opcode
11592 instr[12] = S, used for lane number
11593 instr[11,10] = size, also used for lane number
11594 instr[9,5] = address
11597 unsigned full
= INSTR (30, 30);
11598 unsigned vd
= INSTR (4, 0);
11599 unsigned size
= INSTR (11, 10);
11600 unsigned s
= INSTR (12, 12);
11601 int nregs
= ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11605 NYI_assert (29, 24, 0x0D);
11606 NYI_assert (22, 22, 1);
11608 /* Compute the lane number first (using size), and then compute size. */
11609 LDn_STn_SINGLE_LANE_AND_SIZE ();
11611 for (i
= 0; i
< nregs
; i
++)
11616 uint8_t val
= aarch64_get_mem_u8 (cpu
, address
+ i
);
11617 aarch64_set_vec_u8 (cpu
, vd
+ i
, lane
, val
);
11623 uint16_t val
= aarch64_get_mem_u16 (cpu
, address
+ (i
* 2));
11624 aarch64_set_vec_u16 (cpu
, vd
+ i
, lane
, val
);
11630 uint32_t val
= aarch64_get_mem_u32 (cpu
, address
+ (i
* 4));
11631 aarch64_set_vec_u32 (cpu
, vd
+ i
, lane
, val
);
11637 uint64_t val
= aarch64_get_mem_u64 (cpu
, address
+ (i
* 8));
11638 aarch64_set_vec_u64 (cpu
, vd
+ i
, lane
, val
);
11644 /* Store single structure from one lane from N registers. */
11646 do_vec_STn_single (sim_cpu
*cpu
, uint64_t address
)
11649 instr[30] = element selector 0=>half, 1=>all elements
11650 instr[29,24] = 00 1101
11651 instr[23] = 0=>simple, 1=>post
11653 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11654 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11655 11111 (immediate post inc)
11656 instr[15,13] = opcode
11657 instr[12] = S, used for lane number
11658 instr[11,10] = size, also used for lane number
11659 instr[9,5] = address
11662 unsigned full
= INSTR (30, 30);
11663 unsigned vd
= INSTR (4, 0);
11664 unsigned size
= INSTR (11, 10);
11665 unsigned s
= INSTR (12, 12);
11666 int nregs
= ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11670 NYI_assert (29, 24, 0x0D);
11671 NYI_assert (22, 22, 0);
11673 /* Compute the lane number first (using size), and then compute size. */
11674 LDn_STn_SINGLE_LANE_AND_SIZE ();
11676 for (i
= 0; i
< nregs
; i
++)
11681 uint8_t val
= aarch64_get_vec_u8 (cpu
, vd
+ i
, lane
);
11682 aarch64_set_mem_u8 (cpu
, address
+ i
, val
);
11688 uint16_t val
= aarch64_get_vec_u16 (cpu
, vd
+ i
, lane
);
11689 aarch64_set_mem_u16 (cpu
, address
+ (i
* 2), val
);
11695 uint32_t val
= aarch64_get_vec_u32 (cpu
, vd
+ i
, lane
);
11696 aarch64_set_mem_u32 (cpu
, address
+ (i
* 4), val
);
11702 uint64_t val
= aarch64_get_vec_u64 (cpu
, vd
+ i
, lane
);
11703 aarch64_set_mem_u64 (cpu
, address
+ (i
* 8), val
);
11709 /* Load single structure into all lanes of N registers. */
11711 do_vec_LDnR (sim_cpu
*cpu
, uint64_t address
)
11714 instr[30] = element selector 0=>half, 1=>all elements
11715 instr[29,24] = 00 1101
11716 instr[23] = 0=>simple, 1=>post
11718 instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11719 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11720 11111 (immediate post inc)
11722 instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11724 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11725 10=> word(s), 11=> double(d)
11726 instr[9,5] = address
11729 unsigned full
= INSTR (30, 30);
11730 unsigned vd
= INSTR (4, 0);
11731 unsigned size
= INSTR (11, 10);
11732 int nregs
= ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11735 NYI_assert (29, 24, 0x0D);
11736 NYI_assert (22, 22, 1);
11737 NYI_assert (15, 14, 3);
11738 NYI_assert (12, 12, 0);
11740 for (n
= 0; n
< nregs
; n
++)
11745 uint8_t val
= aarch64_get_mem_u8 (cpu
, address
+ n
);
11746 for (i
= 0; i
< (full
? 16 : 8); i
++)
11747 aarch64_set_vec_u8 (cpu
, vd
+ n
, i
, val
);
11753 uint16_t val
= aarch64_get_mem_u16 (cpu
, address
+ (n
* 2));
11754 for (i
= 0; i
< (full
? 8 : 4); i
++)
11755 aarch64_set_vec_u16 (cpu
, vd
+ n
, i
, val
);
11761 uint32_t val
= aarch64_get_mem_u32 (cpu
, address
+ (n
* 4));
11762 for (i
= 0; i
< (full
? 4 : 2); i
++)
11763 aarch64_set_vec_u32 (cpu
, vd
+ n
, i
, val
);
11769 uint64_t val
= aarch64_get_mem_u64 (cpu
, address
+ (n
* 8));
11770 for (i
= 0; i
< (full
? 2 : 1); i
++)
11771 aarch64_set_vec_u64 (cpu
, vd
+ n
, i
, val
);
11781 do_vec_load_store (sim_cpu
*cpu
)
11783 /* {LD|ST}<N> {Vd..Vd+N}, vaddr
11786 instr[30] = element selector 0=>half, 1=>all elements
11787 instr[29,25] = 00110
11788 instr[24] = 0=>multiple struct, 1=>single struct
11789 instr[23] = 0=>simple, 1=>post
11790 instr[22] = 0=>store, 1=>load
11791 instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
11792 instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11793 11111 (immediate post inc)
11794 instr[15,12] = elements and destinations. eg for load:
11795 0000=>LD4 => load multiple 4-element to
11796 four consecutive registers
11797 0100=>LD3 => load multiple 3-element to
11798 three consecutive registers
11799 1000=>LD2 => load multiple 2-element to
11800 two consecutive registers
11801 0010=>LD1 => load multiple 1-element to
11802 four consecutive registers
11803 0110=>LD1 => load multiple 1-element to
11804 three consecutive registers
11805 1010=>LD1 => load multiple 1-element to
11806 two consecutive registers
11807 0111=>LD1 => load multiple 1-element to
11811 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11812 10=> word(s), 11=> double(d)
11813 instr[9,5] = Vn, can be SP
11823 if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
11826 single
= INSTR (24, 24);
11827 post
= INSTR (23, 23);
11828 load
= INSTR (22, 22);
11829 type
= INSTR (15, 12);
11831 address
= aarch64_get_reg_u64 (cpu
, vn
, SP_OK
);
11833 if (! single
&& INSTR (21, 21) != 0)
11838 unsigned vm
= INSTR (20, 16);
11842 unsigned sizeof_operation
;
11846 if ((type
>= 0) && (type
<= 11))
11848 int nregs
= ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11849 switch (INSTR (15, 14))
11852 sizeof_operation
= nregs
* 1;
11855 sizeof_operation
= nregs
* 2;
11858 if (INSTR (10, 10) == 0)
11859 sizeof_operation
= nregs
* 4;
11861 sizeof_operation
= nregs
* 8;
11867 else if (type
== 0xC)
11869 sizeof_operation
= INSTR (21, 21) ? 2 : 1;
11870 sizeof_operation
<<= INSTR (11, 10);
11872 else if (type
== 0xE)
11874 sizeof_operation
= INSTR (21, 21) ? 4 : 3;
11875 sizeof_operation
<<= INSTR (11, 10);
11884 case 0: sizeof_operation
= 32; break;
11885 case 4: sizeof_operation
= 24; break;
11886 case 8: sizeof_operation
= 16; break;
11889 /* One register, immediate offset variant. */
11890 sizeof_operation
= 8;
11894 /* Two registers, immediate offset variant. */
11895 sizeof_operation
= 16;
11899 /* Three registers, immediate offset variant. */
11900 sizeof_operation
= 24;
11904 /* Four registers, immediate offset variant. */
11905 sizeof_operation
= 32;
11912 if (INSTR (30, 30))
11913 sizeof_operation
*= 2;
11916 aarch64_set_reg_u64 (cpu
, vn
, SP_OK
, address
+ sizeof_operation
);
11919 aarch64_set_reg_u64 (cpu
, vn
, SP_OK
,
11920 address
+ aarch64_get_reg_u64 (cpu
, vm
, NO_SP
));
11924 NYI_assert (20, 16, 0);
11931 if ((type
>= 0) && (type
<= 11))
11932 do_vec_LDn_single (cpu
, address
);
11933 else if ((type
== 0xC) || (type
== 0xE))
11934 do_vec_LDnR (cpu
, address
);
11941 if ((type
>= 0) && (type
<= 11))
11943 do_vec_STn_single (cpu
, address
);
11954 case 0: LD4 (cpu
, address
); return;
11955 case 4: LD3 (cpu
, address
); return;
11956 case 8: LD2 (cpu
, address
); return;
11957 case 2: LD1_4 (cpu
, address
); return;
11958 case 6: LD1_3 (cpu
, address
); return;
11959 case 10: LD1_2 (cpu
, address
); return;
11960 case 7: LD1_1 (cpu
, address
); return;
11970 case 0: ST4 (cpu
, address
); return;
11971 case 4: ST3 (cpu
, address
); return;
11972 case 8: ST2 (cpu
, address
); return;
11973 case 2: ST1_4 (cpu
, address
); return;
11974 case 6: ST1_3 (cpu
, address
); return;
11975 case 10: ST1_2 (cpu
, address
); return;
11976 case 7: ST1_1 (cpu
, address
); return;
11983 dexLdSt (sim_cpu
*cpu
)
11985 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
11986 assert group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
11987 group == GROUP_LDST_1100 || group == GROUP_LDST_1110
11988 bits [29,28:26] of a LS are the secondary dispatch vector. */
11989 uint32_t group2
= dispatchLS (aarch64_get_instr (cpu
));
11994 dexLoadExclusive (cpu
); return;
11998 dexLoadLiteral (cpu
); return;
12002 dexLoadOther (cpu
); return;
12004 case LS_ADVSIMD_001
:
12005 do_vec_load_store (cpu
); return;
12008 dex_load_store_pair_gr (cpu
); return;
12011 dex_load_store_pair_fp (cpu
); return;
12014 /* Should never reach here. */
12019 /* Specific decode and execute for group Data Processing Register. */
12022 dexLogicalShiftedRegister (sim_cpu
*cpu
)
12024 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12026 instr[28:24] = 01010
12027 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12030 instr[15,10] = count : must be 0xxxxx for 32 bit
12034 uint32_t size
= INSTR (31, 31);
12035 Shift shiftType
= INSTR (23, 22);
12036 uint32_t count
= INSTR (15, 10);
12038 /* 32 bit operations must have count[5] = 0.
12039 or else we have an UNALLOC. */
12040 if (size
== 0 && uimm (count
, 5, 5))
12043 /* Dispatch on size:op:N. */
12044 switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12046 case 0: and32_shift (cpu
, shiftType
, count
); return;
12047 case 1: bic32_shift (cpu
, shiftType
, count
); return;
12048 case 2: orr32_shift (cpu
, shiftType
, count
); return;
12049 case 3: orn32_shift (cpu
, shiftType
, count
); return;
12050 case 4: eor32_shift (cpu
, shiftType
, count
); return;
12051 case 5: eon32_shift (cpu
, shiftType
, count
); return;
12052 case 6: ands32_shift (cpu
, shiftType
, count
); return;
12053 case 7: bics32_shift (cpu
, shiftType
, count
); return;
12054 case 8: and64_shift (cpu
, shiftType
, count
); return;
12055 case 9: bic64_shift (cpu
, shiftType
, count
); return;
12056 case 10:orr64_shift (cpu
, shiftType
, count
); return;
12057 case 11:orn64_shift (cpu
, shiftType
, count
); return;
12058 case 12:eor64_shift (cpu
, shiftType
, count
); return;
12059 case 13:eon64_shift (cpu
, shiftType
, count
); return;
12060 case 14:ands64_shift (cpu
, shiftType
, count
); return;
12061 case 15:bics64_shift (cpu
, shiftType
, count
); return;
12065 /* 32 bit conditional select. */
12067 csel32 (sim_cpu
*cpu
, CondCode cc
)
12069 unsigned rm
= INSTR (20, 16);
12070 unsigned rn
= INSTR (9, 5);
12071 unsigned rd
= INSTR (4, 0);
12073 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12074 testConditionCode (cpu
, cc
)
12075 ? aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
12076 : aarch64_get_reg_u32 (cpu
, rm
, NO_SP
));
12079 /* 64 bit conditional select. */
12081 csel64 (sim_cpu
*cpu
, CondCode cc
)
12083 unsigned rm
= INSTR (20, 16);
12084 unsigned rn
= INSTR (9, 5);
12085 unsigned rd
= INSTR (4, 0);
12087 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12088 testConditionCode (cpu
, cc
)
12089 ? aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
12090 : aarch64_get_reg_u64 (cpu
, rm
, NO_SP
));
12093 /* 32 bit conditional increment. */
12095 csinc32 (sim_cpu
*cpu
, CondCode cc
)
12097 unsigned rm
= INSTR (20, 16);
12098 unsigned rn
= INSTR (9, 5);
12099 unsigned rd
= INSTR (4, 0);
12101 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12102 testConditionCode (cpu
, cc
)
12103 ? aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
12104 : aarch64_get_reg_u32 (cpu
, rm
, NO_SP
) + 1);
12107 /* 64 bit conditional increment. */
12109 csinc64 (sim_cpu
*cpu
, CondCode cc
)
12111 unsigned rm
= INSTR (20, 16);
12112 unsigned rn
= INSTR (9, 5);
12113 unsigned rd
= INSTR (4, 0);
12115 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12116 testConditionCode (cpu
, cc
)
12117 ? aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
12118 : aarch64_get_reg_u64 (cpu
, rm
, NO_SP
) + 1);
12121 /* 32 bit conditional invert. */
12123 csinv32 (sim_cpu
*cpu
, CondCode cc
)
12125 unsigned rm
= INSTR (20, 16);
12126 unsigned rn
= INSTR (9, 5);
12127 unsigned rd
= INSTR (4, 0);
12129 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12130 testConditionCode (cpu
, cc
)
12131 ? aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
12132 : ~ aarch64_get_reg_u32 (cpu
, rm
, NO_SP
));
12135 /* 64 bit conditional invert. */
12137 csinv64 (sim_cpu
*cpu
, CondCode cc
)
12139 unsigned rm
= INSTR (20, 16);
12140 unsigned rn
= INSTR (9, 5);
12141 unsigned rd
= INSTR (4, 0);
12143 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12144 testConditionCode (cpu
, cc
)
12145 ? aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
12146 : ~ aarch64_get_reg_u64 (cpu
, rm
, NO_SP
));
12149 /* 32 bit conditional negate. */
12151 csneg32 (sim_cpu
*cpu
, CondCode cc
)
12153 unsigned rm
= INSTR (20, 16);
12154 unsigned rn
= INSTR (9, 5);
12155 unsigned rd
= INSTR (4, 0);
12157 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12158 testConditionCode (cpu
, cc
)
12159 ? aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
12160 : - aarch64_get_reg_u32 (cpu
, rm
, NO_SP
));
12163 /* 64 bit conditional negate. */
12165 csneg64 (sim_cpu
*cpu
, CondCode cc
)
12167 unsigned rm
= INSTR (20, 16);
12168 unsigned rn
= INSTR (9, 5);
12169 unsigned rd
= INSTR (4, 0);
12171 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12172 testConditionCode (cpu
, cc
)
12173 ? aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
12174 : - aarch64_get_reg_u64 (cpu
, rm
, NO_SP
));
12178 dexCondSelect (sim_cpu
*cpu
)
12180 /* instr[28,21] = 11011011
12181 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12182 instr[30:11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12183 100 ==> CSINV, 101 ==> CSNEG,
12185 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12186 instr[15,12] = cond
12187 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC */
12189 CondCode cc
= INSTR (15, 12);
12190 uint32_t S
= INSTR (29, 29);
12191 uint32_t op2
= INSTR (11, 10);
12199 switch ((INSTR (31, 30) << 1) | op2
)
12201 case 0: csel32 (cpu
, cc
); return;
12202 case 1: csinc32 (cpu
, cc
); return;
12203 case 2: csinv32 (cpu
, cc
); return;
12204 case 3: csneg32 (cpu
, cc
); return;
12205 case 4: csel64 (cpu
, cc
); return;
12206 case 5: csinc64 (cpu
, cc
); return;
12207 case 6: csinv64 (cpu
, cc
); return;
12208 case 7: csneg64 (cpu
, cc
); return;
/* Some helpers for counting leading 1 or 0 bits.  */

/* Counts the number of leading bits which are the same
   in a 32 bit value in the range 1 to 32.  I.e. the length of the
   initial run of bits equal to bit 31.  */
static uint32_t
leading32 (uint32_t value)
{
  uint32_t top = (value >> 31) & 1;   /* The sign bit.  */
  uint32_t count = 0;
  int bit;

  /* Scan from the sign bit downwards, stopping at the first
     bit that differs from it.  */
  for (bit = 31; bit >= 0; bit--)
    {
      if (((value >> bit) & 1) != top)
	break;
      count++;
    }

  return count;
}
/* Counts the number of leading bits which are the same
   in a 64 bit value in the range 1 to 64.  I.e. the length of the
   initial run of bits equal to bit 63.  */
static uint64_t
leading64 (uint64_t value)
{
  uint64_t top = (value >> 63) & 1;   /* The sign bit.  */
  uint64_t count = 0;
  int bit;

  /* Scan from the sign bit downwards, stopping at the first
     bit that differs from it.  */
  for (bit = 63; bit >= 0; bit--)
    {
      if (((value >> bit) & 1) != top)
	break;
      count++;
    }

  return count;
}
12302 /* Bit operations. */
12303 /* N.B register args may not be SP. */
12305 /* 32 bit count leading sign bits. */
12307 cls32 (sim_cpu
*cpu
)
12309 unsigned rn
= INSTR (9, 5);
12310 unsigned rd
= INSTR (4, 0);
12312 /* N.B. the result needs to exclude the leading bit. */
12313 aarch64_set_reg_u64
12314 (cpu
, rd
, NO_SP
, leading32 (aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)) - 1);
12317 /* 64 bit count leading sign bits. */
12319 cls64 (sim_cpu
*cpu
)
12321 unsigned rn
= INSTR (9, 5);
12322 unsigned rd
= INSTR (4, 0);
12324 /* N.B. the result needs to exclude the leading bit. */
12325 aarch64_set_reg_u64
12326 (cpu
, rd
, NO_SP
, leading64 (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)) - 1);
12329 /* 32 bit count leading zero bits. */
12331 clz32 (sim_cpu
*cpu
)
12333 unsigned rn
= INSTR (9, 5);
12334 unsigned rd
= INSTR (4, 0);
12335 uint32_t value
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
12337 /* if the sign (top) bit is set then the count is 0. */
12338 if (pick32 (value
, 31, 31))
12339 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, 0L);
12341 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, leading32 (value
));
12344 /* 64 bit count leading zero bits. */
12346 clz64 (sim_cpu
*cpu
)
12348 unsigned rn
= INSTR (9, 5);
12349 unsigned rd
= INSTR (4, 0);
12350 uint64_t value
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
12352 /* if the sign (top) bit is set then the count is 0. */
12353 if (pick64 (value
, 63, 63))
12354 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, 0L);
12356 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, leading64 (value
));
12359 /* 32 bit reverse bits. */
12361 rbit32 (sim_cpu
*cpu
)
12363 unsigned rn
= INSTR (9, 5);
12364 unsigned rd
= INSTR (4, 0);
12365 uint32_t value
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
12366 uint32_t result
= 0;
12369 for (i
= 0; i
< 32; i
++)
12372 result
|= (value
& 1);
12375 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
12378 /* 64 bit reverse bits. */
12380 rbit64 (sim_cpu
*cpu
)
12382 unsigned rn
= INSTR (9, 5);
12383 unsigned rd
= INSTR (4, 0);
12384 uint64_t value
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
12385 uint64_t result
= 0;
12388 for (i
= 0; i
< 64; i
++)
12391 result
|= (value
& 1UL);
12394 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
12397 /* 32 bit reverse bytes. */
12399 rev32 (sim_cpu
*cpu
)
12401 unsigned rn
= INSTR (9, 5);
12402 unsigned rd
= INSTR (4, 0);
12403 uint32_t value
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
12404 uint32_t result
= 0;
12407 for (i
= 0; i
< 4; i
++)
12410 result
|= (value
& 0xff);
12413 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
12416 /* 64 bit reverse bytes. */
12418 rev64 (sim_cpu
*cpu
)
12420 unsigned rn
= INSTR (9, 5);
12421 unsigned rd
= INSTR (4, 0);
12422 uint64_t value
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
12423 uint64_t result
= 0;
12426 for (i
= 0; i
< 8; i
++)
12429 result
|= (value
& 0xffULL
);
12432 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
12435 /* 32 bit reverse shorts. */
12436 /* N.B.this reverses the order of the bytes in each half word. */
12438 revh32 (sim_cpu
*cpu
)
12440 unsigned rn
= INSTR (9, 5);
12441 unsigned rd
= INSTR (4, 0);
12442 uint32_t value
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
12443 uint32_t result
= 0;
12446 for (i
= 0; i
< 2; i
++)
12449 result
|= (value
& 0x00ff00ff);
12452 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
12455 /* 64 bit reverse shorts. */
12456 /* N.B.this reverses the order of the bytes in each half word. */
12458 revh64 (sim_cpu
*cpu
)
12460 unsigned rn
= INSTR (9, 5);
12461 unsigned rd
= INSTR (4, 0);
12462 uint64_t value
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
12463 uint64_t result
= 0;
12466 for (i
= 0; i
< 2; i
++)
12469 result
|= (value
& 0x00ff00ff00ff00ffULL
);
12472 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
, result
);
12476 dexDataProc1Source (sim_cpu
*cpu
)
12479 instr[28,21] = 111010110
12480 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12481 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12482 instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12483 instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
12484 000010 ==> REV, 000011 ==> UNALLOC
12485 000100 ==> CLZ, 000101 ==> CLS
12487 instr[9,5] = rn : may not be SP
12488 instr[4,0] = rd : may not be SP. */
12490 uint32_t S
= INSTR (29, 29);
12491 uint32_t opcode2
= INSTR (20, 16);
12492 uint32_t opcode
= INSTR (15, 10);
12493 uint32_t dispatch
= ((INSTR (31, 31) << 3) | opcode
);
12506 case 0: rbit32 (cpu
); return;
12507 case 1: revh32 (cpu
); return;
12508 case 2: rev32 (cpu
); return;
12509 case 4: clz32 (cpu
); return;
12510 case 5: cls32 (cpu
); return;
12511 case 8: rbit64 (cpu
); return;
12512 case 9: revh64 (cpu
); return;
12513 case 10:rev32 (cpu
); return;
12514 case 11:rev64 (cpu
); return;
12515 case 12:clz64 (cpu
); return;
12516 case 13:cls64 (cpu
); return;
12517 default: HALT_UNALLOC
;
12522 Shifts by count supplied in register.
12523 N.B register args may not be SP.
12524 These all use the shifted auxiliary function for
12525 simplicity and clarity. Writing the actual shift
12526 inline would avoid a branch and so be faster but
12527 would also necessitate getting signs right. */
12529 /* 32 bit arithmetic shift right. */
12531 asrv32 (sim_cpu
*cpu
)
12533 unsigned rm
= INSTR (20, 16);
12534 unsigned rn
= INSTR (9, 5);
12535 unsigned rd
= INSTR (4, 0);
12537 aarch64_set_reg_u64
12539 shifted32 (aarch64_get_reg_u32 (cpu
, rn
, NO_SP
), ASR
,
12540 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
) & 0x1f)));
12543 /* 64 bit arithmetic shift right. */
12545 asrv64 (sim_cpu
*cpu
)
12547 unsigned rm
= INSTR (20, 16);
12548 unsigned rn
= INSTR (9, 5);
12549 unsigned rd
= INSTR (4, 0);
12551 aarch64_set_reg_u64
12553 shifted64 (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
), ASR
,
12554 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
) & 0x3f)));
12557 /* 32 bit logical shift left. */
12559 lslv32 (sim_cpu
*cpu
)
12561 unsigned rm
= INSTR (20, 16);
12562 unsigned rn
= INSTR (9, 5);
12563 unsigned rd
= INSTR (4, 0);
12565 aarch64_set_reg_u64
12567 shifted32 (aarch64_get_reg_u32 (cpu
, rn
, NO_SP
), LSL
,
12568 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
) & 0x1f)));
12571 /* 64 bit arithmetic shift left. */
12573 lslv64 (sim_cpu
*cpu
)
12575 unsigned rm
= INSTR (20, 16);
12576 unsigned rn
= INSTR (9, 5);
12577 unsigned rd
= INSTR (4, 0);
12579 aarch64_set_reg_u64
12581 shifted64 (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
), LSL
,
12582 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
) & 0x3f)));
12585 /* 32 bit logical shift right. */
12587 lsrv32 (sim_cpu
*cpu
)
12589 unsigned rm
= INSTR (20, 16);
12590 unsigned rn
= INSTR (9, 5);
12591 unsigned rd
= INSTR (4, 0);
12593 aarch64_set_reg_u64
12595 shifted32 (aarch64_get_reg_u32 (cpu
, rn
, NO_SP
), LSR
,
12596 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
) & 0x1f)));
12599 /* 64 bit logical shift right. */
12601 lsrv64 (sim_cpu
*cpu
)
12603 unsigned rm
= INSTR (20, 16);
12604 unsigned rn
= INSTR (9, 5);
12605 unsigned rd
= INSTR (4, 0);
12607 aarch64_set_reg_u64
12609 shifted64 (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
), LSR
,
12610 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
) & 0x3f)));
12613 /* 32 bit rotate right. */
12615 rorv32 (sim_cpu
*cpu
)
12617 unsigned rm
= INSTR (20, 16);
12618 unsigned rn
= INSTR (9, 5);
12619 unsigned rd
= INSTR (4, 0);
12621 aarch64_set_reg_u64
12623 shifted32 (aarch64_get_reg_u32 (cpu
, rn
, NO_SP
), ROR
,
12624 (aarch64_get_reg_u32 (cpu
, rm
, NO_SP
) & 0x1f)));
12627 /* 64 bit rotate right. */
12629 rorv64 (sim_cpu
*cpu
)
12631 unsigned rm
= INSTR (20, 16);
12632 unsigned rn
= INSTR (9, 5);
12633 unsigned rd
= INSTR (4, 0);
12635 aarch64_set_reg_u64
12637 shifted64 (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
), ROR
,
12638 (aarch64_get_reg_u64 (cpu
, rm
, NO_SP
) & 0x3f)));
12644 /* 32 bit signed divide. */
12646 cpuiv32 (sim_cpu
*cpu
)
12648 unsigned rm
= INSTR (20, 16);
12649 unsigned rn
= INSTR (9, 5);
12650 unsigned rd
= INSTR (4, 0);
12651 /* N.B. the pseudo-code does the divide using 64 bit data. */
12652 /* TODO : check that this rounds towards zero as required. */
12653 int64_t dividend
= aarch64_get_reg_s32 (cpu
, rn
, NO_SP
);
12654 int64_t divisor
= aarch64_get_reg_s32 (cpu
, rm
, NO_SP
);
12656 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
,
12657 divisor
? ((int32_t) (dividend
/ divisor
)) : 0);
12660 /* 64 bit signed divide. */
12662 cpuiv64 (sim_cpu
*cpu
)
12664 unsigned rm
= INSTR (20, 16);
12665 unsigned rn
= INSTR (9, 5);
12666 unsigned rd
= INSTR (4, 0);
12668 /* TODO : check that this rounds towards zero as required. */
12669 int64_t divisor
= aarch64_get_reg_s64 (cpu
, rm
, NO_SP
);
12671 aarch64_set_reg_s64
12673 divisor
? (aarch64_get_reg_s64 (cpu
, rn
, NO_SP
) / divisor
) : 0);
12676 /* 32 bit unsigned divide. */
12678 udiv32 (sim_cpu
*cpu
)
12680 unsigned rm
= INSTR (20, 16);
12681 unsigned rn
= INSTR (9, 5);
12682 unsigned rd
= INSTR (4, 0);
12684 /* N.B. the pseudo-code does the divide using 64 bit data. */
12685 uint64_t dividend
= aarch64_get_reg_u32 (cpu
, rn
, NO_SP
);
12686 uint64_t divisor
= aarch64_get_reg_u32 (cpu
, rm
, NO_SP
);
12688 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12689 divisor
? (uint32_t) (dividend
/ divisor
) : 0);
12692 /* 64 bit unsigned divide. */
12694 udiv64 (sim_cpu
*cpu
)
12696 unsigned rm
= INSTR (20, 16);
12697 unsigned rn
= INSTR (9, 5);
12698 unsigned rd
= INSTR (4, 0);
12700 /* TODO : check that this rounds towards zero as required. */
12701 uint64_t divisor
= aarch64_get_reg_u64 (cpu
, rm
, NO_SP
);
12703 aarch64_set_reg_u64
12705 divisor
? (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
) / divisor
) : 0);
12709 dexDataProc2Source (sim_cpu
*cpu
)
12711 /* assert instr[30] == 0
12712 instr[28,21] == 11010110
12713 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12714 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12715 instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> CPUIV,
12716 001000 ==> LSLV, 001001 ==> LSRV
12717 001010 ==> ASRV, 001011 ==> RORV
12721 uint32_t S
= INSTR (29, 29);
12722 uint32_t opcode
= INSTR (15, 10);
12730 dispatch
= ( (INSTR (31, 31) << 3)
12731 | (uimm (opcode
, 3, 3) << 2)
12732 | uimm (opcode
, 1, 0));
12735 case 2: udiv32 (cpu
); return;
12736 case 3: cpuiv32 (cpu
); return;
12737 case 4: lslv32 (cpu
); return;
12738 case 5: lsrv32 (cpu
); return;
12739 case 6: asrv32 (cpu
); return;
12740 case 7: rorv32 (cpu
); return;
12741 case 10: udiv64 (cpu
); return;
12742 case 11: cpuiv64 (cpu
); return;
12743 case 12: lslv64 (cpu
); return;
12744 case 13: lsrv64 (cpu
); return;
12745 case 14: asrv64 (cpu
); return;
12746 case 15: rorv64 (cpu
); return;
12747 default: HALT_UNALLOC
;
12754 /* 32 bit multiply and add. */
12756 madd32 (sim_cpu
*cpu
)
12758 unsigned rm
= INSTR (20, 16);
12759 unsigned ra
= INSTR (14, 10);
12760 unsigned rn
= INSTR (9, 5);
12761 unsigned rd
= INSTR (4, 0);
12763 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
12764 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12765 aarch64_get_reg_u32 (cpu
, ra
, NO_SP
)
12766 + aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
12767 * aarch64_get_reg_u32 (cpu
, rm
, NO_SP
));
12770 /* 64 bit multiply and add. */
12772 madd64 (sim_cpu
*cpu
)
12774 unsigned rm
= INSTR (20, 16);
12775 unsigned ra
= INSTR (14, 10);
12776 unsigned rn
= INSTR (9, 5);
12777 unsigned rd
= INSTR (4, 0);
12779 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
12780 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12781 aarch64_get_reg_u64 (cpu
, ra
, NO_SP
)
12782 + (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
12783 * aarch64_get_reg_u64 (cpu
, rm
, NO_SP
)));
12786 /* 32 bit multiply and sub. */
12788 msub32 (sim_cpu
*cpu
)
12790 unsigned rm
= INSTR (20, 16);
12791 unsigned ra
= INSTR (14, 10);
12792 unsigned rn
= INSTR (9, 5);
12793 unsigned rd
= INSTR (4, 0);
12795 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
12796 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12797 aarch64_get_reg_u32 (cpu
, ra
, NO_SP
)
12798 - aarch64_get_reg_u32 (cpu
, rn
, NO_SP
)
12799 * aarch64_get_reg_u32 (cpu
, rm
, NO_SP
));
12802 /* 64 bit multiply and sub. */
12804 msub64 (sim_cpu
*cpu
)
12806 unsigned rm
= INSTR (20, 16);
12807 unsigned ra
= INSTR (14, 10);
12808 unsigned rn
= INSTR (9, 5);
12809 unsigned rd
= INSTR (4, 0);
12811 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
12812 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
12813 aarch64_get_reg_u64 (cpu
, ra
, NO_SP
)
12814 - aarch64_get_reg_u64 (cpu
, rn
, NO_SP
)
12815 * aarch64_get_reg_u64 (cpu
, rm
, NO_SP
));
12818 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit. */
12820 smaddl (sim_cpu
*cpu
)
12822 unsigned rm
= INSTR (20, 16);
12823 unsigned ra
= INSTR (14, 10);
12824 unsigned rn
= INSTR (9, 5);
12825 unsigned rd
= INSTR (4, 0);
12827 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12828 obtain a 64 bit product. */
12829 aarch64_set_reg_s64
12831 aarch64_get_reg_s64 (cpu
, ra
, NO_SP
)
12832 + ((int64_t) aarch64_get_reg_s32 (cpu
, rn
, NO_SP
))
12833 * ((int64_t) aarch64_get_reg_s32 (cpu
, rm
, NO_SP
)));
12836 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
12838 smsubl (sim_cpu
*cpu
)
12840 unsigned rm
= INSTR (20, 16);
12841 unsigned ra
= INSTR (14, 10);
12842 unsigned rn
= INSTR (9, 5);
12843 unsigned rd
= INSTR (4, 0);
12845 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12846 obtain a 64 bit product. */
12847 aarch64_set_reg_s64
12849 aarch64_get_reg_s64 (cpu
, ra
, NO_SP
)
12850 - ((int64_t) aarch64_get_reg_s32 (cpu
, rn
, NO_SP
))
12851 * ((int64_t) aarch64_get_reg_s32 (cpu
, rm
, NO_SP
)));
/* Integer Multiply/Divide.  */

/* First some macros and a helper function.  */
/* Macros to test or access elements of 64 bit words.  */

/* Mask used to access lo 32 bits of 64 bit unsigned int.  */
#define LOW_WORD_MASK ((1ULL << 32) - 1)
/* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
/* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define highWordToU64(_value_u64) ((_value_u64) >> 32)

/* Offset of sign bit in 64 bit signed integger.  */
#define SIGN_SHIFT_U64 63
/* The sign bit itself -- also identifies the minimum negative int value.
   N.B. 1ULL, not 1UL: unsigned long may be only 32 bits wide, in which
   case shifting it by 63 would be undefined behaviour.  */
#define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
/* Return true if a 64 bit signed int presented as an unsigned int is the
   most negative value.  */
#define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
/* Return true (non-zero) if a 64 bit signed int presented as an unsigned
   int has its sign bit set to false.  */
#define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
/* Return 1L or -1L according to whether a 64 bit signed int presented as
   an unsigned int has its sign bit set or not.
   N.B. fixed: the original expansion referred to "value_u64" (not the
   _value_u64 parameter) and was missing a closing parenthesis, so it
   could never have expanded correctly.  */
#define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
/* Clear the sign bit of a 64 bit signed int presented as an unsigned int.  */
#define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
/* Multiply two 64 bit ints and return
   the hi 64 bits of the 128 bit product.

   Self-contained 32x32 cross-multiply scheme; the garbled original
   accumulated a carry out of the middle sum but never shifted it into
   bit-32 position before folding it into the high word, which loses
   2^32 - 1 whenever the middle sum overflows.  */
static uint64_t
mul64hi (uint64_t value1, uint64_t value2)
{
  uint64_t resultmid1;
  uint64_t result;
  uint64_t value1_lo = value1 & 0xFFFFFFFFULL;   /* low 32 bits  */
  uint64_t value1_hi = value1 >> 32;             /* high 32 bits */
  uint64_t value2_lo = value2 & 0xFFFFFFFFULL;
  uint64_t value2_hi = value2 >> 32;

  /* Cross-multiply and collect results.  */
  uint64_t xproductlo   = value1_lo * value2_lo;
  uint64_t xproductmid1 = value1_lo * value2_hi;
  uint64_t xproductmid2 = value1_hi * value2_lo;
  uint64_t xproducthi   = value1_hi * value2_hi;
  uint64_t carry = 0;

  /* Start accumulating 64 bit results.  */
  /* Drop bottom half of lowest cross-product.  */
  uint64_t resultmid = xproductlo >> 32;

  /* Add in middle products.  */
  resultmid = resultmid + xproductmid1;

  /* Check for overflow.  */
  if (resultmid < xproductmid1)
    /* Carry over 1 into top cross-product.  */
    carry++;

  resultmid1 = resultmid + xproductmid2;

  /* Check for overflow.  */
  if (resultmid1 < xproductmid2)
    /* Carry over 1 into top cross-product.  */
    carry++;

  /* Drop lowest 32 bits of middle cross-product.  */
  result = resultmid1 >> 32;

  /* Move carry bit to just above middle cross-product highest bit.  */
  carry = carry << 32;

  /* Add top cross-product plus and any carry.  */
  result += xproducthi + carry;

  return result;
}
12928 /* Signed multiply high, source, source2 :
12929 64 bit, dest <-- high 64-bit of result. */
12931 smulh (sim_cpu
*cpu
)
12935 unsigned rm
= INSTR (20, 16);
12936 unsigned rn
= INSTR (9, 5);
12937 unsigned rd
= INSTR (4, 0);
12938 GReg ra
= INSTR (14, 10);
12939 int64_t value1
= aarch64_get_reg_u64 (cpu
, rn
, NO_SP
);
12940 int64_t value2
= aarch64_get_reg_u64 (cpu
, rm
, NO_SP
);
12943 int64_t signum
= 1;
12948 /* Convert to unsigned and use the unsigned mul64hi routine
12949 the fix the sign up afterwards. */
12970 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
12971 uresult
= mul64hi (uvalue1
, uvalue2
);
12975 aarch64_set_reg_s64 (cpu
, rd
, NO_SP
, result
);
12978 /* Unsigned multiply add long -- source, source2 :
12979 32 bit, source3 : 64 bit. */
12981 umaddl (sim_cpu
*cpu
)
12983 unsigned rm
= INSTR (20, 16);
12984 unsigned ra
= INSTR (14, 10);
12985 unsigned rn
= INSTR (9, 5);
12986 unsigned rd
= INSTR (4, 0);
12988 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
12989 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12990 obtain a 64 bit product. */
12991 aarch64_set_reg_u64
12993 aarch64_get_reg_u64 (cpu
, ra
, NO_SP
)
12994 + ((uint64_t) aarch64_get_reg_u32 (cpu
, rn
, NO_SP
))
12995 * ((uint64_t) aarch64_get_reg_u32 (cpu
, rm
, NO_SP
)));
12998 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
13000 umsubl (sim_cpu
*cpu
)
13002 unsigned rm
= INSTR (20, 16);
13003 unsigned ra
= INSTR (14, 10);
13004 unsigned rn
= INSTR (9, 5);
13005 unsigned rd
= INSTR (4, 0);
13007 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13008 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13009 obtain a 64 bit product. */
13010 aarch64_set_reg_u64
13012 aarch64_get_reg_u64 (cpu
, ra
, NO_SP
)
13013 - ((uint64_t) aarch64_get_reg_u32 (cpu
, rn
, NO_SP
))
13014 * ((uint64_t) aarch64_get_reg_u32 (cpu
, rm
, NO_SP
)));
13017 /* Unsigned multiply high, source, source2 :
13018 64 bit, dest <-- high 64-bit of result. */
13020 umulh (sim_cpu
*cpu
)
13022 unsigned rm
= INSTR (20, 16);
13023 unsigned rn
= INSTR (9, 5);
13024 unsigned rd
= INSTR (4, 0);
13025 GReg ra
= INSTR (14, 10);
13030 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13031 aarch64_set_reg_u64 (cpu
, rd
, NO_SP
,
13032 mul64hi (aarch64_get_reg_u64 (cpu
, rn
, NO_SP
),
13033 aarch64_get_reg_u64 (cpu
, rm
, NO_SP
)));
13037 dexDataProc3Source (sim_cpu
*cpu
)
13039 /* assert instr[28,24] == 11011. */
13040 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13041 instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13042 instr[23,21] = op31 : 111 ==> UNALLOC, o2 ==> ok
13043 instr[15] = o0 : 0/1 ==> ok
13044 instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB, (32/64 bit)
13045 0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13046 0100 ==> SMULH, (64 bit only)
13047 1010 ==> UMADDL, 1011 ==> UNSUBL, (64 bit only)
13048 1100 ==> UMULH (64 bit only)
13052 uint32_t size
= INSTR (31, 31);
13053 uint32_t op54
= INSTR (30, 29);
13054 uint32_t op31
= INSTR (23, 21);
13055 uint32_t o0
= INSTR (15, 15);
13072 dispatch
= (op31
<< 1) | o0
;
13076 case 0: madd64 (cpu
); return;
13077 case 1: msub64 (cpu
); return;
13078 case 2: smaddl (cpu
); return;
13079 case 3: smsubl (cpu
); return;
13080 case 4: smulh (cpu
); return;
13081 case 10: umaddl (cpu
); return;
13082 case 11: umsubl (cpu
); return;
13083 case 12: umulh (cpu
); return;
13084 default: HALT_UNALLOC
;
13089 dexDPReg (sim_cpu
*cpu
)
13091 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13092 assert group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13093 bits [28:24:21] of a DPReg are the secondary dispatch vector. */
13094 uint32_t group2
= dispatchDPReg (aarch64_get_instr (cpu
));
13098 case DPREG_LOG_000
:
13099 case DPREG_LOG_001
:
13100 dexLogicalShiftedRegister (cpu
); return;
13102 case DPREG_ADDSHF_010
:
13103 dexAddSubtractShiftedRegister (cpu
); return;
13105 case DPREG_ADDEXT_011
:
13106 dexAddSubtractExtendedRegister (cpu
); return;
13108 case DPREG_ADDCOND_100
:
13110 /* This set bundles a variety of different operations. */
13112 /* 1) add/sub w carry. */
13113 uint32_t mask1
= 0x1FE00000U
;
13114 uint32_t val1
= 0x1A000000U
;
13115 /* 2) cond compare register/immediate. */
13116 uint32_t mask2
= 0x1FE00000U
;
13117 uint32_t val2
= 0x1A400000U
;
13118 /* 3) cond select. */
13119 uint32_t mask3
= 0x1FE00000U
;
13120 uint32_t val3
= 0x1A800000U
;
13121 /* 4) data proc 1/2 source. */
13122 uint32_t mask4
= 0x1FE00000U
;
13123 uint32_t val4
= 0x1AC00000U
;
13125 if ((aarch64_get_instr (cpu
) & mask1
) == val1
)
13126 dexAddSubtractWithCarry (cpu
);
13128 else if ((aarch64_get_instr (cpu
) & mask2
) == val2
)
13131 else if ((aarch64_get_instr (cpu
) & mask3
) == val3
)
13132 dexCondSelect (cpu
);
13134 else if ((aarch64_get_instr (cpu
) & mask4
) == val4
)
13136 /* Bit 30 is clear for data proc 2 source
13137 and set for data proc 1 source. */
13138 if (aarch64_get_instr (cpu
) & (1U << 30))
13139 dexDataProc1Source (cpu
);
13141 dexDataProc2Source (cpu
);
13145 /* Should not reach here. */
13151 case DPREG_3SRC_110
:
13152 dexDataProc3Source (cpu
); return;
13154 case DPREG_UNALLOC_101
:
13157 case DPREG_3SRC_111
:
13158 dexDataProc3Source (cpu
); return;
13161 /* Should never reach here. */
13166 /* Unconditional Branch immediate.
13167 Offset is a PC-relative byte offset in the range +/- 128MiB.
13168 The offset is assumed to be raw from the decode i.e. the
13169 simulator is expected to scale them from word offsets to byte. */
13171 /* Unconditional branch. */
13173 buc (sim_cpu
*cpu
, int32_t offset
)
13175 aarch64_set_next_PC_by_offset (cpu
, offset
);
13178 static unsigned stack_depth
= 0;
13180 /* Unconditional branch and link -- writes return PC to LR. */
13182 bl (sim_cpu
*cpu
, int32_t offset
)
13184 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13185 aarch64_save_LR (cpu
);
13186 aarch64_set_next_PC_by_offset (cpu
, offset
);
13188 if (TRACE_BRANCH_P (cpu
))
13192 " %*scall %" PRIx64
" [%s]"
13193 " [args: %" PRIx64
" %" PRIx64
" %" PRIx64
"]",
13194 stack_depth
, " ", aarch64_get_next_PC (cpu
),
13195 aarch64_get_func (CPU_STATE (cpu
),
13196 aarch64_get_next_PC (cpu
)),
13197 aarch64_get_reg_u64 (cpu
, 0, NO_SP
),
13198 aarch64_get_reg_u64 (cpu
, 1, NO_SP
),
13199 aarch64_get_reg_u64 (cpu
, 2, NO_SP
)
13204 /* Unconditional Branch register.
13205 Branch/return address is in source register. */
13207 /* Unconditional branch. */
13211 unsigned rn
= INSTR (9, 5);
13212 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13213 aarch64_set_next_PC (cpu
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
));
13216 /* Unconditional branch and link -- writes return PC to LR. */
13220 unsigned rn
= INSTR (9, 5);
13222 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13223 /* The pseudo code in the spec says we update LR before fetching.
13224 the value from the rn. */
13225 aarch64_save_LR (cpu
);
13226 aarch64_set_next_PC (cpu
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
));
13228 if (TRACE_BRANCH_P (cpu
))
13232 " %*scall %" PRIx64
" [%s]"
13233 " [args: %" PRIx64
" %" PRIx64
" %" PRIx64
"]",
13234 stack_depth
, " ", aarch64_get_next_PC (cpu
),
13235 aarch64_get_func (CPU_STATE (cpu
),
13236 aarch64_get_next_PC (cpu
)),
13237 aarch64_get_reg_u64 (cpu
, 0, NO_SP
),
13238 aarch64_get_reg_u64 (cpu
, 1, NO_SP
),
13239 aarch64_get_reg_u64 (cpu
, 2, NO_SP
)
13244 /* Return -- assembler will default source to LR this is functionally
13245 equivalent to br but, presumably, unlike br it side effects the
13246 branch predictor. */
13250 unsigned rn
= INSTR (9, 5);
13251 aarch64_set_next_PC (cpu
, aarch64_get_reg_u64 (cpu
, rn
, NO_SP
));
13253 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13254 if (TRACE_BRANCH_P (cpu
))
13257 " %*sreturn [result: %" PRIx64
"]",
13258 stack_depth
, " ", aarch64_get_reg_u64 (cpu
, 0, NO_SP
));
13263 /* NOP -- we implement this and call it from the decode in case we
13264 want to intercept it later. */
13269 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13272 /* Data synchronization barrier. */
13277 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13280 /* Data memory barrier. */
13285 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13288 /* Instruction synchronization barrier. */
13293 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13297 dexBranchImmediate (sim_cpu
*cpu
)
13299 /* assert instr[30,26] == 00101
13300 instr[31] ==> 0 == B, 1 == BL
13301 instr[25,0] == imm26 branch offset counted in words. */
13303 uint32_t top
= INSTR (31, 31);
13304 /* We have a 26 byte signed word offset which we need to pass to the
13305 execute routine as a signed byte offset. */
13306 int32_t offset
= simm32 (aarch64_get_instr (cpu
), 25, 0) << 2;
13314 /* Control Flow. */
13316 /* Conditional branch
13318 Offset is a PC-relative byte offset in the range +/- 1MiB pos is
13319 a bit position in the range 0 .. 63
13321 cc is a CondCode enum value as pulled out of the decode
13323 N.B. any offset register (source) can only be Xn or Wn. */
13326 bcc (sim_cpu
*cpu
, int32_t offset
, CondCode cc
)
13328 /* The test returns TRUE if CC is met. */
13329 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13330 if (testConditionCode (cpu
, cc
))
13331 aarch64_set_next_PC_by_offset (cpu
, offset
);
13334 /* 32 bit branch on register non-zero. */
13336 cbnz32 (sim_cpu
*cpu
, int32_t offset
)
13338 unsigned rt
= INSTR (4, 0);
13340 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13341 if (aarch64_get_reg_u32 (cpu
, rt
, NO_SP
) != 0)
13342 aarch64_set_next_PC_by_offset (cpu
, offset
);
13345 /* 64 bit branch on register zero. */
13347 cbnz (sim_cpu
*cpu
, int32_t offset
)
13349 unsigned rt
= INSTR (4, 0);
13351 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13352 if (aarch64_get_reg_u64 (cpu
, rt
, NO_SP
) != 0)
13353 aarch64_set_next_PC_by_offset (cpu
, offset
);
13356 /* 32 bit branch on register non-zero. */
13358 cbz32 (sim_cpu
*cpu
, int32_t offset
)
13360 unsigned rt
= INSTR (4, 0);
13362 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13363 if (aarch64_get_reg_u32 (cpu
, rt
, NO_SP
) == 0)
13364 aarch64_set_next_PC_by_offset (cpu
, offset
);
13367 /* 64 bit branch on register zero. */
13369 cbz (sim_cpu
*cpu
, int32_t offset
)
13371 unsigned rt
= INSTR (4, 0);
13373 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13374 if (aarch64_get_reg_u64 (cpu
, rt
, NO_SP
) == 0)
13375 aarch64_set_next_PC_by_offset (cpu
, offset
);
13378 /* Branch on register bit test non-zero -- one size fits all. */
13380 tbnz (sim_cpu
*cpu
, uint32_t pos
, int32_t offset
)
13382 unsigned rt
= INSTR (4, 0);
13384 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13385 if (aarch64_get_reg_u64 (cpu
, rt
, NO_SP
) & (((uint64_t) 1) << pos
))
13386 aarch64_set_next_PC_by_offset (cpu
, offset
);
13389 /* Branch on register bit test zero -- one size fits all. */
13391 tbz (sim_cpu
*cpu
, uint32_t pos
, int32_t offset
)
13393 unsigned rt
= INSTR (4, 0);
13395 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13396 if (!(aarch64_get_reg_u64 (cpu
, rt
, NO_SP
) & (((uint64_t) 1) << pos
)))
13397 aarch64_set_next_PC_by_offset (cpu
, offset
);
13401 dexCompareBranchImmediate (sim_cpu
*cpu
)
13403 /* instr[30,25] = 01 1010
13404 instr[31] = size : 0 ==> 32, 1 ==> 64
13405 instr[24] = op : 0 ==> CBZ, 1 ==> CBNZ
13406 instr[23,5] = simm19 branch offset counted in words
13409 uint32_t size
= INSTR (31, 31);
13410 uint32_t op
= INSTR (24, 24);
13411 int32_t offset
= simm32 (aarch64_get_instr (cpu
), 23, 5) << 2;
13416 cbz32 (cpu
, offset
);
13418 cbnz32 (cpu
, offset
);
13425 cbnz (cpu
, offset
);
13430 dexTestBranchImmediate (sim_cpu
*cpu
)
13432 /* instr[31] = b5 : bit 5 of test bit idx
13433 instr[30,25] = 01 1011
13434 instr[24] = op : 0 ==> TBZ, 1 == TBNZ
13435 instr[23,19] = b40 : bits 4 to 0 of test bit idx
13436 instr[18,5] = simm14 : signed offset counted in words
13437 instr[4,0] = uimm5 */
13439 uint32_t pos
= ((INSTR (31, 31) << 5) | INSTR (23, 19));
13440 int32_t offset
= simm32 (aarch64_get_instr (cpu
), 18, 5) << 2;
13442 NYI_assert (30, 25, 0x1b);
13444 if (INSTR (24, 24) == 0)
13445 tbz (cpu
, pos
, offset
);
13447 tbnz (cpu
, pos
, offset
);
13451 dexCondBranchImmediate (sim_cpu
*cpu
)
13453 /* instr[31,25] = 010 1010
13454 instr[24] = op1; op => 00 ==> B.cond
13455 instr[23,5] = simm19 : signed offset counted in words
13457 instr[3,0] = cond */
13460 uint32_t op
= ((INSTR (24, 24) << 1) | INSTR (4, 4));
13462 NYI_assert (31, 25, 0x2a);
13467 offset
= simm32 (aarch64_get_instr (cpu
), 23, 5) << 2;
13469 bcc (cpu
, offset
, INSTR (3, 0));
13473 dexBranchRegister (sim_cpu
*cpu
)
13475 /* instr[31,25] = 110 1011
13476 instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 3 => ERET, 4 => DRPS
13477 instr[20,16] = op2 : must be 11111
13478 instr[15,10] = op3 : must be 000000
13479 instr[4,0] = op2 : must be 11111. */
13481 uint32_t op
= INSTR (24, 21);
13482 uint32_t op2
= INSTR (20, 16);
13483 uint32_t op3
= INSTR (15, 10);
13484 uint32_t op4
= INSTR (4, 0);
13486 NYI_assert (31, 25, 0x6b);
13488 if (op2
!= 0x1F || op3
!= 0 || op4
!= 0)
13502 /* ERET and DRPS accept 0b11111 for rn = instr [4,0]. */
13503 /* anything else is unallocated. */
13504 uint32_t rn
= INSTR (4, 0);
13509 if (op
== 4 || op
== 5)
/* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
   but this may not be available.  So instead we define the values we need
   here.  */
#define AngelSVC_Reason_Open		0x01
#define AngelSVC_Reason_Close		0x02
#define AngelSVC_Reason_Write		0x05
#define AngelSVC_Reason_Read		0x06
#define AngelSVC_Reason_IsTTY		0x09
#define AngelSVC_Reason_Seek		0x0A
#define AngelSVC_Reason_FLen		0x0C
#define AngelSVC_Reason_Remove		0x0E
#define AngelSVC_Reason_Rename		0x0F
#define AngelSVC_Reason_Clock		0x10
#define AngelSVC_Reason_Time		0x11
#define AngelSVC_Reason_System		0x12
#define AngelSVC_Reason_Errno		0x13
#define AngelSVC_Reason_GetCmdLine	0x15
#define AngelSVC_Reason_HeapInfo	0x16
#define AngelSVC_Reason_ReportException 0x18
#define AngelSVC_Reason_Elapsed		0x30
13539 handle_halt (sim_cpu
*cpu
, uint32_t val
)
13541 uint64_t result
= 0;
13543 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13546 TRACE_SYSCALL (cpu
, " HLT [0x%x]", val
);
13547 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13548 sim_stopped
, SIM_SIGTRAP
);
13551 /* We have encountered an Angel SVC call. See if we can process it. */
13552 switch (aarch64_get_reg_u32 (cpu
, 0, NO_SP
))
13554 case AngelSVC_Reason_HeapInfo
:
13556 /* Get the values. */
13557 uint64_t stack_top
= aarch64_get_stack_start (cpu
);
13558 uint64_t heap_base
= aarch64_get_heap_start (cpu
);
13560 /* Get the pointer */
13561 uint64_t ptr
= aarch64_get_reg_u64 (cpu
, 1, SP_OK
);
13562 ptr
= aarch64_get_mem_u64 (cpu
, ptr
);
13564 /* Fill in the memory block. */
13565 /* Start addr of heap. */
13566 aarch64_set_mem_u64 (cpu
, ptr
+ 0, heap_base
);
13567 /* End addr of heap. */
13568 aarch64_set_mem_u64 (cpu
, ptr
+ 8, stack_top
);
13569 /* Lowest stack addr. */
13570 aarch64_set_mem_u64 (cpu
, ptr
+ 16, heap_base
);
13571 /* Initial stack addr. */
13572 aarch64_set_mem_u64 (cpu
, ptr
+ 24, stack_top
);
13574 TRACE_SYSCALL (cpu
, " AngelSVC: Get Heap Info");
13578 case AngelSVC_Reason_Open
:
13580 /* Get the pointer */
13581 /* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);. */
13582 /* FIXME: For now we just assume that we will only be asked
13583 to open the standard file descriptors. */
13587 TRACE_SYSCALL (cpu
, " AngelSVC: Open file %d", fd
- 1);
13591 case AngelSVC_Reason_Close
:
13593 uint64_t fh
= aarch64_get_reg_u64 (cpu
, 1, SP_OK
);
13594 TRACE_SYSCALL (cpu
, " AngelSVC: Close file %d", (int) fh
);
13599 case AngelSVC_Reason_Errno
:
13601 TRACE_SYSCALL (cpu
, " AngelSVC: Get Errno");
13604 case AngelSVC_Reason_Clock
:
13606 #ifdef CLOCKS_PER_SEC
13607 (CLOCKS_PER_SEC
>= 100)
13608 ? (clock () / (CLOCKS_PER_SEC
/ 100))
13609 : ((clock () * 100) / CLOCKS_PER_SEC
)
13611 /* Presume unix... clock() returns microseconds. */
13615 TRACE_SYSCALL (cpu
, " AngelSVC: Get Clock");
13618 case AngelSVC_Reason_GetCmdLine
:
13620 /* Get the pointer */
13621 uint64_t ptr
= aarch64_get_reg_u64 (cpu
, 1, SP_OK
);
13622 ptr
= aarch64_get_mem_u64 (cpu
, ptr
);
13624 /* FIXME: No command line for now. */
13625 aarch64_set_mem_u64 (cpu
, ptr
, 0);
13626 TRACE_SYSCALL (cpu
, " AngelSVC: Get Command Line");
13630 case AngelSVC_Reason_IsTTY
:
13632 TRACE_SYSCALL (cpu
, " AngelSVC: IsTTY ?");
13635 case AngelSVC_Reason_Write
:
13637 /* Get the pointer */
13638 uint64_t ptr
= aarch64_get_reg_u64 (cpu
, 1, SP_OK
);
13639 /* Get the write control block. */
13640 uint64_t fd
= aarch64_get_mem_u64 (cpu
, ptr
);
13641 uint64_t buf
= aarch64_get_mem_u64 (cpu
, ptr
+ 8);
13642 uint64_t len
= aarch64_get_mem_u64 (cpu
, ptr
+ 16);
13644 TRACE_SYSCALL (cpu
, "write of %" PRIx64
" bytes from %"
13645 PRIx64
" on descriptor %" PRIx64
,
13650 TRACE_SYSCALL (cpu
,
13651 " AngelSVC: Write: Suspiciously long write: %ld",
13653 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13654 sim_stopped
, SIM_SIGBUS
);
13658 printf ("%.*s", (int) len
, aarch64_get_mem_ptr (cpu
, buf
));
13662 TRACE (cpu
, 0, "\n");
13663 sim_io_eprintf (CPU_STATE (cpu
), "%.*s",
13664 (int) len
, aarch64_get_mem_ptr (cpu
, buf
));
13665 TRACE (cpu
, 0, "\n");
13669 TRACE_SYSCALL (cpu
,
13670 " AngelSVC: Write: Unexpected file handle: %d",
13672 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13673 sim_stopped
, SIM_SIGABRT
);
13678 case AngelSVC_Reason_ReportException
:
13680 /* Get the pointer */
13681 uint64_t ptr
= aarch64_get_reg_u64 (cpu
, 1, SP_OK
);
13682 /*ptr = aarch64_get_mem_u64 (cpu, ptr);. */
13683 uint64_t type
= aarch64_get_mem_u64 (cpu
, ptr
);
13684 uint64_t state
= aarch64_get_mem_u64 (cpu
, ptr
+ 8);
13686 TRACE_SYSCALL (cpu
,
13687 "Angel Exception: type 0x%" PRIx64
" state %" PRIx64
,
13690 if (type
== 0x20026)
13691 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13692 sim_exited
, state
);
13694 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13695 sim_stopped
, SIM_SIGINT
);
13699 case AngelSVC_Reason_Read
:
13700 case AngelSVC_Reason_FLen
:
13701 case AngelSVC_Reason_Seek
:
13702 case AngelSVC_Reason_Remove
:
13703 case AngelSVC_Reason_Time
:
13704 case AngelSVC_Reason_System
:
13705 case AngelSVC_Reason_Rename
:
13706 case AngelSVC_Reason_Elapsed
:
13708 TRACE_SYSCALL (cpu
, " HLT [Unknown angel %x]",
13709 aarch64_get_reg_u32 (cpu
, 0, NO_SP
));
13710 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13711 sim_stopped
, SIM_SIGTRAP
);
13714 aarch64_set_reg_u64 (cpu
, 0, NO_SP
, result
);
13718 dexExcpnGen (sim_cpu
*cpu
)
13720 /* instr[31:24] = 11010100
13721 instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13722 010 ==> HLT, 101 ==> DBG GEN EXCPN
13723 instr[20,5] = imm16
13724 instr[4,2] = opc2 000 ==> OK, ow ==> UNALLOC
13725 instr[1,0] = LL : discriminates opc */
13727 uint32_t opc
= INSTR (23, 21);
13728 uint32_t imm16
= INSTR (20, 5);
13729 uint32_t opc2
= INSTR (4, 2);
13732 NYI_assert (31, 24, 0xd4);
13739 /* We only implement HLT and BRK for now. */
13740 if (opc
== 1 && LL
== 0)
13742 TRACE_EVENTS (cpu
, " BRK [0x%x]", imm16
);
13743 sim_engine_halt (CPU_STATE (cpu
), cpu
, NULL
, aarch64_get_PC (cpu
),
13744 sim_exited
, aarch64_get_reg_s32 (cpu
, R0
, SP_OK
));
13747 if (opc
== 2 && LL
== 0)
13748 handle_halt (cpu
, imm16
);
13750 else if (opc
== 0 || opc
== 5)
13757 /* Stub for accessing system registers. */
13760 system_get (sim_cpu
*cpu
, unsigned op0
, unsigned op1
, unsigned crn
,
13761 unsigned crm
, unsigned op2
)
13763 if (crn
== 0 && op1
== 3 && crm
== 0 && op2
== 7)
13764 /* DCZID_EL0 - the Data Cache Zero ID register.
13765 We do not support DC ZVA at the moment, so
13766 we return a value with the disable bit set.
13767 We implement support for the DCZID register since
13768 it is used by the C library's memset function. */
13769 return ((uint64_t) 1) << 4;
13771 if (crn
== 0 && op1
== 3 && crm
== 0 && op2
== 1)
13772 /* Cache Type Register. */
13773 return 0x80008000UL
;
13775 if (crn
== 13 && op1
== 3 && crm
== 0 && op2
== 2)
13776 /* TPIDR_EL0 - thread pointer id. */
13777 return aarch64_get_thread_id (cpu
);
13779 if (op1
== 3 && crm
== 4 && op2
== 0)
13780 return aarch64_get_FPCR (cpu
);
13782 if (op1
== 3 && crm
== 4 && op2
== 1)
13783 return aarch64_get_FPSR (cpu
);
13785 else if (op1
== 3 && crm
== 2 && op2
== 0)
13786 return aarch64_get_CPSR (cpu
);
13792 system_set (sim_cpu
*cpu
, unsigned op0
, unsigned op1
, unsigned crn
,
13793 unsigned crm
, unsigned op2
, uint64_t val
)
13795 if (op1
== 3 && crm
== 4 && op2
== 0)
13796 aarch64_set_FPCR (cpu
, val
);
13798 else if (op1
== 3 && crm
== 4 && op2
== 1)
13799 aarch64_set_FPSR (cpu
, val
);
13801 else if (op1
== 3 && crm
== 2 && op2
== 0)
13802 aarch64_set_CPSR (cpu
, val
);
13809 do_mrs (sim_cpu
*cpu
)
13811 /* instr[31:20] = 1101 0101 0001 1
13818 unsigned sys_op0
= INSTR (19, 19) + 2;
13819 unsigned sys_op1
= INSTR (18, 16);
13820 unsigned sys_crn
= INSTR (15, 12);
13821 unsigned sys_crm
= INSTR (11, 8);
13822 unsigned sys_op2
= INSTR (7, 5);
13823 unsigned rt
= INSTR (4, 0);
13825 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13826 aarch64_set_reg_u64 (cpu
, rt
, NO_SP
,
13827 system_get (cpu
, sys_op0
, sys_op1
, sys_crn
, sys_crm
, sys_op2
));
13831 do_MSR_immediate (sim_cpu
*cpu
)
13833 /* instr[31:19] = 1101 0101 0000 0
13835 instr[15,12] = 0100
13838 instr[4,0] = 1 1111 */
13840 unsigned op1
= INSTR (18, 16);
13841 /*unsigned crm = INSTR (11, 8);*/
13842 unsigned op2
= INSTR (7, 5);
13844 NYI_assert (31, 19, 0x1AA0);
13845 NYI_assert (15, 12, 0x4);
13846 NYI_assert (4, 0, 0x1F);
13851 HALT_NYI
; /* set SPSel. */
13858 HALT_NYI
; /* set DAIFset. */
13860 HALT_NYI
; /* set DAIFclr. */
13869 do_MSR_reg (sim_cpu
*cpu
)
13871 /* instr[31:20] = 1101 0101 0001
13879 unsigned sys_op0
= INSTR (19, 19) + 2;
13880 unsigned sys_op1
= INSTR (18, 16);
13881 unsigned sys_crn
= INSTR (15, 12);
13882 unsigned sys_crm
= INSTR (11, 8);
13883 unsigned sys_op2
= INSTR (7, 5);
13884 unsigned rt
= INSTR (4, 0);
13886 NYI_assert (31, 20, 0xD51);
13888 TRACE_DECODE (cpu
, "emulated at line %d", __LINE__
);
13889 system_set (cpu
, sys_op0
, sys_op1
, sys_crn
, sys_crm
, sys_op2
,
13890 aarch64_get_reg_u64 (cpu
, rt
, NO_SP
));
13894 do_SYS (sim_cpu
*cpu
)
13896 /* instr[31,19] = 1101 0101 0000 1
13902 NYI_assert (31, 19, 0x1AA1);
13904 /* FIXME: For now we just silently accept system ops. */
13908 dexSystem (sim_cpu
*cpu
)
13910 /* instr[31:22] = 1101 01010 0
13917 instr[4,0] = uimm5 */
13919 /* We are interested in HINT, DSB, DMB and ISB
13921 Hint #0 encodes NOOP (this is the only hint we care about)
13922 L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
13923 CRm op2 != 0000 000 OR CRm op2 == 0000 000 || CRm op > 0000 101
13925 DSB, DMB, ISB are data store barrier, data memory barrier and
13926 instruction store barrier, respectively, where
13928 L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
13929 op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
13930 CRm<3:2> ==> domain, CRm<1:0> ==> types,
13931 domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
13932 10 ==> InerShareable, 11 ==> FullSystem
13933 types : 01 ==> Reads, 10 ==> Writes,
13934 11 ==> All, 00 ==> All (domain == FullSystem). */
13936 unsigned rt
= INSTR (4, 0);
13938 NYI_assert (31, 22, 0x354);
13940 switch (INSTR (21, 12))
13945 /* NOP has CRm != 0000 OR. */
13946 /* (CRm == 0000 AND (op2 == 000 OR op2 > 101)). */
13947 uint32_t crm
= INSTR (11, 8);
13948 uint32_t op2
= INSTR (7, 5);
13950 if (crm
!= 0 || (op2
== 0 || op2
> 5))
13952 /* Actually call nop method so we can reimplement it later. */
13961 uint32_t op2
= INSTR (7, 5);
13966 case 4: dsb (cpu
); return;
13967 case 5: dmb (cpu
); return;
13968 case 6: isb (cpu
); return;
13969 default: HALT_UNALLOC
;
13980 do_SYS (cpu
); /* DC is an alias of SYS. */
13984 if (INSTR (21, 20) == 0x1)
13986 else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
13987 do_MSR_immediate (cpu
);
13995 dexBr (sim_cpu
*cpu
)
13997 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13998 assert group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
13999 bits [31,29] of a BrExSys are the secondary dispatch vector. */
14000 uint32_t group2
= dispatchBrExSys (aarch64_get_instr (cpu
));
14005 return dexBranchImmediate (cpu
);
14007 case BR_IMMCMP_001
:
14008 /* Compare has bit 25 clear while test has it set. */
14009 if (!INSTR (25, 25))
14010 dexCompareBranchImmediate (cpu
);
14012 dexTestBranchImmediate (cpu
);
14015 case BR_IMMCOND_010
:
14016 /* This is a conditional branch if bit 25 is clear otherwise
14018 if (!INSTR (25, 25))
14019 dexCondBranchImmediate (cpu
);
14024 case BR_UNALLOC_011
:
14028 dexBranchImmediate (cpu
);
14031 case BR_IMMCMP_101
:
14032 /* Compare has bit 25 clear while test has it set. */
14033 if (!INSTR (25, 25))
14034 dexCompareBranchImmediate (cpu
);
14036 dexTestBranchImmediate (cpu
);
14040 /* Unconditional branch reg has bit 25 set. */
14041 if (INSTR (25, 25))
14042 dexBranchRegister (cpu
);
14044 /* This includes both Excpn Gen, System and unalloc operations.
14045 We need to decode the Excpn Gen operation BRK so we can plant
14046 debugger entry points.
14047 Excpn Gen operations have instr [24] = 0.
14048 we need to decode at least one of the System operations NOP
14049 which is an alias for HINT #0.
14050 System operations have instr [24,22] = 100. */
14051 else if (INSTR (24, 24) == 0)
14054 else if (INSTR (24, 22) == 4)
14062 case BR_UNALLOC_111
:
14066 /* Should never reach here. */
14072 aarch64_decode_and_execute (sim_cpu
*cpu
, uint64_t pc
)
14074 /* We need to check if gdb wants an in here. */
14075 /* checkBreak (cpu);. */
14077 uint64_t group
= dispatchGroup (aarch64_get_instr (cpu
));
14081 case GROUP_PSEUDO_0000
: dexPseudo (cpu
); break;
14082 case GROUP_LDST_0100
: dexLdSt (cpu
); break;
14083 case GROUP_DPREG_0101
: dexDPReg (cpu
); break;
14084 case GROUP_LDST_0110
: dexLdSt (cpu
); break;
14085 case GROUP_ADVSIMD_0111
: dexAdvSIMD0 (cpu
); break;
14086 case GROUP_DPIMM_1000
: dexDPImm (cpu
); break;
14087 case GROUP_DPIMM_1001
: dexDPImm (cpu
); break;
14088 case GROUP_BREXSYS_1010
: dexBr (cpu
); break;
14089 case GROUP_BREXSYS_1011
: dexBr (cpu
); break;
14090 case GROUP_LDST_1100
: dexLdSt (cpu
); break;
14091 case GROUP_DPREG_1101
: dexDPReg (cpu
); break;
14092 case GROUP_LDST_1110
: dexLdSt (cpu
); break;
14093 case GROUP_ADVSIMD_1111
: dexAdvSIMD1 (cpu
); break;
14095 case GROUP_UNALLOC_0001
:
14096 case GROUP_UNALLOC_0010
:
14097 case GROUP_UNALLOC_0011
:
14101 /* Should never reach here. */
14107 aarch64_step (sim_cpu
*cpu
)
14109 uint64_t pc
= aarch64_get_PC (cpu
);
14111 if (pc
== TOP_LEVEL_RETURN_PC
)
14114 aarch64_set_next_PC (cpu
, pc
+ 4);
14116 /* Code is always little-endian. */
14117 sim_core_read_buffer (CPU_STATE (cpu
), cpu
, read_map
,
14118 & aarch64_get_instr (cpu
), pc
, 4);
14119 aarch64_get_instr (cpu
) = endian_le2h_4 (aarch64_get_instr (cpu
));
14121 TRACE_INSN (cpu
, " pc = %" PRIx64
" instr = %08x", pc
,
14122 aarch64_get_instr (cpu
));
14123 TRACE_DISASM (cpu
, pc
);
14125 aarch64_decode_and_execute (cpu
, pc
);
14131 aarch64_run (SIM_DESC sd
)
14133 sim_cpu
*cpu
= STATE_CPU (sd
, 0);
14135 while (aarch64_step (cpu
))
14137 aarch64_update_PC (cpu
);
14139 if (sim_events_tick (sd
))
14140 sim_events_process (sd
);
14143 sim_engine_halt (sd
, cpu
, NULL
, aarch64_get_PC (cpu
),
14144 sim_exited
, aarch64_get_reg_s32 (cpu
, R0
, NO_SP
));
14148 aarch64_init (sim_cpu
*cpu
, uint64_t pc
)
14150 uint64_t sp
= aarch64_get_stack_start (cpu
);
14152 /* Install SP, FP and PC and set LR to -20
14153 so we can detect a top-level return. */
14154 aarch64_set_reg_u64 (cpu
, SP
, SP_OK
, sp
);
14155 aarch64_set_reg_u64 (cpu
, FP
, SP_OK
, sp
);
14156 aarch64_set_reg_u64 (cpu
, LR
, SP_OK
, TOP_LEVEL_RETURN_PC
);
14157 aarch64_set_next_PC (cpu
, pc
);
14158 aarch64_update_PC (cpu
);
14159 aarch64_init_LIT_table ();