/* simulator.c -- Interface for the AArch64 simulator.

   Copyright (C) 2015-2017 Free Software Foundation, Inc.

   Contributed by Red Hat.

   This file is part of GDB.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <math.h>
#include <time.h>
#include <limits.h>

#include "simulator.h"
#include "cpustate.h"
#include "memory.h"

#define NO_SP 0
#define SP_OK 1

#define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)

/* Space saver macro.  */
#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))

#define HALT_UNALLOC \
  do \
    { \
      TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
      TRACE_INSN (cpu, \
                  "Unallocated instruction detected at sim line %d," \
                  " exe addr %" PRIx64, \
                  __LINE__, aarch64_get_PC (cpu)); \
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu), \
                       sim_stopped, SIM_SIGILL); \
    } \
  while (0)

#define HALT_NYI \
  do \
    { \
      TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
      TRACE_INSN (cpu, \
                  "Unimplemented instruction detected at sim line %d," \
                  " exe addr %" PRIx64, \
                  __LINE__, aarch64_get_PC (cpu)); \
      if (! TRACE_ANY_P (cpu)) \
        sim_io_eprintf (CPU_STATE (cpu), \
                        "SIM Error: Unimplemented instruction: %#08x\n", \
                        aarch64_get_instr (cpu)); \
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu), \
                       sim_stopped, SIM_SIGABRT); \
    } \
  while (0)

#define NYI_assert(HI, LO, EXPECTED) \
  do \
    { \
      if (INSTR ((HI), (LO)) != (EXPECTED)) \
        HALT_NYI; \
    } \
  while (0)

/* Helper functions used by expand_logical_immediate.  */

/* For i = 1, .., N result<i-1> = 1; all other bits are zero.  */
static inline uint64_t
ones (int N)
{
  /* Use uint64_t arithmetic throughout so the result is correct even
     on hosts where unsigned long is only 32 bits wide.  */
  return (N == 64 ? (uint64_t) -1 : (((uint64_t) 1 << N) - 1));
}

/* Return val<N> in result<0>; all other bits are zero.  */
static inline uint64_t
pickbit (uint64_t val, int N)
{
  return pickbits64 (val, N, N);
}

static uint64_t
expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
{
  uint64_t mask;
  uint64_t imm;
  unsigned simd_size;

  /* The immediate value is S+1 bits to 1, left rotated by SIMDsize - R
     (in other words, right rotated by R), then replicated.  */
  if (N != 0)
    {
      simd_size = 64;
      mask = 0xffffffffffffffffull;
    }
  else
    {
      switch (S)
        {
        case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32;           break;
        case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
        case 0x30 ... 0x37: /* 110xxx */ simd_size =  8; S &= 0x7; break;
        case 0x38 ... 0x3b: /* 1110xx */ simd_size =  4; S &= 0x3; break;
        case 0x3c ... 0x3d: /* 11110x */ simd_size =  2; S &= 0x1; break;
        default: return 0;
        }
      mask = (1ull << simd_size) - 1;
      /* Top bits are IGNORED.  */
      R &= simd_size - 1;
    }

  /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
  if (S == simd_size - 1)
    return 0;

  /* S+1 consecutive bits to 1.  */
  /* NOTE: S can't be 63 due to detection above.  */
  imm = (1ull << (S + 1)) - 1;

  /* Rotate to the left by simd_size - R.  */
  if (R != 0)
    imm = ((imm << (simd_size - R)) & mask) | (imm >> R);

  /* Replicate the value according to SIMD size.  */
  switch (simd_size)
    {
    case  2: imm = (imm <<  2) | imm;
    case  4: imm = (imm <<  4) | imm;
    case  8: imm = (imm <<  8) | imm;
    case 16: imm = (imm << 16) | imm;
    case 32: imm = (imm << 32) | imm;
    case 64: break;
    default: return 0;
    }

  return imm;
}
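
/* As a hand-worked example of the expansion above (illustrative, not
   an exhaustive check): N=0, imms=0x3c, immr=0 selects simd_size == 2
   with S masked down to 0, so the element is the single set bit 0b01;
   replicating it across all 32 two-bit elements yields
   0x5555555555555555.  */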

/* Instr[22,10] encodes N, immr and imms.  We want a lookup table
   for each possible combination, i.e. 13 bits worth of int entries.  */
#define LI_TABLE_SIZE (1 << 13)
static uint64_t LITable[LI_TABLE_SIZE];

void
aarch64_init_LIT_table (void)
{
  unsigned index;

  for (index = 0; index < LI_TABLE_SIZE; index++)
    {
      uint32_t N    = uimm (index, 12, 12);
      uint32_t immr = uimm (index, 11, 6);
      uint32_t imms = uimm (index, 5, 0);

      LITable [index] = expand_logical_immediate (imms, immr, N);
    }
}
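
/* The intent (an assumption about the decode paths, which live
   elsewhere in this file) is that a decoder can then map instr[22,10]
   straight to the expanded immediate with something like
   LITable [INSTR (22, 10)], treating a zero entry as an unallocated
   encoding, since expand_logical_immediate returns 0 for invalid
   combinations.  */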

static void
dexNotify (sim_cpu *cpu)
{
  /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
                           2 ==> exit Java, 3 ==> start next bytecode.  */
  uint32_t type = INSTR (14, 0);

  TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);

  switch (type)
    {
    case 0:
      /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
         aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 1:
      /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
         aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    case 2:
      /* aarch64_notifyMethodExit ();  */
      break;
    case 3:
      /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
         aarch64_get_reg_u64 (cpu, R22, 0));  */
      break;
    }
}

/* Secondary decode within top level groups.  */

static void
dexPseudo (sim_cpu *cpu)
{
  /* assert instr[28,27] = 00

     We provide 2 pseudo instructions:

     HALT stops execution of the simulator causing an immediate
     return to the x86 code which entered it.

     CALLOUT initiates recursive entry into x86 code.  A register
     argument holds the address of the x86 routine.  Immediate
     values in the instruction identify the number of general
     purpose and floating point register arguments to be passed
     and the type of any value to be returned.  */

  uint32_t PSEUDO_HALT     = 0xE0000000U;
  uint32_t PSEUDO_CALLOUT  = 0x00018000U;
  uint32_t PSEUDO_CALLOUTR = 0x00018001U;
  uint32_t PSEUDO_NOTIFY   = 0x00014000U;
  uint32_t dispatch;

  if (aarch64_get_instr (cpu) == PSEUDO_HALT)
    {
      TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
                       sim_stopped, SIM_SIGTRAP);
    }

  dispatch = INSTR (31, 15);

  /* We do not handle callouts at the moment.  */
  if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
    {
      TRACE_EVENTS (cpu, " Callout");
      sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
                       sim_stopped, SIM_SIGABRT);
    }

  else if (dispatch == PSEUDO_NOTIFY)
    dexNotify (cpu);

  else
    HALT_UNALLOC;
}

/* Load-store single register (unscaled offset)
   These instructions employ a base register plus an unscaled signed
   9 bit offset.

   N.B. the base register (source) can be Xn or SP.  All other
   registers may not be SP.  */

/* 32 bit load 32 bit unscaled signed 9 bit.  */
static void
ldur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 64 bit load 64 bit unscaled signed 9 bit.  */
static void
ldur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit.  */
static void
ldurb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 32 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit.  */
static void
ldursb64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 32 bit load zero-extended short unscaled signed 9 bit.  */
static void
ldurh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 32 bit load sign-extended short unscaled signed 9 bit.  */
static void
ldursh32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 64 bit load sign-extended short unscaled signed 9 bit.  */
static void
ldursh64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* 64 bit load sign-extended word unscaled signed 9 bit.  */
static void
ldursw (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* LDURSW sign-extends the loaded word into the 64 bit destination,
     so do not truncate the value back to 32 bits here.  */
  aarch64_set_reg_s64 (cpu, rd, NO_SP, aarch64_get_mem_s32
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + offset));
}

/* N.B. with stores the value in source is written to the address
   identified by source2 modified by offset.  */

/* 32 bit store 32 bit unscaled signed 9 bit.  */
static void
stur32 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu,
                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
                       aarch64_get_reg_u32 (cpu, rd, NO_SP));
}

/* 64 bit store 64 bit unscaled signed 9 bit.  */
static void
stur64 (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
                       aarch64_get_reg_u64 (cpu, rd, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit.  */
static void
sturb (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu,
                      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
                      aarch64_get_reg_u8 (cpu, rd, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit.  */
static void
sturh (sim_cpu *cpu, int32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu,
                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
                       aarch64_get_reg_u16 (cpu, rd, NO_SP));
}

/* Load single register pc-relative label
   Offset is a signed 19 bit immediate count in words
   rt may not be SP.  */

/* 32 bit pc-relative load.  */
static void
ldr32_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_mem_u32
                       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* 64 bit pc-relative load.  */
static void
ldr_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_mem_u64
                       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* Sign extended 32 bit pc-relative load.  */
static void
ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_mem_s32
                       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* Float pc-relative load.  */
static void
fldrs_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0,
                       aarch64_get_mem_u32
                       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* Double pc-relative load.  */
static void
fldrd_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0,
                       aarch64_get_mem_u64
                       (cpu, aarch64_get_PC (cpu) + offset * 4));
}

/* Long double pc-relative load.  */
static void
fldrq_pcrel (sim_cpu *cpu, int32_t offset)
{
  unsigned int st = INSTR (4, 0);
  uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
  FRegister a;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, addr, & a);
  aarch64_set_FP_long_double (cpu, st, a);
}

/* This can be used to scale an offset by applying
   the requisite shift.  The second argument is either
   16, 32, 64 or 128.  */

#define SCALE(_offset, _elementSize) \
  ((_offset) << ScaleShift ## _elementSize)

/* This can be used to optionally scale a register derived offset
   by applying the requisite shift as indicated by the Scaling
   argument.  The second argument is the element size as a bit count,
   as for SCALE above.  The third argument is either Scaled or
   Unscaled.  N.B. when _Scaling is Scaled the shift is applied in
   full, while when it is Unscaled no shift is applied at all.  */

#define OPT_SCALE(_offset, _elementType, _Scaling) \
  ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
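
/* For example (informal, relying on ScaleShift64 being 3 so that a
   64 bit element scales by 8 bytes, and assuming the Scaled
   enumerator is non-zero): SCALE (2, 64) gives a byte offset of 16,
   OPT_SCALE (2, 64, Scaled) likewise gives 16, while
   OPT_SCALE (2, 64, Unscaled) leaves the offset at 2.  */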

/* This can be used to zero or sign extend a 32 bit register derived
   value to a 64 bit value.  The first argument must be the value as
   a uint32_t and the second must be either UXTW or SXTW.  The result
   is returned as an int64_t.  */

static inline int64_t
extend (uint32_t value, Extension extension)
{
  union
  {
    uint32_t u;
    int32_t  n;
  } x;

  /* A branchless variant of this ought to be possible.  */
  if (extension == UXTW || extension == NoExtension)
    return value;

  x.u = value;
  return x.n;
}
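
/* Informally: extend (0xffffff80, SXTW) reinterprets the bits as the
   signed 32 bit value -128 and returns -128 as an int64_t, while
   extend (0xffffff80, UXTW) returns 0xffffff80 unchanged.  */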

/* Scalar Floating Point

   FP load/store single register (4 addressing modes)

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.  */

/* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}
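
/* A quick sanity check of the writeback convention used by this and
   all the _wb routines below: with rn holding 0x1000 and offset 16,
   Pre loads from 0x1010 and leaves rn at 0x1010, Post loads from
   0x1000 and leaves rn at 0x1010, and NoWriteBack loads from 0x1010
   leaving rn untouched.  */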

/* Load 8 bit with unsigned 12 bit offset.  */
static void
fldrb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* An 8 bit load must read only one byte of memory.  */
  aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
}

/* Load 16 bit scaled unsigned 12 bit.  */
static void
fldrh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
}

/* Load 32 bit scaled unsigned 12 bit.  */
static void
fldrs_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
}

/* Load 64 bit scaled unsigned 12 bit.  */
static void
fldrd_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
}

/* Load 128 bit scaled unsigned 12 bit.  */
static void
fldrq_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rd = INSTR (4, 0);
  unsigned rn = INSTR (9, 5);
  uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
  aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
}

/* Load 32 bit scaled or unscaled zero- or sign-extended
   32-bit register offset.  */
static void
fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
                       (cpu, address + displacement));
}

/* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 64 bit scaled or unscaled zero- or sign-extended
   32-bit register offset.  */
static void
fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  fldrd_wb (cpu, displacement, NoWriteBack);
}

/* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  FRegister a;
  unsigned rn = INSTR (9, 5);
  unsigned st = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_get_mem_long_double (cpu, address, & a);
  aarch64_set_FP_long_double (cpu, st, a);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* Load 128 bit scaled or unscaled zero- or sign-extended
   32-bit register offset.  */
static void
fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 128, scaling);

  fldrq_wb (cpu, displacement, NoWriteBack);
}

/* Memory Access

   load-store single register
   There are four addressing modes available here which all employ a
   64 bit source (base) register.

   N.B. the base register (source) can be the stack pointer.
   The secondary source register (source2) can only be an Xn register.

   Scaled, 12-bit, unsigned immediate offset, without pre- and
   post-index options.
   Unscaled, 9-bit, signed immediate offset with pre- or post-index
   writeback.
   Scaled or unscaled 64-bit register offset.
   Scaled or unscaled 32-bit extended register offset.

   All offsets are assumed to arrive raw from the decode, i.e. the
   simulator is expected to scale them according to the accessed data
   size.  The same applies to the register and extended register
   offset versions, except that in the latter case the operation may
   also require a sign extend.

   A separate method is provided for each possible addressing mode.  */

/* 32 bit load 32 bit scaled unsigned 12 bit.  */
static void
ldr32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + SCALE (offset, 32)));
}

/* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load 32 bit scaled or unscaled
   zero- or sign-extended 32-bit register offset.  */
static void
ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       aarch64_get_mem_u32 (cpu, address + displacement));
}

/* 64 bit load 64 bit scaled unsigned 12 bit.  */
static void
ldr_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                        + SCALE (offset, 64)));
}

/* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
static void
ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load 64 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       aarch64_get_mem_u64 (cpu, address + displacement));
}

/* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
static void
ldrb32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       aarch64_get_mem_u8
                       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
}

/* 32 bit load zero-extended byte unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       aarch64_get_mem_u8 (cpu, address + displacement));
}

/* 64 bit load sign-extended byte unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s8 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
static void
ldrsb_abs (sim_cpu *cpu, uint32_t offset)
{
  ldrsb_wb (cpu, offset, NoWriteBack);
}

/* 64 bit load sign-extended byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                 extension);
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
                       aarch64_get_mem_s8 (cpu, address + displacement));
}

/* 32 bit load zero-extended short scaled unsigned 12 bit.  */
static void
ldrh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                             + SCALE (offset, 16));
  aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
}

/* 32 bit load zero-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load zero-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u32 (cpu, rt, NO_SP,
                       aarch64_get_mem_u16 (cpu, address + displacement));
}

/* 32 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int32_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                             + SCALE (offset, 16));
  aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
}

/* 32 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
                       (int32_t) aarch64_get_mem_s16 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s32 (cpu, rt, NO_SP,
                       (int32_t) aarch64_get_mem_s16
                       (cpu, address + displacement));
}

/* 64 bit load sign-extended short scaled unsigned 12 bit.  */
static void
ldrsh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                             + SCALE (offset, 16));
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended short unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;
  int64_t val;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  val = aarch64_get_mem_s16 (cpu, address);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s16 (cpu, address + displacement);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
static void
ldrsw_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  int64_t val;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                             + SCALE (offset, 32));
  /* The target register may not be SP but the source may be.  */
  aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
}

/* 64 bit load sign-extended 32 bit unscaled signed 9 bit
   with pre- or post-writeback.  */
static void
ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit load sign-extended 32 bit scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_s64 (cpu, rt, NO_SP,
                       aarch64_get_mem_s32 (cpu, address + displacement));
}

/* N.B. with stores the value in source is written to the
   address identified by source2 modified by source3/offset.  */

/* 32 bit store scaled unsigned 12 bit.  */
static void
str32_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
                             + SCALE (offset, 32)),
                       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store scaled or unscaled zero- or
   sign-extended 32-bit register offset.  */
static void
str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 32, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u32 (cpu, address + displacement,
                       aarch64_get_reg_u32 (cpu, rt, NO_SP));
}

/* 64 bit store scaled unsigned 12 bit.  */
static void
str_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu,
                       aarch64_get_reg_u64 (cpu, rn, SP_OK)
                       + SCALE (offset, 64),
                       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
static void
str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 64 bit store scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                             extension);
  uint64_t displacement = OPT_SCALE (extended, 64, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u64 (cpu, address + displacement,
                       aarch64_get_reg_u64 (cpu, rt, NO_SP));
}

/* 32 bit store byte scaled unsigned 12 bit.  */
static void
strb_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.
     There is no scaling required for a byte load.  */
  aarch64_set_mem_u8 (cpu,
                      aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
                      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
static void
strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store byte scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                 extension);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* There is no scaling required for a byte load.  */
  aarch64_set_mem_u8 (cpu, address + displacement,
                      aarch64_get_reg_u8 (cpu, rt, NO_SP));
}

/* 32 bit store short scaled unsigned 12 bit.  */
static void
strh_abs (sim_cpu *cpu, uint32_t offset)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* The target register may not be SP but the source may be.  */
  aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
                       + SCALE (offset, 16),
                       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
static void
strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address;

  if (rn == rt && wb != NoWriteBack)
    HALT_UNALLOC;

  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);

  if (wb != Post)
    address += offset;

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));

  if (wb == Post)
    address += offset;

  if (wb != NoWriteBack)
    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
}

/* 32 bit store short scaled or unscaled zero-
   or sign-extended 32-bit register offset.  */
static void
strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  unsigned rm = INSTR (20, 16);
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  /* rn may reference SP, rm and rt must reference ZR.  */

  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
  uint64_t displacement = OPT_SCALE (extended, 16, scaling);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_mem_u16 (cpu, address + displacement,
                       aarch64_get_reg_u16 (cpu, rt, NO_SP));
}

/* Prefetch unsigned 12 bit.  */
static void
prfm_abs (sim_cpu *cpu, uint32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
     + SCALE (offset, 64).  */

  /* TODO : implement prefetch of address.  */
}

/* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.  */
static void
prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     rn may reference SP, rm may only reference ZR
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
     int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                extension);
     uint64_t displacement = OPT_SCALE (extended, 64, scaling);
     uint64_t address = base + displacement.  */

  /* TODO : implement prefetch of address.  */
}

/* 64 bit pc-relative prefetch.  */
static void
prfm_pcrel (sim_cpu *cpu, int32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_PC (cpu) + offset * 4.  */

  /* TODO : implement this.  */
}

/* Load-store exclusive.  */

static void
ldxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int size = INSTR (31, 30);
  /* int ordered = INSTR (15, 15);  */
  /* int exclusive = ! INSTR (23, 23);  */

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (size)
    {
    case 0:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
      break;
    case 1:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
      break;
    case 2:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
      break;
    case 3:
      aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
      break;
    }
}

static void
stxr (sim_cpu *cpu)
{
  unsigned rn = INSTR (9, 5);
  unsigned rt = INSTR (4, 0);
  unsigned rs = INSTR (20, 16);
  uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
  int size = INSTR (31, 30);
  uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);

  switch (size)
    {
    case 0: aarch64_set_mem_u8 (cpu, address, data); break;
    case 1: aarch64_set_mem_u16 (cpu, address, data); break;
    case 2: aarch64_set_mem_u32 (cpu, address, data); break;
    case 3: aarch64_set_mem_u64 (cpu, address, data); break;
    }

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* Exclusive stores always succeed in this simulator.  */
  aarch64_set_reg_u64 (cpu, rs, NO_SP, 0);
}

static void
dexLoadLiteral (sim_cpu *cpu)
{
  /* instr[29,27] == 011
     instr[25,24] == 00
     instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
                            010 ==> LDRX,  011 ==> FLDRD
                            100 ==> LDRSW, 101 ==> FLDRQ
                            110 ==> PRFM,  111 ==> UNALLOC
     instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
     instr[23, 5] == simm19  */

  /* unsigned rt = INSTR (4, 0);  */
  uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
  int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);

  switch (dispatch)
    {
    case 0: ldr32_pcrel (cpu, imm); break;
    case 1: fldrs_pcrel (cpu, imm); break;
    case 2: ldr_pcrel   (cpu, imm); break;
    case 3: fldrd_pcrel (cpu, imm); break;
    case 4: ldrsw_pcrel (cpu, imm); break;
    case 5: fldrq_pcrel (cpu, imm); break;
    case 6: prfm_pcrel  (cpu, imm); break;
    case 7:
    default:
      HALT_UNALLOC;
    }
}
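
/* An illustrative decode (following the opc table in the comment
   above): LDR Xt, <label> carries opc 01 in instr[31,30] and V == 0
   in instr[26], so dispatch == 2 and we land in ldr_pcrel with imm
   holding the signed word offset from the PC.  */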

/* Immediate arithmetic
   The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
   value left shifted by 12 bits (done at decode).

   N.B. the register args (dest, source) can normally be Xn or SP.
   The exception occurs for flag setting instructions which may
   only use Xn for the output (dest).  */

/* 32 bit add immediate.  */
static void
add32 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
}

/* 64 bit add immediate.  */
static void
add64 (sim_cpu *cpu, uint32_t aimm)
{
  unsigned rn = INSTR (9, 5);
  unsigned rd = INSTR (4, 0);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  aarch64_set_reg_u64 (cpu, rd, SP_OK,
                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
}

static void
set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
{
  int32_t  result = value1 + value2;
  int64_t  sresult = (int64_t) value1 + (int64_t) value2;
  uint64_t uresult = (uint64_t)(uint32_t) value1
    + (uint64_t)(uint32_t) value2;
  uint32_t flags = 0;

  if (result == 0)
    flags |= Z;

  if (result & (1 << 31))
    flags |= N;

  /* Compare against the truncated unsigned result: comparing uresult
     directly against the signed result would sign-extend it and
     spuriously set C whenever the result is negative.  */
  if (uresult != (uint32_t) result)
    flags |= C;

  if (sresult != result)
    flags |= V;

  aarch64_set_CPSR (cpu, flags);
}
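
/* Hand-checked examples of the rules above (not an exhaustive proof):
   0x7fffffff + 1 gives result 0x80000000, so N and V are set but C is
   not, since uresult == 0x80000000 matches the truncated result;
   0xffffffff + 1 wraps to 0, so Z and C are set (uresult is
   0x100000000) while V is not.  */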
1661
1662 #define NEG(a) (((a) & signbit) == signbit)
1663 #define POS(a) (((a) & signbit) == 0)
1664
1665 static void
1666 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1667 {
1668 uint64_t result = value1 + value2;
1669 uint32_t flags = 0;
1670 uint64_t signbit = 1ULL << 63;
1671
1672 if (result == 0)
1673 flags |= Z;
1674
1675 if (NEG (result))
1676 flags |= N;
1677
1678 if ( (NEG (value1) && NEG (value2))
1679 || (NEG (value1) && POS (result))
1680 || (NEG (value2) && POS (result)))
1681 flags |= C;
1682
1683 if ( (NEG (value1) && NEG (value2) && POS (result))
1684 || (POS (value1) && POS (value2) && NEG (result)))
1685 flags |= V;
1686
1687 aarch64_set_CPSR (cpu, flags);
1688 }
1689
1690 static void
1691 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1692 {
1693 uint32_t result = value1 - value2;
1694 uint32_t flags = 0;
1695 uint32_t signbit = 1U << 31;
1696
1697 if (result == 0)
1698 flags |= Z;
1699
1700 if (NEG (result))
1701 flags |= N;
1702
1703 if ( (NEG (value1) && POS (value2))
1704 || (NEG (value1) && POS (result))
1705 || (POS (value2) && POS (result)))
1706 flags |= C;
1707
1708 if ( (NEG (value1) && POS (value2) && POS (result))
1709 || (POS (value1) && NEG (value2) && NEG (result)))
1710 flags |= V;
1711
1712 aarch64_set_CPSR (cpu, flags);
1713 }
1714
1715 static void
1716 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1717 {
1718 uint64_t result = value1 - value2;
1719 uint32_t flags = 0;
1720 uint64_t signbit = 1ULL << 63;
1721
1722 if (result == 0)
1723 flags |= Z;
1724
1725 if (NEG (result))
1726 flags |= N;
1727
1728 if ( (NEG (value1) && POS (value2))
1729 || (NEG (value1) && POS (result))
1730 || (POS (value2) && POS (result)))
1731 flags |= C;
1732
1733 if ( (NEG (value1) && POS (value2) && POS (result))
1734 || (POS (value1) && NEG (value2) && NEG (result)))
1735 flags |= V;
1736
1737 aarch64_set_CPSR (cpu, flags);
1738 }
1739
1740 static void
1741 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1742 {
1743 uint32_t flags = 0;
1744
1745 if (result == 0)
1746 flags |= Z;
1747 else
1748 flags &= ~ Z;
1749
1750 if (result & (1 << 31))
1751 flags |= N;
1752 else
1753 flags &= ~ N;
1754
1755 aarch64_set_CPSR (cpu, flags);
1756 }
1757
1758 static void
1759 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1760 {
1761 uint32_t flags = 0;
1762
1763 if (result == 0)
1764 flags |= Z;
1765 else
1766 flags &= ~ Z;
1767
1768 if (result & (1ULL << 63))
1769 flags |= N;
1770 else
1771 flags &= ~ N;
1772
1773 aarch64_set_CPSR (cpu, flags);
1774 }
1775
1776 /* 32 bit add immediate set flags. */
1777 static void
1778 adds32 (sim_cpu *cpu, uint32_t aimm)
1779 {
1780 unsigned rn = INSTR (9, 5);
1781 unsigned rd = INSTR (4, 0);
1782 /* TODO : do we need to worry about signs here? */
1783 int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1784
1785 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1786 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1787 set_flags_for_add32 (cpu, value1, aimm);
1788 }
1789
1790 /* 64 bit add immediate set flags. */
1791 static void
1792 adds64 (sim_cpu *cpu, uint32_t aimm)
1793 {
1794 unsigned rn = INSTR (9, 5);
1795 unsigned rd = INSTR (4, 0);
1796 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1797 uint64_t value2 = aimm;
1798
1799 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1800 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1801 set_flags_for_add64 (cpu, value1, value2);
1802 }
1803
1804 /* 32 bit sub immediate. */
1805 static void
1806 sub32 (sim_cpu *cpu, uint32_t aimm)
1807 {
1808 unsigned rn = INSTR (9, 5);
1809 unsigned rd = INSTR (4, 0);
1810
1811 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1812 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1813 aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1814 }
1815
1816 /* 64 bit sub immediate. */
1817 static void
1818 sub64 (sim_cpu *cpu, uint32_t aimm)
1819 {
1820 unsigned rn = INSTR (9, 5);
1821 unsigned rd = INSTR (4, 0);
1822
1823 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1824 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1825 aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1826 }
1827
1828 /* 32 bit sub immediate set flags. */
1829 static void
1830 subs32 (sim_cpu *cpu, uint32_t aimm)
1831 {
1832 unsigned rn = INSTR (9, 5);
1833 unsigned rd = INSTR (4, 0);
1834 uint32_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1835 uint32_t value2 = aimm;
1836
1837 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1838 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1839 set_flags_for_sub32 (cpu, value1, value2);
1840 }
1841
1842 /* 64 bit sub immediate set flags. */
1843 static void
1844 subs64 (sim_cpu *cpu, uint32_t aimm)
1845 {
1846 unsigned rn = INSTR (9, 5);
1847 unsigned rd = INSTR (4, 0);
1848 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1849 uint32_t value2 = aimm;
1850
1851 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1852 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1853 set_flags_for_sub64 (cpu, value1, value2);
1854 }
1855
1856 /* Data Processing Register. */
1857
1858 /* First two helpers to perform the shift operations. */
1859
1860 static inline uint32_t
1861 shifted32 (uint32_t value, Shift shift, uint32_t count)
1862 {
1863 switch (shift)
1864 {
1865 default:
1866 case LSL:
1867 return (value << count);
1868 case LSR:
1869 return (value >> count);
1870 case ASR:
1871 {
1872 int32_t svalue = value;
1873 return (svalue >> count);
1874 }
1875 case ROR:
1876 /* N.B. a rotate by zero must be handled specially, since the
1877 "value << (32 - count)" shift below would then be undefined
1878 behaviour in C.  */
1879 return (count == 0 ? value
1880 : ((value >> count) | (value << (32 - count))));
1881 }
1882 }
1883
1884 static inline uint64_t
1885 shifted64 (uint64_t value, Shift shift, uint32_t count)
1886 {
1887 switch (shift)
1888 {
1889 default:
1890 case LSL:
1891 return (value << count);
1892 case LSR:
1893 return (value >> count);
1894 case ASR:
1895 {
1896 int64_t svalue = value;
1897 return (svalue >> count);
1898 }
1899 case ROR:
1900 /* N.B. a rotate by zero must be handled specially, since the
1901 "value << (64 - count)" shift below would then be undefined
1902 behaviour in C.  */
1903 return (count == 0 ? value
1904 : ((value >> count) | (value << (64 - count))));
1905 }
1906 }
1907
1908 /* Arithmetic shifted register.
1909 These allow an optional LSL, ASR or LSR to the second source
1910 register with a count up to the register bit count.
1911
1912 N.B. register args may not be SP. */
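/* For example, "add w0, w1, w2, lsr #2" reaches add32_shift below
   with shift == LSR and count == 2, so the second operand becomes
   shifted32 (w2, LSR, 2), i.e. w2 / 4.  */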
1913
1914 /* 32 bit ADD shifted register. */
1915 static void
1916 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1917 {
1918 unsigned rm = INSTR (20, 16);
1919 unsigned rn = INSTR (9, 5);
1920 unsigned rd = INSTR (4, 0);
1921
1922 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1923 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1924 aarch64_get_reg_u32 (cpu, rn, NO_SP)
1925 + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1926 shift, count));
1927 }
1928
1929 /* 64 bit ADD shifted register. */
1930 static void
1931 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1932 {
1933 unsigned rm = INSTR (20, 16);
1934 unsigned rn = INSTR (9, 5);
1935 unsigned rd = INSTR (4, 0);
1936
1937 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1938 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1939 aarch64_get_reg_u64 (cpu, rn, NO_SP)
1940 + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1941 shift, count));
1942 }
1943
1944 /* 32 bit ADD shifted register setting flags. */
1945 static void
1946 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1947 {
1948 unsigned rm = INSTR (20, 16);
1949 unsigned rn = INSTR (9, 5);
1950 unsigned rd = INSTR (4, 0);
1951
1952 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1953 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1954 shift, count);
1955
1956 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1957 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1958 set_flags_for_add32 (cpu, value1, value2);
1959 }
1960
1961 /* 64 bit ADD shifted register setting flags. */
1962 static void
1963 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1964 {
1965 unsigned rm = INSTR (20, 16);
1966 unsigned rn = INSTR (9, 5);
1967 unsigned rd = INSTR (4, 0);
1968
1969 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1970 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1971 shift, count);
1972
1973 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1974 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1975 set_flags_for_add64 (cpu, value1, value2);
1976 }
1977
1978 /* 32 bit SUB shifted register. */
1979 static void
1980 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1981 {
1982 unsigned rm = INSTR (20, 16);
1983 unsigned rn = INSTR (9, 5);
1984 unsigned rd = INSTR (4, 0);
1985
1986 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1987 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1988 aarch64_get_reg_u32 (cpu, rn, NO_SP)
1989 - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1990 shift, count));
1991 }
1992
1993 /* 64 bit SUB shifted register. */
1994 static void
1995 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1996 {
1997 unsigned rm = INSTR (20, 16);
1998 unsigned rn = INSTR (9, 5);
1999 unsigned rd = INSTR (4, 0);
2000
2001 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2002 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2003 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2004 - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2005 shift, count));
2006 }
2007
2008 /* 32 bit SUB shifted register setting flags. */
2009 static void
2010 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2011 {
2012 unsigned rm = INSTR (20, 16);
2013 unsigned rn = INSTR (9, 5);
2014 unsigned rd = INSTR (4, 0);
2015
2016 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2017 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2018 shift, count);
2019
2020 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2021 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2022 set_flags_for_sub32 (cpu, value1, value2);
2023 }
2024
2025 /* 64 bit SUB shifted register setting flags. */
2026 static void
2027 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2028 {
2029 unsigned rm = INSTR (20, 16);
2030 unsigned rn = INSTR (9, 5);
2031 unsigned rd = INSTR (4, 0);
2032
2033 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2034 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2035 shift, count);
2036
2037 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2038 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2039 set_flags_for_sub64 (cpu, value1, value2);
2040 }
2041
2042 /* First a couple more helpers to fetch the
2043 relevant source register element either
2044 sign or zero extended as required by the
2045 extension value. */
2046
2047 static uint32_t
2048 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2049 {
2050 switch (extension)
2051 {
2052 case UXTB: return aarch64_get_reg_u8 (cpu, lo, NO_SP);
2053 case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2054 case UXTW: /* Fall through. */
2055 case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2056 case SXTB: return aarch64_get_reg_s8 (cpu, lo, NO_SP);
2057 case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2058 case SXTW: /* Fall through. */
2059 case SXTX: /* Fall through. */
2060 default: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2061 }
2062 }
2063
2064 static uint64_t
2065 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2066 {
2067 switch (extension)
2068 {
2069 case UXTB: return aarch64_get_reg_u8 (cpu, lo, NO_SP);
2070 case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2071 case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2072 case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2073 case SXTB: return aarch64_get_reg_s8 (cpu, lo, NO_SP);
2074 case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2075 case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2076 case SXTX:
2077 default: return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2078 }
2079 }
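/* For example, with extension == SXTB and the low byte of Xm holding
   0x80, extreg64 returns 0xffffffffffffff80, which the ADD/SUB
   routines below then treat as -128.  */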
2080
2081 /* Arithmetic extending register
2082 These allow an optional sign extension of some portion of the
2083 second source register followed by an optional left shift of
2084 between 0 and 4 bits.
2085
2086 N.B. output (dest) and first input arg (source) may normally be Xn
2087 or SP. However, for flag setting operations dest can only be
2088 Xn. Second input registers are always Xn. */
2089
2090 /* 32 bit ADD extending register. */
2091 static void
2092 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2093 {
2094 unsigned rm = INSTR (20, 16);
2095 unsigned rn = INSTR (9, 5);
2096 unsigned rd = INSTR (4, 0);
2097
2098 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2099 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2100 aarch64_get_reg_u32 (cpu, rn, SP_OK)
2101 + (extreg32 (cpu, rm, extension) << shift));
2102 }
2103
2104 /* 64 bit ADD extending register.
2105 N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2106 static void
2107 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2108 {
2109 unsigned rm = INSTR (20, 16);
2110 unsigned rn = INSTR (9, 5);
2111 unsigned rd = INSTR (4, 0);
2112
2113 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2114 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2115 aarch64_get_reg_u64 (cpu, rn, SP_OK)
2116 + (extreg64 (cpu, rm, extension) << shift));
2117 }
2118
2119 /* 32 bit ADD extending register setting flags. */
2120 static void
2121 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2122 {
2123 unsigned rm = INSTR (20, 16);
2124 unsigned rn = INSTR (9, 5);
2125 unsigned rd = INSTR (4, 0);
2126
2127 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2128 uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2129
2130 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2131 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2132 set_flags_for_add32 (cpu, value1, value2);
2133 }
2134
2135 /* 64 bit ADD extending register setting flags */
2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2137 static void
2138 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2139 {
2140 unsigned rm = INSTR (20, 16);
2141 unsigned rn = INSTR (9, 5);
2142 unsigned rd = INSTR (4, 0);
2143
2144 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2145 uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2146
2147 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2148 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2149 set_flags_for_add64 (cpu, value1, value2);
2150 }
2151
2152 /* 32 bit SUB extending register. */
2153 static void
2154 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2155 {
2156 unsigned rm = INSTR (20, 16);
2157 unsigned rn = INSTR (9, 5);
2158 unsigned rd = INSTR (4, 0);
2159
2160 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2161 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2162 aarch64_get_reg_u32 (cpu, rn, SP_OK)
2163 - (extreg32 (cpu, rm, extension) << shift));
2164 }
2165
2166 /* 64 bit SUB extending register. */
2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2168 static void
2169 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2170 {
2171 unsigned rm = INSTR (20, 16);
2172 unsigned rn = INSTR (9, 5);
2173 unsigned rd = INSTR (4, 0);
2174
2175 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2176 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2177 aarch64_get_reg_u64 (cpu, rn, SP_OK)
2178 - (extreg64 (cpu, rm, extension) << shift));
2179 }
2180
2181 /* 32 bit SUB extending register setting flags. */
2182 static void
2183 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2184 {
2185 unsigned rm = INSTR (20, 16);
2186 unsigned rn = INSTR (9, 5);
2187 unsigned rd = INSTR (4, 0);
2188
2189 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2190 uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2191
2192 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2193 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2194 set_flags_for_sub32 (cpu, value1, value2);
2195 }
2196
2197 /* 64 bit SUB extending register setting flags */
2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2199 static void
2200 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2201 {
2202 unsigned rm = INSTR (20, 16);
2203 unsigned rn = INSTR (9, 5);
2204 unsigned rd = INSTR (4, 0);
2205
2206 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2207 uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2208
2209 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2210 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2211 set_flags_for_sub64 (cpu, value1, value2);
2212 }
2213
2214 static void
2215 dexAddSubtractImmediate (sim_cpu *cpu)
2216 {
2217 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2218 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2219 instr[29] = set : 0 ==> no flags, 1 ==> set flags
2220 instr[28,24] = 10001
2221 instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2222 instr[21,10] = uimm12
2223 instr[9,5] = Rn
2224 instr[4,0] = Rd */
2225
2226 /* N.B. the shift is applied at decode before calling the add/sub routine. */
2227 uint32_t shift = INSTR (23, 22);
2228 uint32_t imm = INSTR (21, 10);
2229 uint32_t dispatch = INSTR (31, 29);
2230
2231 NYI_assert (28, 24, 0x11);
2232
2233 if (shift > 1)
2234 HALT_UNALLOC;
2235
2236 if (shift)
2237 imm <<= 12;
2238
2239 switch (dispatch)
2240 {
2241 case 0: add32 (cpu, imm); break;
2242 case 1: adds32 (cpu, imm); break;
2243 case 2: sub32 (cpu, imm); break;
2244 case 3: subs32 (cpu, imm); break;
2245 case 4: add64 (cpu, imm); break;
2246 case 5: adds64 (cpu, imm); break;
2247 case 6: sub64 (cpu, imm); break;
2248 case 7: subs64 (cpu, imm); break;
2249 }
2250 }
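/* For example, "adds x0, x1, #1, lsl #12" has dispatch == 5 and
   shift == 1, so adds64 is called with aimm == 0x1000.  */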
2251
2252 static void
2253 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2254 {
2255 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2256 instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2257 instr[28,24] = 01011
2258 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2259 instr[21] = 0
2260 instr[20,16] = Rm
2261 instr[15,10] = count : must be 0xxxxx for 32 bit
2262 instr[9,5] = Rn
2263 instr[4,0] = Rd */
2264
2265 uint32_t size = INSTR (31, 31);
2266 uint32_t count = INSTR (15, 10);
2267 Shift shiftType = INSTR (23, 22);
2268
2269 NYI_assert (28, 24, 0x0B);
2270 NYI_assert (21, 21, 0);
2271
2272 /* Shift encoded as ROR is unallocated. */
2273 if (shiftType == ROR)
2274 HALT_UNALLOC;
2275
2276 /* 32 bit operations must have count[5] = 0
2277 or else we have an UNALLOC. */
2278 if (size == 0 && uimm (count, 5, 5))
2279 HALT_UNALLOC;
2280
2281 /* Dispatch on size:op, i.e. instr[31,29]. */
2282 switch (INSTR (31, 29))
2283 {
2284 case 0: add32_shift (cpu, shiftType, count); break;
2285 case 1: adds32_shift (cpu, shiftType, count); break;
2286 case 2: sub32_shift (cpu, shiftType, count); break;
2287 case 3: subs32_shift (cpu, shiftType, count); break;
2288 case 4: add64_shift (cpu, shiftType, count); break;
2289 case 5: adds64_shift (cpu, shiftType, count); break;
2290 case 6: sub64_shift (cpu, shiftType, count); break;
2291 case 7: subs64_shift (cpu, shiftType, count); break;
2292 }
2293 }
2294
2295 static void
2296 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2297 {
2298 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2299 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2300 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2301 instr[28,24] = 01011
2302 instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2303 instr[21] = 1
2304 instr[20,16] = Rm
2305 instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2306 010 ==> UXTW|LSL, 011 ==> UXTX,
2307 100 ==> SXTB, 101 ==> SXTH,
2308 110 ==> SXTW, 111 ==> SXTX,
2309 instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2310 instr[9,5] = Rn
2311 instr[4,0] = Rd */
2312
2313 Extension extensionType = INSTR (15, 13);
2314 uint32_t shift = INSTR (12, 10);
2315
2316 NYI_assert (28, 24, 0x0B);
2317 NYI_assert (21, 21, 1);
2318
2319 /* Shift may not exceed 4. */
2320 if (shift > 4)
2321 HALT_UNALLOC;
2322
2323 /* Dispatch on size:op:set?. */
2324 switch (INSTR (31, 29))
2325 {
2326 case 0: add32_ext (cpu, extensionType, shift); break;
2327 case 1: adds32_ext (cpu, extensionType, shift); break;
2328 case 2: sub32_ext (cpu, extensionType, shift); break;
2329 case 3: subs32_ext (cpu, extensionType, shift); break;
2330 case 4: add64_ext (cpu, extensionType, shift); break;
2331 case 5: adds64_ext (cpu, extensionType, shift); break;
2332 case 6: sub64_ext (cpu, extensionType, shift); break;
2333 case 7: subs64_ext (cpu, extensionType, shift); break;
2334 }
2335 }
2336
2337 /* Conditional data processing
2338 Condition register is implicit 3rd source. */
2339
2340 /* 32 bit add with carry. */
2341 /* N.B. register args may not be SP. */
2342
2343 static void
2344 adc32 (sim_cpu *cpu)
2345 {
2346 unsigned rm = INSTR (20, 16);
2347 unsigned rn = INSTR (9, 5);
2348 unsigned rd = INSTR (4, 0);
2349
2350 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2351 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2352 aarch64_get_reg_u32 (cpu, rn, NO_SP)
2353 + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2354 + IS_SET (C));
2355 }
2356
2357 /* 64 bit add with carry */
2358 static void
2359 adc64 (sim_cpu *cpu)
2360 {
2361 unsigned rm = INSTR (20, 16);
2362 unsigned rn = INSTR (9, 5);
2363 unsigned rd = INSTR (4, 0);
2364
2365 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2366 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2367 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2368 + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2369 + IS_SET (C));
2370 }
2371
2372 /* 32 bit add with carry setting flags. */
2373 static void
2374 adcs32 (sim_cpu *cpu)
2375 {
2376 unsigned rm = INSTR (20, 16);
2377 unsigned rn = INSTR (9, 5);
2378 unsigned rd = INSTR (4, 0);
2379
2380 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2381 uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2382 uint32_t carry = IS_SET (C);
2383
2384 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2385 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2386 set_flags_for_add32 (cpu, value1, value2 + carry);
2387 }
2388
2389 /* 64 bit add with carry setting flags. */
2390 static void
2391 adcs64 (sim_cpu *cpu)
2392 {
2393 unsigned rm = INSTR (20, 16);
2394 unsigned rn = INSTR (9, 5);
2395 unsigned rd = INSTR (4, 0);
2396
2397 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2398 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2399 uint64_t carry = IS_SET (C);
2400
2401 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2402 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2403 set_flags_for_add64 (cpu, value1, value2 + carry);
2404 }
2405
2406 /* 32 bit sub with carry. */
2407 static void
2408 sbc32 (sim_cpu *cpu)
2409 {
2410 unsigned rm = INSTR (20, 16);
2411 unsigned rn = INSTR (9, 5); /* ngc iff rn == 31. */
2412 unsigned rd = INSTR (4, 0);
2413
2414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2415 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2416 aarch64_get_reg_u32 (cpu, rn, NO_SP)
2417 - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2418 - 1 + IS_SET (C));
2419 }
2420
2421 /* 64 bit sub with carry */
2422 static void
2423 sbc64 (sim_cpu *cpu)
2424 {
2425 unsigned rm = INSTR (20, 16);
2426 unsigned rn = INSTR (9, 5);
2427 unsigned rd = INSTR (4, 0);
2428
2429 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2430 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2431 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2432 - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2433 - 1 + IS_SET (C));
2434 }
2435
2436 /* 32 bit sub with carry setting flags */
2437 static void
2438 sbcs32 (sim_cpu *cpu)
2439 {
2440 unsigned rm = INSTR (20, 16);
2441 unsigned rn = INSTR (9, 5);
2442 unsigned rd = INSTR (4, 0);
2443
2444 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2445 uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2446 uint32_t carry = IS_SET (C);
2447 uint32_t result = value1 - value2 - 1 + carry;
2448
2449 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2450 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2451 set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2452 }
2453
2454 /* 64 bit sub with carry setting flags */
2455 static void
2456 sbcs64 (sim_cpu *cpu)
2457 {
2458 unsigned rm = INSTR (20, 16);
2459 unsigned rn = INSTR (9, 5);
2460 unsigned rd = INSTR (4, 0);
2461
2462 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2463 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2464 uint64_t carry = IS_SET (C);
2465 uint64_t result = value1 - value2 - 1 + carry;
2466
2467 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2468 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2469 set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2470 }
2471
2472 static void
2473 dexAddSubtractWithCarry (sim_cpu *cpu)
2474 {
2475 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2476 instr[30] = op : 0 ==> ADC, 1 ==> SBC
2477 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2478 instr[28,21] = 1 1010 000
2479 instr[20,16] = Rm
2480 instr[15,10] = op2 : 000000 ==> ok, otherwise ==> UNALLOC
2481 instr[9,5] = Rn
2482 instr[4,0] = Rd */
2483
2484 uint32_t op2 = INSTR (15, 10);
2485
2486 NYI_assert (28, 21, 0xD0);
2487
2488 if (op2 != 0)
2489 HALT_UNALLOC;
2490
2491 /* Dispatch on size:op:set?. */
2492 switch (INSTR (31, 29))
2493 {
2494 case 0: adc32 (cpu); break;
2495 case 1: adcs32 (cpu); break;
2496 case 2: sbc32 (cpu); break;
2497 case 3: sbcs32 (cpu); break;
2498 case 4: adc64 (cpu); break;
2499 case 5: adcs64 (cpu); break;
2500 case 6: sbc64 (cpu); break;
2501 case 7: sbcs64 (cpu); break;
2502 }
2503 }
2504
2505 static uint32_t
2506 testConditionCode (sim_cpu *cpu, CondCode cc)
2507 {
2508 /* This should be reducible to branchless logic
2509 by some careful testing of bits in CC followed
2510 by the requisite masking and combining of bits
2511 from the flag register.
2512
2513 For now we do it with a switch. */
2514 int res;
2515
2516 switch (cc)
2517 {
2518 case EQ: res = IS_SET (Z); break;
2519 case NE: res = IS_CLEAR (Z); break;
2520 case CS: res = IS_SET (C); break;
2521 case CC: res = IS_CLEAR (C); break;
2522 case MI: res = IS_SET (N); break;
2523 case PL: res = IS_CLEAR (N); break;
2524 case VS: res = IS_SET (V); break;
2525 case VC: res = IS_CLEAR (V); break;
2526 case HI: res = IS_SET (C) && IS_CLEAR (Z); break;
2527 case LS: res = IS_CLEAR (C) || IS_SET (Z); break;
2528 case GE: res = IS_SET (N) == IS_SET (V); break;
2529 case LT: res = IS_SET (N) != IS_SET (V); break;
2530 case GT: res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V)); break;
2531 case LE: res = IS_SET (Z) || (IS_SET (N) != IS_SET (V)); break;
2532 case AL:
2533 case NV:
2534 default:
2535 res = 1;
2536 break;
2537 }
2538 return res;
2539 }
2540
2541 static void
2542 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn */
2543 {
2544 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2545 instr[30] = compare with positive (1) or negative value (0)
2546 instr[29,21] = 1 1101 0010
2547 instr[20,16] = Rm or const
2548 instr[15,12] = cond
2549 instr[11] = compare reg (0) or const (1)
2550 instr[10] = 0
2551 instr[9,5] = Rn
2552 instr[4] = 0
2553 instr[3,0] = value for CPSR bits if the comparison does not take place. */
2554 signed int negate;
2555 unsigned rm;
2556 unsigned rn;
2557
2558 NYI_assert (29, 21, 0x1d2);
2559 NYI_assert (10, 10, 0);
2560 NYI_assert (4, 4, 0);
2561
2562 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2563 if (! testConditionCode (cpu, INSTR (15, 12)))
2564 {
2565 aarch64_set_CPSR (cpu, INSTR (3, 0));
2566 return;
2567 }
2568
2569 negate = INSTR (30, 30) ? 1 : -1;
2570 rm = INSTR (20, 16);
2571 rn = INSTR ( 9, 5);
2572
2573 if (INSTR (31, 31))
2574 {
2575 if (INSTR (11, 11))
2576 set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2577 negate * (uint64_t) rm);
2578 else
2579 set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2580 negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2581 }
2582 else
2583 {
2584 if (INSTR (11, 11))
2585 set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2586 negate * rm);
2587 else
2588 set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2589 negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2590 }
2591 }
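/* For example, "ccmp x1, #4, #0, ne": if NE holds, the flags are set
   as for "cmp x1, #4" (here rm holds the constant 4); otherwise NZCV
   is simply loaded with the immediate 0.  */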
2592
2593 static void
2594 do_vec_MOV_whole_vector (sim_cpu *cpu)
2595 {
2596 /* MOV Vd.T, Vs.T (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2597
2598 instr[31] = 0
2599 instr[30] = half(0)/full(1)
2600 instr[29,21] = 001110101
2601 instr[20,16] = Vs
2602 instr[15,10] = 000111
2603 instr[9,5] = Vs
2604 instr[4,0] = Vd */
2605
2606 unsigned vs = INSTR (9, 5);
2607 unsigned vd = INSTR (4, 0);
2608
2609 NYI_assert (29, 21, 0x075);
2610 NYI_assert (15, 10, 0x07);
2611
2612 if (INSTR (20, 16) != vs)
2613 HALT_NYI;
2614
2615 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2616 if (INSTR (30, 30))
2617 aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2618
2619 aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2620 }
2621
2622 static void
2623 do_vec_MOV_into_scalar (sim_cpu *cpu)
2624 {
2625 /* instr[31] = 0
2626 instr[30] = word(0)/long(1)
2627 instr[29,21] = 00 1110 000
2628 instr[20,18] = element size and index
2629 instr[17,10] = 00 0011 11
2630 instr[9,5] = V source
2631 instr[4,0] = R dest */
2632
2633 unsigned vs = INSTR (9, 5);
2634 unsigned rd = INSTR (4, 0);
2635
2636 NYI_assert (29, 21, 0x070);
2637 NYI_assert (17, 10, 0x0F);
2638
2639 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2640 switch (INSTR (20, 18))
2641 {
2642 case 0x2:
2643 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0));
2644 break;
2645
2646 case 0x6:
2647 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1));
2648 break;
2649
2650 case 0x1:
2651 case 0x3:
2652 case 0x5:
2653 case 0x7:
2654 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32
2655 (cpu, vs, INSTR (20, 19)));
2656 break;
2657
2658 default:
2659 HALT_NYI;
2660 }
2661 }
2662
2663 static void
2664 do_vec_INS (sim_cpu *cpu)
2665 {
2666 /* instr[31,21] = 01001110000
2667 instr[20,16] = element size and index
2668 instr[15,10] = 000111
2669 instr[9,5] = W source
2670 instr[4,0] = V dest */
2671
2672 int index;
2673 unsigned rs = INSTR (9, 5);
2674 unsigned vd = INSTR (4, 0);
2675
2676 NYI_assert (31, 21, 0x270);
2677 NYI_assert (15, 10, 0x07);
2678
2679 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2680 if (INSTR (16, 16))
2681 {
2682 index = INSTR (20, 17);
2683 aarch64_set_vec_u8 (cpu, vd, index,
2684 aarch64_get_reg_u8 (cpu, rs, NO_SP));
2685 }
2686 else if (INSTR (17, 17))
2687 {
2688 index = INSTR (20, 18);
2689 aarch64_set_vec_u16 (cpu, vd, index,
2690 aarch64_get_reg_u16 (cpu, rs, NO_SP));
2691 }
2692 else if (INSTR (18, 18))
2693 {
2694 index = INSTR (20, 19);
2695 aarch64_set_vec_u32 (cpu, vd, index,
2696 aarch64_get_reg_u32 (cpu, rs, NO_SP));
2697 }
2698 else if (INSTR (19, 19))
2699 {
2700 index = INSTR (20, 20);
2701 aarch64_set_vec_u64 (cpu, vd, index,
2702 aarch64_get_reg_u64 (cpu, rs, NO_SP));
2703 }
2704 else
2705 HALT_NYI;
2706 }
2707
2708 static void
2709 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2710 {
2711 /* instr[31] = 0
2712 instr[30] = half(0)/full(1)
2713 instr[29,21] = 00 1110 000
2714 instr[20,16] = element size and index
2715 instr[15,10] = 0000 01
2716 instr[9,5] = V source
2717 instr[4,0] = V dest. */
2718
2719 unsigned full = INSTR (30, 30);
2720 unsigned vs = INSTR (9, 5);
2721 unsigned vd = INSTR (4, 0);
2722 int i, index;
2723
2724 NYI_assert (29, 21, 0x070);
2725 NYI_assert (15, 10, 0x01);
2726
2727 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2728 if (INSTR (16, 16))
2729 {
2730 index = INSTR (20, 17);
2731
2732 for (i = 0; i < (full ? 16 : 8); i++)
2733 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2734 }
2735 else if (INSTR (17, 17))
2736 {
2737 index = INSTR (20, 18);
2738
2739 for (i = 0; i < (full ? 8 : 4); i++)
2740 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2741 }
2742 else if (INSTR (18, 18))
2743 {
2744 index = INSTR (20, 19);
2745
2746 for (i = 0; i < (full ? 4 : 2); i++)
2747 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2748 }
2749 else
2750 {
2751 if (INSTR (19, 19) == 0)
2752 HALT_UNALLOC;
2753
2754 if (! full)
2755 HALT_UNALLOC;
2756
2757 index = INSTR (20, 20);
2758
2759 for (i = 0; i < 2; i++)
2760 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2761 }
2762 }
2763
2764 static void
2765 do_vec_TBL (sim_cpu *cpu)
2766 {
2767 /* instr[31] = 0
2768 instr[30] = half(0)/full(1)
2769 instr[29,21] = 00 1110 000
2770 instr[20,16] = Vm
2771 instr[15] = 0
2772 instr[14,13] = vec length
2773 instr[12,10] = 000
2774 instr[9,5] = V start
2775 instr[4,0] = V dest */
2776
2777 int full = INSTR (30, 30);
2778 int len = INSTR (14, 13) + 1;
2779 unsigned vm = INSTR (20, 16);
2780 unsigned vn = INSTR (9, 5);
2781 unsigned vd = INSTR (4, 0);
2782 unsigned i;
2783
2784 NYI_assert (29, 21, 0x070);
2785 NYI_assert (12, 10, 0);
2786
2787 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2788 for (i = 0; i < (full ? 16 : 8); i++)
2789 {
2790 unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2791 uint8_t val;
2792
2793 if (selector < 16)
2794 val = aarch64_get_vec_u8 (cpu, vn, selector);
2795 else if (selector < 32)
2796 val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2797 else if (selector < 48)
2798 val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2799 else if (selector < 64)
2800 val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2801 else
2802 val = 0;
2803
2804 aarch64_set_vec_u8 (cpu, vd, i, val);
2805 }
2806 }
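/* For example, with len == 2 a selector byte of 19 (0x13) fetches
   byte 3 of register vn + 1, while any selector byte >= 32 yields
   zero in the corresponding destination element.  */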
2807
2808 static void
2809 do_vec_TRN (sim_cpu *cpu)
2810 {
2811 /* instr[31] = 0
2812 instr[30] = half(0)/full(1)
2813 instr[29,24] = 00 1110
2814 instr[23,22] = size
2815 instr[21] = 0
2816 instr[20,16] = Vm
2817 instr[15] = 0
2818 instr[14] = TRN1 (0) / TRN2 (1)
2819 instr[13,10] = 1010
2820 instr[9,5] = V source
2821 instr[4,0] = V dest. */
2822
2823 int full = INSTR (30, 30);
2824 int second = INSTR (14, 14);
2825 unsigned vm = INSTR (20, 16);
2826 unsigned vn = INSTR (9, 5);
2827 unsigned vd = INSTR (4, 0);
2828 unsigned i;
2829
2830 NYI_assert (29, 24, 0x0E);
2831 NYI_assert (13, 10, 0xA);
2832
2833 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2834 switch (INSTR (23, 22))
2835 {
2836 case 0:
2837 for (i = 0; i < (full ? 8 : 4); i++)
2838 {
2839 aarch64_set_vec_u8
2840 (cpu, vd, i * 2,
2841 aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
2842 aarch64_set_vec_u8
2843 (cpu, vd, i * 2 + 1,
2844 aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
2845 }
2846 break;
2847
2848 case 1:
2849 for (i = 0; i < (full ? 4 : 2); i++)
2850 {
2851 aarch64_set_vec_u16
2852 (cpu, vd, i * 2,
2853 aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
2854 aarch64_set_vec_u16
2855 (cpu, vd, i * 2 + 1,
2856 aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
2857 }
2858 break;
2859
2860 case 2:
2861 aarch64_set_vec_u32
2862 (cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
2863 aarch64_set_vec_u32
2864 (cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
2865 aarch64_set_vec_u32
2866 (cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
2867 aarch64_set_vec_u32
2868 (cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
2869 break;
2870
2871 case 3:
2872 if (! full)
2873 HALT_UNALLOC;
2874
2875 aarch64_set_vec_u64 (cpu, vd, 0,
2876 aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
2877 aarch64_set_vec_u64 (cpu, vd, 1,
2878 aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
2879 break;
2880 }
2881 }
2882
2883 static void
2884 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2885 {
2886 /* instr[31] = 0
2887 instr[30] = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2888 [must be 1 for 64-bit xfer]
2889 instr[29,20] = 00 1110 0000
2890 instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2891 0100=> 32-bits. 1000=>64-bits
2892 instr[15,10] = 0000 11
2893 instr[9,5] = W source
2894 instr[4,0] = V dest. */
2895
2896 unsigned i;
2897 unsigned Vd = INSTR (4, 0);
2898 unsigned Rs = INSTR (9, 5);
2899 int both = INSTR (30, 30);
2900
2901 NYI_assert (29, 20, 0x0E0);
2902 NYI_assert (15, 10, 0x03);
2903
2904 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2905 switch (INSTR (19, 16))
2906 {
2907 case 1:
2908 for (i = 0; i < (both ? 16 : 8); i++)
2909 aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
2910 break;
2911
2912 case 2:
2913 for (i = 0; i < (both ? 8 : 4); i++)
2914 aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
2915 break;
2916
2917 case 4:
2918 for (i = 0; i < (both ? 4 : 2); i++)
2919 aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
2920 break;
2921
2922 case 8:
2923 if (!both)
2924 HALT_NYI;
2925 aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2926 aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2927 break;
2928
2929 default:
2930 HALT_NYI;
2931 }
2932 }
2933
2934 static void
2935 do_vec_UZP (sim_cpu *cpu)
2936 {
2937 /* instr[31] = 0
2938 instr[30] = half(0)/full(1)
2939 instr[29,24] = 00 1110
2940 instr[23,22] = size: byte(00), half(01), word (10), long (11)
2941 instr[21] = 0
2942 instr[20,16] = Vm
2943 instr[15] = 0
2944 instr[14] = lower (0) / upper (1)
2945 instr[13,10] = 0110
2946 instr[9,5] = Vn
2947 instr[4,0] = Vd. */
2948
2949 int full = INSTR (30, 30);
2950 int upper = INSTR (14, 14);
2951
2952 unsigned vm = INSTR (20, 16);
2953 unsigned vn = INSTR (9, 5);
2954 unsigned vd = INSTR (4, 0);
2955
2956 uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
2957 uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
2958 uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
2959 uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
2960
2961 uint64_t val1 = 0;
2962 uint64_t val2 = 0;
2963
2964 uint64_t input2 = full ? val_n2 : val_m1;
2965
2966 NYI_assert (29, 24, 0x0E);
2967 NYI_assert (21, 21, 0);
2968 NYI_assert (15, 15, 0);
2969 NYI_assert (13, 10, 6);
2970
2971 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2972 switch (INSTR (23, 22))
2973 {
2974 case 0:
2975 val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
2976 val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
2977 val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
2978 val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
2979
2980 val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
2981 val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
2982 val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
2983 val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
2984
2985 if (full)
2986 {
2987 val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
2988 val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
2989 val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
2990 val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
2991
2992 val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
2993 val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
2994 val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
2995 val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
2996 }
2997 break;
2998
2999 case 1:
3000 val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
3001 val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3002
3003 val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3004 val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3005
3006 if (full)
3007 {
3008 val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
3009 val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3010
3011 val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3012 val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3013 }
3014 break;
3015
3016 case 2:
3017 val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
3018 val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3019
3020 if (full)
3021 {
3022 val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
3023 val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3024 }
3025 break;
3026
3027 case 3:
3028 if (! full)
3029 HALT_UNALLOC;
3030
3031 val1 = upper ? val_n2 : val_n1;
3032 val2 = upper ? val_m2 : val_m1;
3033 break;
3034 }
3035
3036 aarch64_set_vec_u64 (cpu, vd, 0, val1);
3037 if (full)
3038 aarch64_set_vec_u64 (cpu, vd, 1, val2);
3039 }
3040
3041 static void
3042 do_vec_ZIP (sim_cpu *cpu)
3043 {
3044 /* instr[31] = 0
3045 instr[30] = half(0)/full(1)
3046 instr[29,24] = 00 1110
3047 instr[23,22] = size: byte(00), half(01), word (10), long (11)
3048 instr[21] = 0
3049 instr[20,16] = Vm
3050 instr[15] = 0
3051 instr[14] = lower (0) / upper (1)
3052 instr[13,10] = 1110
3053 instr[9,5] = Vn
3054 instr[4,0] = Vd. */
3055
3056 int full = INSTR (30, 30);
3057 int upper = INSTR (14, 14);
3058
3059 unsigned vm = INSTR (20, 16);
3060 unsigned vn = INSTR (9, 5);
3061 unsigned vd = INSTR (4, 0);
3062
3063 uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3064 uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3065 uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3066 uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3067
3068 uint64_t val1 = 0;
3069 uint64_t val2 = 0;
3070
3071 uint64_t input1 = upper ? val_n1 : val_m1;
3072 uint64_t input2 = upper ? val_n2 : val_m2;
3073
3074 NYI_assert (29, 24, 0x0E);
3075 NYI_assert (21, 21, 0);
3076 NYI_assert (15, 15, 0);
3077 NYI_assert (13, 10, 0xE);
3078
3079 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3080 switch (INSTR (23, 22))
3081 {
3082 case 0:
3083 val1 =
3084 ((input1 << 0) & (0xFF << 0))
3085 | ((input2 << 8) & (0xFF << 8))
3086 | ((input1 << 8) & (0xFF << 16))
3087 | ((input2 << 16) & (0xFFULL << 24))
3088 | ((input1 << 16) & (0xFFULL << 32))
3089 | ((input2 << 24) & (0xFFULL << 40))
3090 | ((input1 << 24) & (0xFFULL << 48))
3091 | ((input2 << 32) & (0xFFULL << 56));
3092
3093 val2 =
3094 ((input1 >> 32) & (0xFF << 0))
3095 | ((input2 >> 24) & (0xFF << 8))
3096 | ((input1 >> 24) & (0xFF << 16))
3097 | ((input2 >> 16) & (0xFFULL << 24))
3098 | ((input1 >> 16) & (0xFFULL << 32))
3099 | ((input2 >> 8) & (0xFFULL << 40))
3100 | ((input1 >> 8) & (0xFFULL << 48))
3101 | ((input2 >> 0) & (0xFFULL << 56));
3102 break;
3103
3104 case 1:
3105 val1 =
3106 ((input1 << 0) & (0xFFFF << 0))
3107 | ((input2 << 16) & (0xFFFFULL << 16))
3108 | ((input1 << 16) & (0xFFFFULL << 32))
3109 | ((input2 << 32) & (0xFFFFULL << 48));
3110
3111 val2 =
3112 ((input1 >> 32) & (0xFFFF << 0))
3113 | ((input2 >> 16) & (0xFFFFULL << 16))
3114 | ((input1 >> 16) & (0xFFFFULL << 32))
3115 | ((input2 >> 0) & (0xFFFFULL << 48));
3116 break;
3117
3118 case 2:
3119 val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3120 val2 = (input2 & 0xFFFFFFFFULL) | (input1 << 32);
3121 break;
3122
3123 case 3:
3124 val1 = input1;
3125 val2 = input2;
3126 break;
3127 }
3128
3129 aarch64_set_vec_u64 (cpu, vd, 0, val1);
3130 if (full)
3131 aarch64_set_vec_u64 (cpu, vd, 1, val2);
3132 }
3133
3134 /* Floating point immediates are encoded in 8 bits.
3135 fpimm[7] = sign bit.
3136 fpimm[6:4] = signed exponent.
3137 fpimm[3:0] = fraction (assuming leading 1).
3138 i.e. F = s * 1.f * 2^(e - b). */
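/* For example, imm8 == 0x00 (s = 0, e = 0, f = 0) encodes 2.0 and
   imm8 == 0x70 encodes 1.0, matching the expansions computed by the
   two helpers below.  */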
3139
3140 static float
3141 fp_immediate_for_encoding_32 (uint32_t imm8)
3142 {
3143 float u;
3144 uint32_t s, e, f, i;
3145
3146 s = (imm8 >> 7) & 0x1;
3147 e = (imm8 >> 4) & 0x7;
3148 f = imm8 & 0xf;
3149
3150 /* The fp value is s * n/16 * 2^r where n is 16+f. */
3151 u = (16.0 + f) / 16.0;
3152
3153 /* N.B. exponent is signed. */
3154 if (e < 4)
3155 {
3156 int epos = e;
3157
3158 for (i = 0; i <= epos; i++)
3159 u *= 2.0;
3160 }
3161 else
3162 {
3163 int eneg = 7 - e;
3164
3165 for (i = 0; i < eneg; i++)
3166 u /= 2.0;
3167 }
3168
3169 if (s)
3170 u = - u;
3171
3172 return u;
3173 }
3174
3175 static double
3176 fp_immediate_for_encoding_64 (uint32_t imm8)
3177 {
3178 double u;
3179 uint32_t s, e, f, i;
3180
3181 s = (imm8 >> 7) & 0x1;
3182 e = (imm8 >> 4) & 0x7;
3183 f = imm8 & 0xf;
3184
3185 /* The fp value is s * n/16 * 2^r where n is 16+f. */
3186 u = (16.0 + f) / 16.0;
3187
3188 /* N.B. exponent is signed. */
3189 if (e < 4)
3190 {
3191 int epos = e;
3192
3193 for (i = 0; i <= epos; i++)
3194 u *= 2.0;
3195 }
3196 else
3197 {
3198 int eneg = 7 - e;
3199
3200 for (i = 0; i < eneg; i++)
3201 u /= 2.0;
3202 }
3203
3204 if (s)
3205 u = - u;
3206
3207 return u;
3208 }
3209
3210 static void
3211 do_vec_MOV_immediate (sim_cpu *cpu)
3212 {
3213 /* instr[31] = 0
3214 instr[30] = full/half selector
3215 instr[29,19] = 00111100000
3216 instr[18,16] = high 3 bits of uimm8
3217 instr[15,12] = size & shift:
3218 0000 => 32-bit
3219 0010 => 32-bit + LSL#8
3220 0100 => 32-bit + LSL#16
3221 0110 => 32-bit + LSL#24
3222 1010 => 16-bit + LSL#8
3223 1000 => 16-bit
3224 1101 => 32-bit + MSL#16
3225 1100 => 32-bit + MSL#8
3226 1110 => 8-bit
3227 1111 => double
3228 instr[11,10] = 01
3229 instr[9,5] = low 5-bits of uimm8
3230 instr[4,0] = Vd. */
3231
3232 int full = INSTR (30, 30);
3233 unsigned vd = INSTR (4, 0);
3234 unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3235 unsigned i;
3236
3237 NYI_assert (29, 19, 0x1E0);
3238 NYI_assert (11, 10, 1);
3239
3240 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3241 switch (INSTR (15, 12))
3242 {
3243 case 0x0: /* 32-bit, no shift. */
3244 case 0x2: /* 32-bit, shift by 8. */
3245 case 0x4: /* 32-bit, shift by 16. */
3246 case 0x6: /* 32-bit, shift by 24. */
3247 val <<= (8 * INSTR (14, 13));
3248 for (i = 0; i < (full ? 4 : 2); i++)
3249 aarch64_set_vec_u32 (cpu, vd, i, val);
3250 break;
3251
3252 case 0xa: /* 16-bit, shift by 8. */
3253 val <<= 8;
3254 /* Fall through. */
3255 case 0x8: /* 16-bit, no shift. */
3256 for (i = 0; i < (full ? 8 : 4); i++)
3257 aarch64_set_vec_u16 (cpu, vd, i, val);
3258 break;
3259
3260 case 0xd: /* 32-bit, mask shift by 16. */
3261 val <<= 8;
3262 val |= 0xFF;
3263 /* Fall through. */
3264 case 0xc: /* 32-bit, mask shift by 8. */
3265 val <<= 8;
3266 val |= 0xFF;
3267 for (i = 0; i < (full ? 4 : 2); i++)
3268 aarch64_set_vec_u32 (cpu, vd, i, val);
3269 break;
3270
3271 case 0xe: /* 8-bit, no shift. */
3272 for (i = 0; i < (full ? 16 : 8); i++)
3273 aarch64_set_vec_u8 (cpu, vd, i, val);
3274 break;
3275
3276 case 0xf: /* FMOV Vs.{2|4}S, #fpimm. */
3277 {
3278 float u = fp_immediate_for_encoding_32 (val);
3279 for (i = 0; i < (full ? 4 : 2); i++)
3280 aarch64_set_vec_float (cpu, vd, i, u);
3281 break;
3282 }
3283
3284 default:
3285 HALT_NYI;
3286 }
3287 }
3288
3289 static void
3290 do_vec_MVNI (sim_cpu *cpu)
3291 {
3292 /* instr[31] = 0
3293 instr[30] = full/half selector
3294 instr[29,19] = 10111100000
3295 instr[18,16] = high 3 bits of uimm8
3296 instr[15,12] = selector
3297 instr[11,10] = 01
3298 instr[9,5] = low 5-bits of uimm8
3299 instr[4,0] = Vd. */
3300
3301 int full = INSTR (30, 30);
3302 unsigned vd = INSTR (4, 0);
3303 unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3304 unsigned i;
3305
3306 NYI_assert (29, 19, 0x5E0);
3307 NYI_assert (11, 10, 1);
3308
3309 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3310 switch (INSTR (15, 12))
3311 {
3312 case 0x0: /* 32-bit, no shift. */
3313 case 0x2: /* 32-bit, shift by 8. */
3314 case 0x4: /* 32-bit, shift by 16. */
3315 case 0x6: /* 32-bit, shift by 24. */
3316 val <<= (8 * INSTR (14, 13));
3317 val = ~ val;
3318 for (i = 0; i < (full ? 4 : 2); i++)
3319 aarch64_set_vec_u32 (cpu, vd, i, val);
3320 return;
3321
3322 case 0xa: /* 16-bit, 8 bit shift. */
3323 val <<= 8;
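/* Fall through. */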
3324 case 0x8: /* 16-bit, no shift. */
3325 val = ~ val;
3326 for (i = 0; i < (full ? 8 : 4); i++)
3327 aarch64_set_vec_u16 (cpu, vd, i, val);
3328 return;
3329
3330 case 0xd: /* 32-bit, mask shift by 16. */
3331 val <<= 8;
3332 val |= 0xFF;
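/* Fall through. */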
3333 case 0xc: /* 32-bit, mask shift by 8. */
3334 val <<= 8;
3335 val |= 0xFF;
3336 val = ~ val;
3337 for (i = 0; i < (full ? 4 : 2); i++)
3338 aarch64_set_vec_u32 (cpu, vd, i, val);
3339 return;
3340
3341 case 0xe: /* MOVI Dn, #mask64 */
3342 {
3343 uint64_t mask = 0;
3344
3345 for (i = 0; i < 8; i++)
3346 if (val & (1 << i))
3347 mask |= (0xFFUL << (i * 8));
3348 aarch64_set_vec_u64 (cpu, vd, 0, mask);
3349 aarch64_set_vec_u64 (cpu, vd, 1, mask);
3350 return;
3351 }
3352
3353 case 0xf: /* FMOV Vd.2D, #fpimm. */
3354 {
3355 double u = fp_immediate_for_encoding_64 (val);
3356
3357 if (! full)
3358 HALT_UNALLOC;
3359
3360 aarch64_set_vec_double (cpu, vd, 0, u);
3361 aarch64_set_vec_double (cpu, vd, 1, u);
3362 return;
3363 }
3364
3365 default:
3366 HALT_NYI;
3367 }
3368 }
3369
3370 #define ABS(A) ((A) < 0 ? - (A) : (A))
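/* N.B. for the most negative value the negation wraps back to the
   same value, matching the truncating behaviour of the hardware ABS
   instruction.  */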
3371
3372 static void
3373 do_vec_ABS (sim_cpu *cpu)
3374 {
3375 /* instr[31] = 0
3376 instr[30] = half(0)/full(1)
3377 instr[29,24] = 00 1110
3378 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3379 instr[21,10] = 10 0000 1011 10
3380 instr[9,5] = Vn
3381 instr[4,0] = Vd. */
3382
3383 unsigned vn = INSTR (9, 5);
3384 unsigned vd = INSTR (4, 0);
3385 unsigned full = INSTR (30, 30);
3386 unsigned i;
3387
3388 NYI_assert (29, 24, 0x0E);
3389 NYI_assert (21, 10, 0x82E);
3390
3391 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3392 switch (INSTR (23, 22))
3393 {
3394 case 0:
3395 for (i = 0; i < (full ? 16 : 8); i++)
3396 aarch64_set_vec_s8 (cpu, vd, i,
3397 ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3398 break;
3399
3400 case 1:
3401 for (i = 0; i < (full ? 8 : 4); i++)
3402 aarch64_set_vec_s16 (cpu, vd, i,
3403 ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3404 break;
3405
3406 case 2:
3407 for (i = 0; i < (full ? 4 : 2); i++)
3408 aarch64_set_vec_s32 (cpu, vd, i,
3409 ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3410 break;
3411
3412 case 3:
3413 if (! full)
3414 HALT_NYI;
3415 for (i = 0; i < 2; i++)
3416 aarch64_set_vec_s64 (cpu, vd, i,
3417 ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3418 break;
3419 }
3420 }
3421
3422 static void
3423 do_vec_ADDV (sim_cpu *cpu)
3424 {
3425 /* instr[31] = 0
3426 instr[30] = full/half selector
3427 instr[29,24] = 00 1110
3428 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3429 instr[21,10] = 11 0001 1011 10
3430 instr[9,5] = Vm
3431 instr[4,0] = Rd. */
3432
3433 unsigned vm = INSTR (9, 5);
3434 unsigned rd = INSTR (4, 0);
3435 unsigned i;
3436 uint64_t val = 0;
3437 int full = INSTR (30, 30);
3438
3439 NYI_assert (29, 24, 0x0E);
3440 NYI_assert (21, 10, 0xC6E);
3441
3442 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3443 switch (INSTR (23, 22))
3444 {
3445 case 0:
3446 for (i = 0; i < (full ? 16 : 8); i++)
3447 val += aarch64_get_vec_u8 (cpu, vm, i);
3448 aarch64_set_vec_u64 (cpu, rd, 0, val);
3449 return;
3450
3451 case 1:
3452 for (i = 0; i < (full ? 8 : 4); i++)
3453 val += aarch64_get_vec_u16 (cpu, vm, i);
3454 aarch64_set_vec_u64 (cpu, rd, 0, val);
3455 return;
3456
3457 case 2:
3458 if (! full)
3459 HALT_UNALLOC;
3460 for (i = 0; i < 4; i++)
3461 val += aarch64_get_vec_u32 (cpu, vm, i);
3462 aarch64_set_vec_u64 (cpu, rd, 0, val);
3463 return;
3464
3465 case 3:
3466 HALT_UNALLOC;
3467 }
3468 }
3469
3470 static void
3471 do_vec_ins_2 (sim_cpu *cpu)
3472 {
3473 /* instr[31,21] = 01001110000
3474 instr[20,18] = size & element selector
3475 instr[17,14] = 0000
3476 instr[13] = direction: to vec(0), from vec (1)
3477 instr[12,10] = 111
3478 instr[9,5] = Vm
3479 instr[4,0] = Vd. */
3480
3481 unsigned elem;
3482 unsigned vm = INSTR (9, 5);
3483 unsigned vd = INSTR (4, 0);
3484
3485 NYI_assert (31, 21, 0x270);
3486 NYI_assert (17, 14, 0);
3487 NYI_assert (12, 10, 7);
3488
3489 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3490 if (INSTR (13, 13) == 1)
3491 {
3492 if (INSTR (18, 18) == 1)
3493 {
3494 /* 32-bit moves. */
3495 elem = INSTR (20, 19);
3496 aarch64_set_reg_u64 (cpu, vd, NO_SP,
3497 aarch64_get_vec_u32 (cpu, vm, elem));
3498 }
3499 else
3500 {
3501 /* 64-bit moves. */
3502 if (INSTR (19, 19) != 1)
3503 HALT_NYI;
3504
3505 elem = INSTR (20, 20);
3506 aarch64_set_reg_u64 (cpu, vd, NO_SP,
3507 aarch64_get_vec_u64 (cpu, vm, elem));
3508 }
3509 }
3510 else
3511 {
3512 if (INSTR (18, 18) == 1)
3513 {
3514 /* 32-bit moves. */
3515 elem = INSTR (20, 19);
3516 aarch64_set_vec_u32 (cpu, vd, elem,
3517 aarch64_get_reg_u32 (cpu, vm, NO_SP));
3518 }
3519 else
3520 {
3521 /* 64-bit moves. */
3522 if (INSTR (19, 19) != 1)
3523 HALT_NYI;
3524
3525 elem = INSTR (20, 20);
3526 aarch64_set_vec_u64 (cpu, vd, elem,
3527 aarch64_get_reg_u64 (cpu, vm, NO_SP));
3528 }
3529 }
3530 }
3531
3532 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE) \
3533 do \
3534 { \
3535 DST_TYPE a[N], b[N]; \
3536 \
3537 for (i = 0; i < (N); i++) \
3538 { \
3539 a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3540 b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3541 } \
3542 for (i = 0; i < (N); i++) \
3543 aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]); \
3544 } \
3545 while (0)
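/* N.B. "bias" is the index of the first source element: zero for the
   xMULL forms, which read the lower halves of the source vectors, and
   N for the xMULL2 forms, which read the upper halves.  The sources
   are copied into temporaries first so that nothing is lost when the
   destination vector overlaps a source vector.  */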
3546
3547 static void
3548 do_vec_mull (sim_cpu *cpu)
3549 {
3550 /* instr[31] = 0
3551 instr[30] = lower(0)/upper(1) selector
3552 instr[29] = signed(0)/unsigned(1)
3553 instr[28,24] = 0 1110
3554 instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3555 instr[21] = 1
3556 instr[20,16] = Vm
3557 instr[15,10] = 11 0000
3558 instr[9,5] = Vn
3559 instr[4,0] = Vd. */
3560
3561 int unsign = INSTR (29, 29);
3562 int bias = INSTR (30, 30);
3563 unsigned vm = INSTR (20, 16);
3564 unsigned vn = INSTR ( 9, 5);
3565 unsigned vd = INSTR ( 4, 0);
3566 unsigned i;
3567
3568 NYI_assert (28, 24, 0x0E);
3569 NYI_assert (15, 10, 0x30);
3570
3571 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3572 /* NB: Read source values before writing results, in case
3573 the source and destination vectors are the same. */
3574 switch (INSTR (23, 22))
3575 {
3576 case 0:
3577 if (bias)
3578 bias = 8;
3579 if (unsign)
3580 DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3581 else
3582 DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3583 return;
3584
3585 case 1:
3586 if (bias)
3587 bias = 4;
3588 if (unsign)
3589 DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3590 else
3591 DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3592 return;
3593
3594 case 2:
3595 if (bias)
3596 bias = 2;
3597 if (unsign)
3598 DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3599 else
3600 DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3601 return;
3602
3603 case 3:
3604 HALT_NYI;
3605 }
3606 }
3607
3608 static void
3609 do_vec_fadd (sim_cpu *cpu)
3610 {
3611 /* instr[31] = 0
3612 instr[30] = half(0)/full(1)
3613 instr[29,24] = 001110
3614 instr[23] = FADD(0)/FSUB(1)
3615 instr[22] = float (0)/double(1)
3616 instr[21] = 1
3617 instr[20,16] = Vm
3618 instr[15,10] = 110101
3619 instr[9,5] = Vn
3620 instr[4,0] = Vd. */
3621
3622 unsigned vm = INSTR (20, 16);
3623 unsigned vn = INSTR (9, 5);
3624 unsigned vd = INSTR (4, 0);
3625 unsigned i;
3626 int full = INSTR (30, 30);
3627
3628 NYI_assert (29, 24, 0x0E);
3629 NYI_assert (21, 21, 1);
3630 NYI_assert (15, 10, 0x35);
3631
3632 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3633 if (INSTR (23, 23))
3634 {
3635 if (INSTR (22, 22))
3636 {
3637 if (! full)
3638 HALT_NYI;
3639
3640 for (i = 0; i < 2; i++)
3641 aarch64_set_vec_double (cpu, vd, i,
3642 aarch64_get_vec_double (cpu, vn, i)
3643 - aarch64_get_vec_double (cpu, vm, i));
3644 }
3645 else
3646 {
3647 for (i = 0; i < (full ? 4 : 2); i++)
3648 aarch64_set_vec_float (cpu, vd, i,
3649 aarch64_get_vec_float (cpu, vn, i)
3650 - aarch64_get_vec_float (cpu, vm, i));
3651 }
3652 }
3653 else
3654 {
3655 if (INSTR (22, 22))
3656 {
3657 if (! full)
3658 HALT_NYI;
3659
3660 for (i = 0; i < 2; i++)
3661 aarch64_set_vec_double (cpu, vd, i,
3662 aarch64_get_vec_double (cpu, vm, i)
3663 + aarch64_get_vec_double (cpu, vn, i));
3664 }
3665 else
3666 {
3667 for (i = 0; i < (full ? 4 : 2); i++)
3668 aarch64_set_vec_float (cpu, vd, i,
3669 aarch64_get_vec_float (cpu, vm, i)
3670 + aarch64_get_vec_float (cpu, vn, i));
3671 }
3672 }
3673 }
3674
3675 static void
3676 do_vec_add (sim_cpu *cpu)
3677 {
3678 /* instr[31] = 0
3679 instr[30] = full/half selector
3680 instr[29,24] = 001110
3681 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3682 instr[21] = 1
3683 instr[20,16] = Vm
3684 instr[15,10] = 100001
3685 instr[9,5] = Vn
3686 instr[4,0] = Vd. */
3687
3688 unsigned vm = INSTR (20, 16);
3689 unsigned vn = INSTR (9, 5);
3690 unsigned vd = INSTR (4, 0);
3691 unsigned i;
3692 int full = INSTR (30, 30);
3693
3694 NYI_assert (29, 24, 0x0E);
3695 NYI_assert (21, 21, 1);
3696 NYI_assert (15, 10, 0x21);
3697
3698 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3699 switch (INSTR (23, 22))
3700 {
3701 case 0:
3702 for (i = 0; i < (full ? 16 : 8); i++)
3703 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3704 + aarch64_get_vec_u8 (cpu, vm, i));
3705 return;
3706
3707 case 1:
3708 for (i = 0; i < (full ? 8 : 4); i++)
3709 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3710 + aarch64_get_vec_u16 (cpu, vm, i));
3711 return;
3712
3713 case 2:
3714 for (i = 0; i < (full ? 4 : 2); i++)
3715 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3716 + aarch64_get_vec_u32 (cpu, vm, i));
3717 return;
3718
3719 case 3:
3720 if (! full)
3721 HALT_UNALLOC;
3722 aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3723 + aarch64_get_vec_u64 (cpu, vm, 0));
3724 aarch64_set_vec_u64 (cpu, vd, 1,
3725 aarch64_get_vec_u64 (cpu, vn, 1)
3726 + aarch64_get_vec_u64 (cpu, vm, 1));
3727 return;
3728 }
3729 }
3730
3731 static void
3732 do_vec_mul (sim_cpu *cpu)
3733 {
3734 /* instr[31] = 0
3735 instr[30] = full/half selector
3736 instr[29,24] = 00 1110
3737 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3738 instr[21] = 1
3739 instr[20,16] = Vm
3740 instr[15,10] = 10 0111
3741 instr[9,5] = Vn
3742 instr[4,0] = Vd. */
3743
3744 unsigned vm = INSTR (20, 16);
3745 unsigned vn = INSTR (9, 5);
3746 unsigned vd = INSTR (4, 0);
3747 unsigned i;
3748 int full = INSTR (30, 30);
3749 int bias = 0;
3750
3751 NYI_assert (29, 24, 0x0E);
3752 NYI_assert (21, 21, 1);
3753 NYI_assert (15, 10, 0x27);
3754
3755 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3756 switch (INSTR (23, 22))
3757 {
3758 case 0:
3759 DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3760 return;
3761
3762 case 1:
3763 DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3764 return;
3765
3766 case 2:
3767 DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3768 return;
3769
3770 case 3:
3771 HALT_UNALLOC;
3772 }
3773 }
3774
3775 static void
3776 do_vec_MLA (sim_cpu *cpu)
3777 {
3778 /* instr[31] = 0
3779 instr[30] = full/half selector
3780 instr[29,24] = 00 1110
3781 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3782 instr[21] = 1
3783 instr[20,16] = Vm
3784 instr[15,10] = 1001 01
3785 instr[9,5] = Vn
3786 instr[4,0] = Vd. */
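/* N.B. MLA is not a widening operation; each product is accumulated
   into a destination element of the same size as its sources.  */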
3787
3788 unsigned vm = INSTR (20, 16);
3789 unsigned vn = INSTR (9, 5);
3790 unsigned vd = INSTR (4, 0);
3791 unsigned i;
3792 int full = INSTR (30, 30);
3793
3794 NYI_assert (29, 24, 0x0E);
3795 NYI_assert (21, 21, 1);
3796 NYI_assert (15, 10, 0x25);
3797
3798 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3799 switch (INSTR (23, 22))
3800 {
3801 case 0:
3802 {
3803 uint16_t a[16], b[16];
3804
3805 for (i = 0; i < (full ? 16 : 8); i++)
3806 {
3807 a[i] = aarch64_get_vec_u8 (cpu, vn, i);
3808 b[i] = aarch64_get_vec_u8 (cpu, vm, i);
3809 }
3810
3811 for (i = 0; i < (full ? 16 : 8); i++)
3812 {
3813 uint16_t v = aarch64_get_vec_u8 (cpu, vd, i);
3814
3815 aarch64_set_vec_u8 (cpu, vd, i, v + (a[i] * b[i]));
3816 }
3817 }
3818 return;
3819
3820 case 1:
3821 {
3822 uint32_t a[8], b[8];
3823
3824 for (i = 0; i < (full ? 8 : 4); i++)
3825 {
3826 a[i] = aarch64_get_vec_u16 (cpu, vn, i);
3827 b[i] = aarch64_get_vec_u16 (cpu, vm, i);
3828 }
3829
3830 for (i = 0; i < (full ? 8 : 4); i++)
3831 {
3832 uint32_t v = aarch64_get_vec_u16 (cpu, vd, i);
3833
3834 aarch64_set_vec_u16 (cpu, vd, i, v + (a[i] * b[i]));
3835 }
3836 }
3837 return;
3838
3839 case 2:
3840 {
3841 uint64_t a[4], b[4];
3842
3843 for (i = 0; i < (full ? 4 : 2); i++)
3844 {
3845 a[i] = aarch64_get_vec_u32 (cpu, vn, i);
3846 b[i] = aarch64_get_vec_u32 (cpu, vm, i);
3847 }
3848
3849 for (i = 0; i < (full ? 4 : 2); i++)
3850 {
3851 uint64_t v = aarch64_get_vec_u32 (cpu, vd, i);
3852
3853 aarch64_set_vec_u32 (cpu, vd, i, v + (a[i] * b[i]));
3854 }
3855 }
3856 return;
3857
3858 case 3:
3859 HALT_UNALLOC;
3860 }
3861 }
3862
3863 static float
3864 fmaxnm (float a, float b)
3865 {
3866 if (! isnan (a))
3867 {
3868 if (! isnan (b))
3869 return a > b ? a : b;
3870 return a;
3871 }
3872 else if (! isnan (b))
3873 return b;
3874 return a;
3875 }
3876
3877 static float
3878 fminnm (float a, float b)
3879 {
3880 if (! isnan (a))
3881 {
3882 if (! isnan (b))
3883 return a < b ? a : b;
3884 return a;
3885 }
3886 else if (! isnan (b))
3887 return b;
3888 return a;
3889 }
3890
3891 static double
3892 dmaxnm (double a, double b)
3893 {
3894 if (! isnan (a))
3895 {
3896 if (! isnan (b))
3897 return a > b ? a : b;
3898 return a;
3899 }
3900 else if (! isnan (b))
3901 return b;
3902 return a;
3903 }
3904
3905 static double
3906 dminnm (double a, double b)
3907 {
3908 if (! isnan (a))
3909 {
3910 if (! isnan (b))
3911 return a < b ? a : b;
3912 return a;
3913 }
3914 else if (! isnan (b))
3915 return b;
3916 return a;
3917 }
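/* These helpers implement the FMAXNM/FMINNM treatment of NaNs: if
   exactly one operand is a NaN the numeric operand is returned, and
   a NaN results only when both operands are NaNs.  */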
3918
3919 static void
3920 do_vec_FminmaxNMP (sim_cpu *cpu)
3921 {
3922 /* instr [31] = 0
3923 instr [30] = half (0)/full (1)
3924 instr [29,24] = 10 1110
3925 instr [23] = max(0)/min(1)
3926 instr [22] = float (0)/double (1)
3927 instr [21] = 1
3928 instr [20,16] = Vm
3929 instr [15,10] = 1100 01
3930 instr [9,5] = Vn
3931 instr [4,0] = Vd. */
3932
3933 unsigned vm = INSTR (20, 16);
3934 unsigned vn = INSTR (9, 5);
3935 unsigned vd = INSTR (4, 0);
3936 int full = INSTR (30, 30);
3937
3938 NYI_assert (29, 24, 0x2E);
3939 NYI_assert (21, 21, 1);
3940 NYI_assert (15, 10, 0x31);
3941
3942 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3943 if (INSTR (22, 22))
3944 {
3945 double (* fn)(double, double) = INSTR (23, 23)
3946 ? dminnm : dmaxnm;
3947
3948 if (! full)
3949 HALT_NYI;
3950 aarch64_set_vec_double (cpu, vd, 0,
3951 fn (aarch64_get_vec_double (cpu, vn, 0),
3952 aarch64_get_vec_double (cpu, vn, 1)));
3953 aarch64_set_vec_double (cpu, vd, 1,
3954 fn (aarch64_get_vec_double (cpu, vm, 0),
3955 aarch64_get_vec_double (cpu, vm, 1)));
3956 }
3957 else
3958 {
3959 float (* fn)(float, float) = INSTR (23, 23)
3960 ? fminnm : fmaxnm;
3961
3962 aarch64_set_vec_float (cpu, vd, 0,
3963 fn (aarch64_get_vec_float (cpu, vn, 0),
3964 aarch64_get_vec_float (cpu, vn, 1)));
3965 if (full)
3966 aarch64_set_vec_float (cpu, vd, 1,
3967 fn (aarch64_get_vec_float (cpu, vn, 2),
3968 aarch64_get_vec_float (cpu, vn, 3)));
3969
3970 aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
3971 fn (aarch64_get_vec_float (cpu, vm, 0),
3972 aarch64_get_vec_float (cpu, vm, 1)));
3973 if (full)
3974 aarch64_set_vec_float (cpu, vd, 3,
3975 fn (aarch64_get_vec_float (cpu, vm, 2),
3976 aarch64_get_vec_float (cpu, vm, 3)));
3977 }
3978 }
3979
3980 static void
3981 do_vec_AND (sim_cpu *cpu)
3982 {
3983 /* instr[31] = 0
3984 instr[30] = half (0)/full (1)
3985 instr[29,21] = 001110001
3986 instr[20,16] = Vm
3987 instr[15,10] = 000111
3988 instr[9,5] = Vn
3989 instr[4,0] = Vd. */
3990
3991 unsigned vm = INSTR (20, 16);
3992 unsigned vn = INSTR (9, 5);
3993 unsigned vd = INSTR (4, 0);
3994 unsigned i;
3995 int full = INSTR (30, 30);
3996
3997 NYI_assert (29, 21, 0x071);
3998 NYI_assert (15, 10, 0x07);
3999
4000 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4001 for (i = 0; i < (full ? 4 : 2); i++)
4002 aarch64_set_vec_u32 (cpu, vd, i,
4003 aarch64_get_vec_u32 (cpu, vn, i)
4004 & aarch64_get_vec_u32 (cpu, vm, i));
4005 }
4006
4007 static void
4008 do_vec_BSL (sim_cpu *cpu)
4009 {
4010 /* instr[31] = 0
4011 instr[30] = half (0)/full (1)
4012 instr[29,21] = 101110011
4013 instr[20,16] = Vm
4014 instr[15,10] = 000111
4015 instr[9,5] = Vn
4016 instr[4,0] = Vd. */
4017
4018 unsigned vm = INSTR (20, 16);
4019 unsigned vn = INSTR (9, 5);
4020 unsigned vd = INSTR (4, 0);
4021 unsigned i;
4022 int full = INSTR (30, 30);
4023
4024 NYI_assert (29, 21, 0x173);
4025 NYI_assert (15, 10, 0x07);
4026
4027 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
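/* Bitwise select: each result bit comes from Vn where the corresponding
Vd (selector) bit is set, and from Vm where it is clear, i.e.
vd = (vd & vn) | (~vd & vm). */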
4028 for (i = 0; i < (full ? 16 : 8); i++)
4029 aarch64_set_vec_u8 (cpu, vd, i,
4030 ( aarch64_get_vec_u8 (cpu, vd, i)
4031 & aarch64_get_vec_u8 (cpu, vn, i))
4032 | ((~ aarch64_get_vec_u8 (cpu, vd, i))
4033 & aarch64_get_vec_u8 (cpu, vm, i)));
4034 }
4035
4036 static void
4037 do_vec_EOR (sim_cpu *cpu)
4038 {
4039 /* instr[31] = 0
4040 instr[30] = half (0)/full (1)
4041 instr[29,21] = 10 1110 001
4042 instr[20,16] = Vm
4043 instr[15,10] = 000111
4044 instr[9,5] = Vn
4045 instr[4,0] = Vd. */
4046
4047 unsigned vm = INSTR (20, 16);
4048 unsigned vn = INSTR (9, 5);
4049 unsigned vd = INSTR (4, 0);
4050 unsigned i;
4051 int full = INSTR (30, 30);
4052
4053 NYI_assert (29, 21, 0x171);
4054 NYI_assert (15, 10, 0x07);
4055
4056 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4057 for (i = 0; i < (full ? 4 : 2); i++)
4058 aarch64_set_vec_u32 (cpu, vd, i,
4059 aarch64_get_vec_u32 (cpu, vn, i)
4060 ^ aarch64_get_vec_u32 (cpu, vm, i));
4061 }
4062
4063 static void
4064 do_vec_bit (sim_cpu *cpu)
4065 {
4066 /* instr[31] = 0
4067 instr[30] = half (0)/full (1)
4068 instr[29,23] = 10 1110 1
4069 instr[22] = BIT (0) / BIF (1)
4070 instr[21] = 1
4071 instr[20,16] = Vm
4072 instr[15,10] = 0001 11
4073 instr[9,5] = Vn
4074 instr[4,0] = Vd. */
4075
4076 unsigned vm = INSTR (20, 16);
4077 unsigned vn = INSTR (9, 5);
4078 unsigned vd = INSTR (4, 0);
4079 unsigned full = INSTR (30, 30);
4080 unsigned test_false = INSTR (22, 22);
4081 unsigned i;
4082
4083 NYI_assert (29, 23, 0x5D);
4084 NYI_assert (21, 21, 1);
4085 NYI_assert (15, 10, 0x07);
4086
4087 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4088 for (i = 0; i < (full ? 4 : 2); i++)
4089 {
4090 /* BIT inserts the Vn bits where Vm is set, BIF where Vm is clear. */
4091 uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4092 uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4093 uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4094 uint32_t sel = test_false ? ~vm_val : vm_val;
4095 
4096 aarch64_set_vec_u32 (cpu, vd, i,
4097 (vd_val & ~sel) | (vn_val & sel));
4098 }
4099 }
4101
4102 static void
4103 do_vec_ORN (sim_cpu *cpu)
4104 {
4105 /* instr[31] = 0
4106 instr[30] = half (0)/full (1)
4107 instr[29,21] = 00 1110 111
4108 instr[20,16] = Vm
4109 instr[15,10] = 00 0111
4110 instr[9,5] = Vn
4111 instr[4,0] = Vd. */
4112
4113 unsigned vm = INSTR (20, 16);
4114 unsigned vn = INSTR (9, 5);
4115 unsigned vd = INSTR (4, 0);
4116 unsigned i;
4117 int full = INSTR (30, 30);
4118
4119 NYI_assert (29, 21, 0x077);
4120 NYI_assert (15, 10, 0x07);
4121
4122 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4123 for (i = 0; i < (full ? 16 : 8); i++)
4124 aarch64_set_vec_u8 (cpu, vd, i,
4125 aarch64_get_vec_u8 (cpu, vn, i)
4126 | ~ aarch64_get_vec_u8 (cpu, vm, i));
4127 }
4128
4129 static void
4130 do_vec_ORR (sim_cpu *cpu)
4131 {
4132 /* instr[31] = 0
4133 instr[30] = half (0)/full (1)
4134 instr[29,21] = 00 1110 101
4135 instr[20,16] = Vm
4136 instr[15,10] = 0001 11
4137 instr[9,5] = Vn
4138 instr[4,0] = Vd. */
4139
4140 unsigned vm = INSTR (20, 16);
4141 unsigned vn = INSTR (9, 5);
4142 unsigned vd = INSTR (4, 0);
4143 unsigned i;
4144 int full = INSTR (30, 30);
4145
4146 NYI_assert (29, 21, 0x075);
4147 NYI_assert (15, 10, 0x07);
4148
4149 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4150 for (i = 0; i < (full ? 16 : 8); i++)
4151 aarch64_set_vec_u8 (cpu, vd, i,
4152 aarch64_get_vec_u8 (cpu, vn, i)
4153 | aarch64_get_vec_u8 (cpu, vm, i));
4154 }
4155
4156 static void
4157 do_vec_BIC (sim_cpu *cpu)
4158 {
4159 /* instr[31] = 0
4160 instr[30] = half (0)/full (1)
4161 instr[29,21] = 00 1110 011
4162 instr[20,16] = Vm
4163 instr[15,10] = 00 0111
4164 instr[9,5] = Vn
4165 instr[4,0] = Vd. */
4166
4167 unsigned vm = INSTR (20, 16);
4168 unsigned vn = INSTR (9, 5);
4169 unsigned vd = INSTR (4, 0);
4170 unsigned i;
4171 int full = INSTR (30, 30);
4172
4173 NYI_assert (29, 21, 0x073);
4174 NYI_assert (15, 10, 0x07);
4175
4176 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4177 for (i = 0; i < (full ? 16 : 8); i++)
4178 aarch64_set_vec_u8 (cpu, vd, i,
4179 aarch64_get_vec_u8 (cpu, vn, i)
4180 & ~ aarch64_get_vec_u8 (cpu, vm, i));
4181 }
4182
4183 static void
4184 do_vec_XTN (sim_cpu *cpu)
4185 {
4186 /* instr[31] = 0
4187 instr[30] = first part (0)/ second part (1)
4188 instr[29,24] = 00 1110
4189 instr[23,22] = size: byte(00), half(01), word (10)
4190 instr[21,10] = 1000 0100 1010
4191 instr[9,5] = Vs
4192 instr[4,0] = Vd. */
4193
4194 unsigned vs = INSTR (9, 5);
4195 unsigned vd = INSTR (4, 0);
4196 unsigned bias = INSTR (30, 30);
4197 unsigned i;
4198
4199 NYI_assert (29, 24, 0x0E);
4200 NYI_assert (21, 10, 0x84A);
4201
4202 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4203 switch (INSTR (23, 22))
4204 {
4205 case 0:
4206 for (i = 0; i < 8; i++)
4207 aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4208 aarch64_get_vec_u16 (cpu, vs, i));
4209 return;
4210
4211 case 1:
4212 for (i = 0; i < 4; i++)
4213 aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4214 aarch64_get_vec_u32 (cpu, vs, i));
4215 return;
4216
4217 case 2:
4218 for (i = 0; i < 2; i++)
4219 aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4220 aarch64_get_vec_u64 (cpu, vs, i));
4221 return;
4222 }
4223 }
4224
4225 static void
4226 do_vec_maxv (sim_cpu *cpu)
4227 {
4228 /* instr[31] = 0
4229 instr[30] = half(0)/full(1)
4230 instr[29] = signed (0)/unsigned(1)
4231 instr[28,24] = 0 1110
4232 instr[23,22] = size: byte(00), half(01), word (10)
4233 instr[21] = 1
4234 instr[20,17] = 1 000
4235 instr[16] = max(0)/min(1)
4236 instr[15,10] = 1010 10
4237 instr[9,5] = V source
4238 instr[4,0] = R dest. */
4239
4240 unsigned vs = INSTR (9, 5);
4241 unsigned rd = INSTR (4, 0);
4242 unsigned full = INSTR (30, 30);
4243 unsigned i;
4244
4245 NYI_assert (28, 24, 0x0E);
4246 NYI_assert (21, 21, 1);
4247 NYI_assert (20, 17, 8);
4248 NYI_assert (15, 10, 0x2A);
4249
4250 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4251 switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4252 {
4253 case 0: /* SMAXV. */
4254 {
4255 int64_t smax;
4256 switch (INSTR (23, 22))
4257 {
4258 case 0:
4259 smax = aarch64_get_vec_s8 (cpu, vs, 0);
4260 for (i = 1; i < (full ? 16 : 8); i++)
4261 smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4262 break;
4263 case 1:
4264 smax = aarch64_get_vec_s16 (cpu, vs, 0);
4265 for (i = 1; i < (full ? 8 : 4); i++)
4266 smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4267 break;
4268 case 2:
4269 smax = aarch64_get_vec_s32 (cpu, vs, 0);
4270 for (i = 1; i < (full ? 4 : 2); i++)
4271 smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4272 break;
4273 case 3:
4274 HALT_UNALLOC;
4275 }
4276 aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4277 return;
4278 }
4279
4280 case 1: /* SMINV. */
4281 {
4282 int64_t smin;
4283 switch (INSTR (23, 22))
4284 {
4285 case 0:
4286 smin = aarch64_get_vec_s8 (cpu, vs, 0);
4287 for (i = 1; i < (full ? 16 : 8); i++)
4288 smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4289 break;
4290 case 1:
4291 smin = aarch64_get_vec_s16 (cpu, vs, 0);
4292 for (i = 1; i < (full ? 8 : 4); i++)
4293 smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4294 break;
4295 case 2:
4296 smin = aarch64_get_vec_s32 (cpu, vs, 0);
4297 for (i = 1; i < (full ? 4 : 2); i++)
4298 smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4299 break;
4300
4301 case 3:
4302 HALT_UNALLOC;
4303 }
4304 aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4305 return;
4306 }
4307
4308 case 2: /* UMAXV. */
4309 {
4310 uint64_t umax;
4311 switch (INSTR (23, 22))
4312 {
4313 case 0:
4314 umax = aarch64_get_vec_u8 (cpu, vs, 0);
4315 for (i = 1; i < (full ? 16 : 8); i++)
4316 umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4317 break;
4318 case 1:
4319 umax = aarch64_get_vec_u16 (cpu, vs, 0);
4320 for (i = 1; i < (full ? 8 : 4); i++)
4321 umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4322 break;
4323 case 2:
4324 umax = aarch64_get_vec_u32 (cpu, vs, 0);
4325 for (i = 1; i < (full ? 4 : 2); i++)
4326 umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4327 break;
4328
4329 case 3:
4330 HALT_UNALLOC;
4331 }
4332 aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4333 return;
4334 }
4335
4336 case 3: /* UMINV. */
4337 {
4338 uint64_t umin;
4339 switch (INSTR (23, 22))
4340 {
4341 case 0:
4342 umin = aarch64_get_vec_u8 (cpu, vs, 0);
4343 for (i = 1; i < (full ? 16 : 8); i++)
4344 umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4345 break;
4346 case 1:
4347 umin = aarch64_get_vec_u16 (cpu, vs, 0);
4348 for (i = 1; i < (full ? 8 : 4); i++)
4349 umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4350 break;
4351 case 2:
4352 umin = aarch64_get_vec_u32 (cpu, vs, 0);
4353 for (i = 1; i < (full ? 4 : 2); i++)
4354 umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4355 break;
4356
4357 case 3:
4358 HALT_UNALLOC;
4359 }
4360 aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4361 return;
4362 }
4363 }
4364 }
4365
4366 static void
4367 do_vec_fminmaxV (sim_cpu *cpu)
4368 {
4369 /* instr[31,24] = 0110 1110
4370 instr[23] = max(0)/min(1)
4371 instr[22,14] = 011 0000 11
4372 instr[13,12] = nm(00)/normal(11)
4373 instr[11,10] = 10
4374 instr[9,5] = V source
4375 instr[4,0] = R dest. */
4376
4377 unsigned vs = INSTR (9, 5);
4378 unsigned rd = INSTR (4, 0);
4379 unsigned i;
4380 float res = aarch64_get_vec_float (cpu, vs, 0);
4381
4382 NYI_assert (31, 24, 0x6E);
4383 NYI_assert (22, 14, 0x0C3);
4384 NYI_assert (11, 10, 2);
4385
4386 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4387 if (INSTR (23, 23))
4388 {
4389 switch (INSTR (13, 12))
4390 {
4391 case 0: /* FMINNMV. */
4392 for (i = 1; i < 4; i++)
4393 res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4394 break;
4395
4396 case 3: /* FMINV. */
4397 for (i = 1; i < 4; i++)
4398 res = min (res, aarch64_get_vec_float (cpu, vs, i));
4399 break;
4400
4401 default:
4402 HALT_NYI;
4403 }
4404 }
4405 else
4406 {
4407 switch (INSTR (13, 12))
4408 {
4409 case 0: /* FMAXNMV. */
4410 for (i = 1; i < 4; i++)
4411 res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4412 break;
4413
4414 case 3: /* FMAXV. */
4415 for (i = 1; i < 4; i++)
4416 res = max (res, aarch64_get_vec_float (cpu, vs, i));
4417 break;
4418
4419 default:
4420 HALT_NYI;
4421 }
4422 }
4423
4424 aarch64_set_FP_float (cpu, rd, res);
4425 }
4426
4427 static void
4428 do_vec_Fminmax (sim_cpu *cpu)
4429 {
4430 /* instr[31] = 0
4431 instr[30] = half(0)/full(1)
4432 instr[29,24] = 00 1110
4433 instr[23] = max(0)/min(1)
4434 instr[22] = float(0)/double(1)
4435 instr[21] = 1
4436 instr[20,16] = Vm
4437 instr[15,14] = 11
4438 instr[13,12] = nm(00)/normal(11)
4439 instr[11,10] = 01
4440 instr[9,5] = Vn
4441 instr[4,0] = Vd. */
4442
4443 unsigned vm = INSTR (20, 16);
4444 unsigned vn = INSTR (9, 5);
4445 unsigned vd = INSTR (4, 0);
4446 unsigned full = INSTR (30, 30);
4447 unsigned min = INSTR (23, 23);
4448 unsigned i;
4449
4450 NYI_assert (29, 24, 0x0E);
4451 NYI_assert (21, 21, 1);
4452 NYI_assert (15, 14, 3);
4453 NYI_assert (11, 10, 1);
4454
4455 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4456 if (INSTR (22, 22))
4457 {
4458 double (* func)(double, double);
4459
4460 if (! full)
4461 HALT_NYI;
4462
4463 if (INSTR (13, 12) == 0)
4464 func = min ? dminnm : dmaxnm;
4465 else if (INSTR (13, 12) == 3)
4466 func = min ? fmin : fmax;
4467 else
4468 HALT_NYI;
4469
4470 for (i = 0; i < 2; i++)
4471 aarch64_set_vec_double (cpu, vd, i,
4472 func (aarch64_get_vec_double (cpu, vn, i),
4473 aarch64_get_vec_double (cpu, vm, i)));
4474 }
4475 else
4476 {
4477 float (* func)(float, float);
4478
4479 if (INSTR (13, 12) == 0)
4480 func = min ? fminnm : fmaxnm;
4481 else if (INSTR (13, 12) == 3)
4482 func = min ? fminf : fmaxf;
4483 else
4484 HALT_NYI;
4485
4486 for (i = 0; i < (full ? 4 : 2); i++)
4487 aarch64_set_vec_float (cpu, vd, i,
4488 func (aarch64_get_vec_float (cpu, vn, i),
4489 aarch64_get_vec_float (cpu, vm, i)));
4490 }
4491 }
4492
4493 static void
4494 do_vec_SCVTF (sim_cpu *cpu)
4495 {
4496 /* instr[31] = 0
4497 instr[30] = Q
4498 instr[29,23] = 00 1110 0
4499 instr[22] = float(0)/double(1)
4500 instr[21,10] = 10 0001 1101 10
4501 instr[9,5] = Vn
4502 instr[4,0] = Vd. */
4503
4504 unsigned vn = INSTR (9, 5);
4505 unsigned vd = INSTR (4, 0);
4506 unsigned full = INSTR (30, 30);
4507 unsigned size = INSTR (22, 22);
4508 unsigned i;
4509
4510 NYI_assert (29, 23, 0x1C);
4511 NYI_assert (21, 10, 0x876);
4512
4513 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4514 if (size)
4515 {
4516 if (! full)
4517 HALT_UNALLOC;
4518
4519 for (i = 0; i < 2; i++)
4520 {
4521 double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4522 aarch64_set_vec_double (cpu, vd, i, val);
4523 }
4524 }
4525 else
4526 {
4527 for (i = 0; i < (full ? 4 : 2); i++)
4528 {
4529 float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4530 aarch64_set_vec_float (cpu, vd, i, val);
4531 }
4532 }
4533 }
4534
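/* The comparison macros below write an all-ones element (-1) into the
destination lane when the comparison holds and zero otherwise.
VEC_CMP and VEC_CMP0 implement the integer compares against a
register or against zero; VEC_FCMP and VEC_FCMP0 are the float and
double counterparts. */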
4535 #define VEC_CMP(SOURCE, CMP) \
4536 do \
4537 { \
4538 switch (size) \
4539 { \
4540 case 0: \
4541 for (i = 0; i < (full ? 16 : 8); i++) \
4542 aarch64_set_vec_u8 (cpu, vd, i, \
4543 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4544 CMP \
4545 aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4546 ? -1 : 0); \
4547 return; \
4548 case 1: \
4549 for (i = 0; i < (full ? 8 : 4); i++) \
4550 aarch64_set_vec_u16 (cpu, vd, i, \
4551 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4552 CMP \
4553 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4554 ? -1 : 0); \
4555 return; \
4556 case 2: \
4557 for (i = 0; i < (full ? 4 : 2); i++) \
4558 aarch64_set_vec_u32 (cpu, vd, i, \
4559 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4560 CMP \
4561 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4562 ? -1 : 0); \
4563 return; \
4564 case 3: \
4565 if (! full) \
4566 HALT_UNALLOC; \
4567 for (i = 0; i < 2; i++) \
4568 aarch64_set_vec_u64 (cpu, vd, i, \
4569 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4570 CMP \
4571 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4572 ? -1ULL : 0); \
4573 return; \
4574 } \
4575 } \
4576 while (0)
4577
4578 #define VEC_CMP0(SOURCE, CMP) \
4579 do \
4580 { \
4581 switch (size) \
4582 { \
4583 case 0: \
4584 for (i = 0; i < (full ? 16 : 8); i++) \
4585 aarch64_set_vec_u8 (cpu, vd, i, \
4586 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4587 CMP 0 ? -1 : 0); \
4588 return; \
4589 case 1: \
4590 for (i = 0; i < (full ? 8 : 4); i++) \
4591 aarch64_set_vec_u16 (cpu, vd, i, \
4592 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4593 CMP 0 ? -1 : 0); \
4594 return; \
4595 case 2: \
4596 for (i = 0; i < (full ? 4 : 2); i++) \
4597 aarch64_set_vec_u32 (cpu, vd, i, \
4598 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4599 CMP 0 ? -1 : 0); \
4600 return; \
4601 case 3: \
4602 if (! full) \
4603 HALT_UNALLOC; \
4604 for (i = 0; i < 2; i++) \
4605 aarch64_set_vec_u64 (cpu, vd, i, \
4606 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4607 CMP 0 ? -1ULL : 0); \
4608 return; \
4609 } \
4610 } \
4611 while (0)
4612
4613 #define VEC_FCMP0(CMP) \
4614 do \
4615 { \
4616 if (vm != 0) \
4617 HALT_NYI; \
4618 if (INSTR (22, 22)) \
4619 { \
4620 if (! full) \
4621 HALT_NYI; \
4622 for (i = 0; i < 2; i++) \
4623 aarch64_set_vec_u64 (cpu, vd, i, \
4624 aarch64_get_vec_double (cpu, vn, i) \
4625 CMP 0.0 ? -1 : 0); \
4626 } \
4627 else \
4628 { \
4629 for (i = 0; i < (full ? 4 : 2); i++) \
4630 aarch64_set_vec_u32 (cpu, vd, i, \
4631 aarch64_get_vec_float (cpu, vn, i) \
4632 CMP 0.0 ? -1 : 0); \
4633 } \
4634 return; \
4635 } \
4636 while (0)
4637
4638 #define VEC_FCMP(CMP) \
4639 do \
4640 { \
4641 if (INSTR (22, 22)) \
4642 { \
4643 if (! full) \
4644 HALT_NYI; \
4645 for (i = 0; i < 2; i++) \
4646 aarch64_set_vec_u64 (cpu, vd, i, \
4647 aarch64_get_vec_double (cpu, vn, i) \
4648 CMP \
4649 aarch64_get_vec_double (cpu, vm, i) \
4650 ? -1 : 0); \
4651 } \
4652 else \
4653 { \
4654 for (i = 0; i < (full ? 4 : 2); i++) \
4655 aarch64_set_vec_u32 (cpu, vd, i, \
4656 aarch64_get_vec_float (cpu, vn, i) \
4657 CMP \
4658 aarch64_get_vec_float (cpu, vm, i) \
4659 ? -1 : 0); \
4660 } \
4661 return; \
4662 } \
4663 while (0)
4664
4665 static void
4666 do_vec_compare (sim_cpu *cpu)
4667 {
4668 /* instr[31] = 0
4669 instr[30] = half(0)/full(1)
4670 instr[29] = part-of-comparison-type
4671 instr[28,24] = 0 1110
4672 instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4673 type of float compares: single (-0) / double (-1)
4674 instr[21] = 1
4675 instr[20,16] = Vm or 00000 (compare vs 0)
4676 instr[15,10] = part-of-comparison-type
4677 instr[9,5] = Vn
4678 instr[4,0] = Vd. */
4679
4680 int full = INSTR (30, 30);
4681 int size = INSTR (23, 22);
4682 unsigned vm = INSTR (20, 16);
4683 unsigned vn = INSTR (9, 5);
4684 unsigned vd = INSTR (4, 0);
4685 unsigned i;
4686
4687 NYI_assert (28, 24, 0x0E);
4688 NYI_assert (21, 21, 1);
4689
4690 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4691 if ((INSTR (11, 11)
4692 && INSTR (14, 14))
4693 || ((INSTR (11, 11) == 0
4694 && INSTR (10, 10) == 0)))
4695 {
4696 /* A compare vs 0. */
4697 if (vm != 0)
4698 {
4699 if (INSTR (15, 10) == 0x2A)
4700 do_vec_maxv (cpu);
4701 else if (INSTR (15, 10) == 0x32
4702 || INSTR (15, 10) == 0x3E)
4703 do_vec_fminmaxV (cpu);
4704 else if (INSTR (29, 23) == 0x1C
4705 && INSTR (21, 10) == 0x876)
4706 do_vec_SCVTF (cpu);
4707 else
4708 HALT_NYI;
4709 return;
4710 }
4711 }
4712
4713 if (INSTR (14, 14))
4714 {
4715 /* A floating point compare. */
4716 unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4717 | INSTR (13, 10);
4718
4719 NYI_assert (15, 15, 1);
4720
4721 switch (decode)
4722 {
4723 case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4724 case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4725 case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4726 case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4727 case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4728 case /* 0b111001: GT */ 0x39: VEC_FCMP (>);
4729 case /* 0b101001: GE */ 0x29: VEC_FCMP (>=);
4730 case /* 0b001001: EQ */ 0x09: VEC_FCMP (==);
4731
4732 default:
4733 HALT_NYI;
4734 }
4735 }
4736 else
4737 {
4738 unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4739
4740 switch (decode)
4741 {
4742 case 0x0D: /* 0001101 GT */ VEC_CMP (s, > );
4743 case 0x0F: /* 0001111 GE */ VEC_CMP (s, >= );
4744 case 0x22: /* 0100010 GT #0 */ VEC_CMP0 (s, > );
4745 case 0x23: /* 0100011 TST */ VEC_CMP (u, & );
4746 case 0x26: /* 0100110 EQ #0 */ VEC_CMP0 (s, == );
4747 case 0x2A: /* 0101010 LT #0 */ VEC_CMP0 (s, < );
4748 case 0x4D: /* 1001101 HI */ VEC_CMP (u, > );
4749 case 0x4F: /* 1001111 HS */ VEC_CMP (u, >= );
4750 case 0x62: /* 1100010 GE #0 */ VEC_CMP0 (s, >= );
4751 case 0x63: /* 1100011 EQ */ VEC_CMP (u, == );
4752 case 0x66: /* 1100110 LE #0 */ VEC_CMP0 (s, <= );
4753 default:
4754 if (vm == 0)
4755 HALT_NYI;
4756 do_vec_maxv (cpu);
4757 }
4758 }
4759 }
4760
4761 static void
4762 do_vec_SSHL (sim_cpu *cpu)
4763 {
4764 /* instr[31] = 0
4765 instr[30] = first part (0)/ second part (1)
4766 instr[29,24] = 00 1110
4767 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4768 instr[21] = 1
4769 instr[20,16] = Vm
4770 instr[15,10] = 0100 01
4771 instr[9,5] = Vn
4772 instr[4,0] = Vd. */
4773
4774 unsigned full = INSTR (30, 30);
4775 unsigned vm = INSTR (20, 16);
4776 unsigned vn = INSTR (9, 5);
4777 unsigned vd = INSTR (4, 0);
4778 unsigned i;
4779 signed int shift;
4780
4781 NYI_assert (29, 24, 0x0E);
4782 NYI_assert (21, 21, 1);
4783 NYI_assert (15, 10, 0x11);
4784
4785 /* A negative shift count in Vm selects a signed shift right. */
4786
4787 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4788 switch (INSTR (23, 22))
4789 {
4790 case 0:
4791 for (i = 0; i < (full ? 16 : 8); i++)
4792 {
4793 shift = aarch64_get_vec_s8 (cpu, vm, i);
4794 if (shift >= 0)
4795 aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4796 << shift);
4797 else
4798 aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4799 >> - shift);
4800 }
4801 return;
4802
4803 case 1:
4804 for (i = 0; i < (full ? 8 : 4); i++)
4805 {
4806 shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4807 if (shift >= 0)
4808 aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4809 << shift);
4810 else
4811 aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4812 >> - shift);
4813 }
4814 return;
4815
4816 case 2:
4817 for (i = 0; i < (full ? 4 : 2); i++)
4818 {
4819 shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4820 if (shift >= 0)
4821 aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4822 << shift);
4823 else
4824 aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4825 >> - shift);
4826 }
4827 return;
4828
4829 case 3:
4830 if (! full)
4831 HALT_UNALLOC;
4832 for (i = 0; i < 2; i++)
4833 {
4834 shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4835 if (shift >= 0)
4836 aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4837 << shift);
4838 else
4839 aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4840 >> - shift);
4841 }
4842 return;
4843 }
4844 }
4845
4846 static void
4847 do_vec_USHL (sim_cpu *cpu)
4848 {
4849 /* instr[31] = 0
4850 instr[30] = first part (0)/ second part (1)
4851 instr[29,24] = 10 1110
4852 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4853 instr[21] = 1
4854 instr[20,16] = Vm
4855 instr[15,10] = 0100 01
4856 instr[9,5] = Vn
4857 instr[4,0] = Vd */
4858
4859 unsigned full = INSTR (30, 30);
4860 unsigned vm = INSTR (20, 16);
4861 unsigned vn = INSTR (9, 5);
4862 unsigned vd = INSTR (4, 0);
4863 unsigned i;
4864 signed int shift;
4865
4866 NYI_assert (29, 24, 0x2E);
4867 NYI_assert (15, 10, 0x11);
4868
4869 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4870 switch (INSTR (23, 22))
4871 {
4872 case 0:
4873 for (i = 0; i < (full ? 16 : 8); i++)
4874 {
4875 shift = aarch64_get_vec_s8 (cpu, vm, i);
4876 if (shift >= 0)
4877 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4878 << shift);
4879 else
4880 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4881 >> - shift);
4882 }
4883 return;
4884
4885 case 1:
4886 for (i = 0; i < (full ? 8 : 4); i++)
4887 {
4888 shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4889 if (shift >= 0)
4890 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4891 << shift);
4892 else
4893 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4894 >> - shift);
4895 }
4896 return;
4897
4898 case 2:
4899 for (i = 0; i < (full ? 4 : 2); i++)
4900 {
4901 shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4902 if (shift >= 0)
4903 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4904 << shift);
4905 else
4906 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4907 >> - shift);
4908 }
4909 return;
4910
4911 case 3:
4912 if (! full)
4913 HALT_UNALLOC;
4914 for (i = 0; i < 2; i++)
4915 {
4916 shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4917 if (shift >= 0)
4918 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4919 << shift);
4920 else
4921 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4922 >> - shift);
4923 }
4924 return;
4925 }
4926 }
4927
4928 static void
4929 do_vec_FMLA (sim_cpu *cpu)
4930 {
4931 /* instr[31] = 0
4932 instr[30] = full/half selector
4933 instr[29,23] = 0011100
4934 instr[22] = size: 0=>float, 1=>double
4935 instr[21] = 1
4936 instr[20,16] = Vm
4937 instr[15,10] = 1100 11
4938 instr[9,5] = Vn
4939 instr[4,0] = Vd. */
4940
4941 unsigned vm = INSTR (20, 16);
4942 unsigned vn = INSTR (9, 5);
4943 unsigned vd = INSTR (4, 0);
4944 unsigned i;
4945 int full = INSTR (30, 30);
4946
4947 NYI_assert (29, 23, 0x1C);
4948 NYI_assert (21, 21, 1);
4949 NYI_assert (15, 10, 0x33);
4950
4951 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4952 if (INSTR (22, 22))
4953 {
4954 if (! full)
4955 HALT_UNALLOC;
4956 for (i = 0; i < 2; i++)
4957 aarch64_set_vec_double (cpu, vd, i,
4958 aarch64_get_vec_double (cpu, vn, i) *
4959 aarch64_get_vec_double (cpu, vm, i) +
4960 aarch64_get_vec_double (cpu, vd, i));
4961 }
4962 else
4963 {
4964 for (i = 0; i < (full ? 4 : 2); i++)
4965 aarch64_set_vec_float (cpu, vd, i,
4966 aarch64_get_vec_float (cpu, vn, i) *
4967 aarch64_get_vec_float (cpu, vm, i) +
4968 aarch64_get_vec_float (cpu, vd, i));
4969 }
4970 }
4971
4972 static void
4973 do_vec_max (sim_cpu *cpu)
4974 {
4975 /* instr[31] = 0
4976 instr[30] = full/half selector
4977 instr[29] = SMAX (0) / UMAX (1)
4978 instr[28,24] = 0 1110
4979 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
4980 instr[21] = 1
4981 instr[20,16] = Vm
4982 instr[15,10] = 0110 01
4983 instr[9,5] = Vn
4984 instr[4,0] = Vd. */
4985
4986 unsigned vm = INSTR (20, 16);
4987 unsigned vn = INSTR (9, 5);
4988 unsigned vd = INSTR (4, 0);
4989 unsigned i;
4990 int full = INSTR (30, 30);
4991
4992 NYI_assert (28, 24, 0x0E);
4993 NYI_assert (21, 21, 1);
4994 NYI_assert (15, 10, 0x19);
4995
4996 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4997 if (INSTR (29, 29))
4998 {
4999 switch (INSTR (23, 22))
5000 {
5001 case 0:
5002 for (i = 0; i < (full ? 16 : 8); i++)
5003 aarch64_set_vec_u8 (cpu, vd, i,
5004 aarch64_get_vec_u8 (cpu, vn, i)
5005 > aarch64_get_vec_u8 (cpu, vm, i)
5006 ? aarch64_get_vec_u8 (cpu, vn, i)
5007 : aarch64_get_vec_u8 (cpu, vm, i));
5008 return;
5009
5010 case 1:
5011 for (i = 0; i < (full ? 8 : 4); i++)
5012 aarch64_set_vec_u16 (cpu, vd, i,
5013 aarch64_get_vec_u16 (cpu, vn, i)
5014 > aarch64_get_vec_u16 (cpu, vm, i)
5015 ? aarch64_get_vec_u16 (cpu, vn, i)
5016 : aarch64_get_vec_u16 (cpu, vm, i));
5017 return;
5018
5019 case 2:
5020 for (i = 0; i < (full ? 4 : 2); i++)
5021 aarch64_set_vec_u32 (cpu, vd, i,
5022 aarch64_get_vec_u32 (cpu, vn, i)
5023 > aarch64_get_vec_u32 (cpu, vm, i)
5024 ? aarch64_get_vec_u32 (cpu, vn, i)
5025 : aarch64_get_vec_u32 (cpu, vm, i));
5026 return;
5027
5028 case 3:
5029 HALT_UNALLOC;
5030 }
5031 }
5032 else
5033 {
5034 switch (INSTR (23, 22))
5035 {
5036 case 0:
5037 for (i = 0; i < (full ? 16 : 8); i++)
5038 aarch64_set_vec_s8 (cpu, vd, i,
5039 aarch64_get_vec_s8 (cpu, vn, i)
5040 > aarch64_get_vec_s8 (cpu, vm, i)
5041 ? aarch64_get_vec_s8 (cpu, vn, i)
5042 : aarch64_get_vec_s8 (cpu, vm, i));
5043 return;
5044
5045 case 1:
5046 for (i = 0; i < (full ? 8 : 4); i++)
5047 aarch64_set_vec_s16 (cpu, vd, i,
5048 aarch64_get_vec_s16 (cpu, vn, i)
5049 > aarch64_get_vec_s16 (cpu, vm, i)
5050 ? aarch64_get_vec_s16 (cpu, vn, i)
5051 : aarch64_get_vec_s16 (cpu, vm, i));
5052 return;
5053
5054 case 2:
5055 for (i = 0; i < (full ? 4 : 2); i++)
5056 aarch64_set_vec_s32 (cpu, vd, i,
5057 aarch64_get_vec_s32 (cpu, vn, i)
5058 > aarch64_get_vec_s32 (cpu, vm, i)
5059 ? aarch64_get_vec_s32 (cpu, vn, i)
5060 : aarch64_get_vec_s32 (cpu, vm, i));
5061 return;
5062
5063 case 3:
5064 HALT_UNALLOC;
5065 }
5066 }
5067 }
5068
5069 static void
5070 do_vec_min (sim_cpu *cpu)
5071 {
5072 /* instr[31] = 0
5073 instr[30] = full/half selector
5074 instr[29] = SMIN (0) / UMIN (1)
5075 instr[28,24] = 0 1110
5076 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5077 instr[21] = 1
5078 instr[20,16] = Vm
5079 instr[15,10] = 0110 11
5080 instr[9,5] = Vn
5081 instr[4,0] = Vd. */
5082
5083 unsigned vm = INSTR (20, 16);
5084 unsigned vn = INSTR (9, 5);
5085 unsigned vd = INSTR (4, 0);
5086 unsigned i;
5087 int full = INSTR (30, 30);
5088
5089 NYI_assert (28, 24, 0x0E);
5090 NYI_assert (21, 21, 1);
5091 NYI_assert (15, 10, 0x1B);
5092
5093 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5094 if (INSTR (29, 29))
5095 {
5096 switch (INSTR (23, 22))
5097 {
5098 case 0:
5099 for (i = 0; i < (full ? 16 : 8); i++)
5100 aarch64_set_vec_u8 (cpu, vd, i,
5101 aarch64_get_vec_u8 (cpu, vn, i)
5102 < aarch64_get_vec_u8 (cpu, vm, i)
5103 ? aarch64_get_vec_u8 (cpu, vn, i)
5104 : aarch64_get_vec_u8 (cpu, vm, i));
5105 return;
5106
5107 case 1:
5108 for (i = 0; i < (full ? 8 : 4); i++)
5109 aarch64_set_vec_u16 (cpu, vd, i,
5110 aarch64_get_vec_u16 (cpu, vn, i)
5111 < aarch64_get_vec_u16 (cpu, vm, i)
5112 ? aarch64_get_vec_u16 (cpu, vn, i)
5113 : aarch64_get_vec_u16 (cpu, vm, i));
5114 return;
5115
5116 case 2:
5117 for (i = 0; i < (full ? 4 : 2); i++)
5118 aarch64_set_vec_u32 (cpu, vd, i,
5119 aarch64_get_vec_u32 (cpu, vn, i)
5120 < aarch64_get_vec_u32 (cpu, vm, i)
5121 ? aarch64_get_vec_u32 (cpu, vn, i)
5122 : aarch64_get_vec_u32 (cpu, vm, i));
5123 return;
5124
5125 case 3:
5126 HALT_UNALLOC;
5127 }
5128 }
5129 else
5130 {
5131 switch (INSTR (23, 22))
5132 {
5133 case 0:
5134 for (i = 0; i < (full ? 16 : 8); i++)
5135 aarch64_set_vec_s8 (cpu, vd, i,
5136 aarch64_get_vec_s8 (cpu, vn, i)
5137 < aarch64_get_vec_s8 (cpu, vm, i)
5138 ? aarch64_get_vec_s8 (cpu, vn, i)
5139 : aarch64_get_vec_s8 (cpu, vm, i));
5140 return;
5141
5142 case 1:
5143 for (i = 0; i < (full ? 8 : 4); i++)
5144 aarch64_set_vec_s16 (cpu, vd, i,
5145 aarch64_get_vec_s16 (cpu, vn, i)
5146 < aarch64_get_vec_s16 (cpu, vm, i)
5147 ? aarch64_get_vec_s16 (cpu, vn, i)
5148 : aarch64_get_vec_s16 (cpu, vm, i));
5149 return;
5150
5151 case 2:
5152 for (i = 0; i < (full ? 4 : 2); i++)
5153 aarch64_set_vec_s32 (cpu, vd, i,
5154 aarch64_get_vec_s32 (cpu, vn, i)
5155 < aarch64_get_vec_s32 (cpu, vm, i)
5156 ? aarch64_get_vec_s32 (cpu, vn, i)
5157 : aarch64_get_vec_s32 (cpu, vm, i));
5158 return;
5159
5160 case 3:
5161 HALT_UNALLOC;
5162 }
5163 }
5164 }
5165
5166 static void
5167 do_vec_sub_long (sim_cpu *cpu)
5168 {
5169 /* instr[31] = 0
5170 instr[30] = lower (0) / upper (1)
5171 instr[29] = signed (0) / unsigned (1)
5172 instr[28,24] = 0 1110
5173 instr[23,22] = size: bytes (00), half (01), word (10)
5174 instr[21] = 1
5175 instr[20,16] = Vm
5176 instr[15,10] = 0010 00
5177 instr[9,5] = Vn
5178 instr[4,0] = V dest. */
5179
5180 unsigned size = INSTR (23, 22);
5181 unsigned vm = INSTR (20, 16);
5182 unsigned vn = INSTR (9, 5);
5183 unsigned vd = INSTR (4, 0);
5184 unsigned bias = 0;
5185 unsigned i;
5186
5187 NYI_assert (28, 24, 0x0E);
5188 NYI_assert (21, 21, 1);
5189 NYI_assert (15, 10, 0x08);
5190
5191 if (size == 3)
5192 HALT_UNALLOC;
5193
5194 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5195 switch (INSTR (30, 29))
5196 {
5197 case 2: /* SSUBL2. */
5198 bias = 2;
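/* Fall through. */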
5199 case 0: /* SSUBL. */
5200 switch (size)
5201 {
5202 case 0:
5203 bias *= 4;
5204 for (i = 0; i < 8; i++)
5205 aarch64_set_vec_s16 (cpu, vd, i,
5206 aarch64_get_vec_s8 (cpu, vn, i + bias)
5207 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5208 break;
5209
5210 case 1:
5211 bias *= 2;
5212 for (i = 0; i < 4; i++)
5213 aarch64_set_vec_s32 (cpu, vd, i,
5214 aarch64_get_vec_s16 (cpu, vn, i + bias)
5215 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5216 break;
5217
5218 case 2:
5219 for (i = 0; i < 2; i++)
5220 aarch64_set_vec_s64 (cpu, vd, i,
5221 aarch64_get_vec_s32 (cpu, vn, i + bias)
5222 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5223 break;
5224
5225 default:
5226 HALT_UNALLOC;
5227 }
5228 break;
5229
5230 case 3: /* USUBL2. */
5231 bias = 2;
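/* Fall through. */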
5232 case 1: /* USUBL. */
5233 switch (size)
5234 {
5235 case 0:
5236 bias *= 4;
5237 for (i = 0; i < 8; i++)
5238 aarch64_set_vec_u16 (cpu, vd, i,
5239 aarch64_get_vec_u8 (cpu, vn, i + bias)
5240 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5241 break;
5242
5243 case 1:
5244 bias *= 2;
5245 for (i = 0; i < 4; i++)
5246 aarch64_set_vec_u32 (cpu, vd, i,
5247 aarch64_get_vec_u16 (cpu, vn, i + bias)
5248 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5249 break;
5250
5251 case 2:
5252 for (i = 0; i < 2; i++)
5253 aarch64_set_vec_u64 (cpu, vd, i,
5254 aarch64_get_vec_u32 (cpu, vn, i + bias)
5255 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5256 break;
5257
5258 default:
5259 HALT_UNALLOC;
5260 }
5261 break;
5262 }
5263 }
5264
5265 static void
5266 do_vec_ADDP (sim_cpu *cpu)
5267 {
5268 /* instr[31] = 0
5269 instr[30] = half(0)/full(1)
5270 instr[29,24] = 00 1110
5271 instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5272 instr[21] = 1
5273 instr[20,16] = Vm
5274 instr[15,10] = 1011 11
5275 instr[9,5] = Vn
5276 instr[4,0] = V dest. */
5277
5278 FRegister copy_vn;
5279 FRegister copy_vm;
5280 unsigned full = INSTR (30, 30);
5281 unsigned size = INSTR (23, 22);
5282 unsigned vm = INSTR (20, 16);
5283 unsigned vn = INSTR (9, 5);
5284 unsigned vd = INSTR (4, 0);
5285 unsigned i, range;
5286
5287 NYI_assert (29, 24, 0x0E);
5288 NYI_assert (21, 21, 1);
5289 NYI_assert (15, 10, 0x2F);
5290
5291 /* Make copies of the source registers in case vd == vn/vm. */
5292 copy_vn = cpu->fr[vn];
5293 copy_vm = cpu->fr[vm];
5294
5295 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5296 switch (size)
5297 {
5298 case 0:
5299 range = full ? 8 : 4;
5300 for (i = 0; i < range; i++)
5301 {
5302 aarch64_set_vec_u8 (cpu, vd, i,
5303 copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5304 aarch64_set_vec_u8 (cpu, vd, i + range,
5305 copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5306 }
5307 return;
5308
5309 case 1:
5310 range = full ? 4 : 2;
5311 for (i = 0; i < range; i++)
5312 {
5313 aarch64_set_vec_u16 (cpu, vd, i,
5314 copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5315 aarch64_set_vec_u16 (cpu, vd, i + range,
5316 copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5317 }
5318 return;
5319
5320 case 2:
5321 range = full ? 2 : 1;
5322 for (i = 0; i < range; i++)
5323 {
5324 aarch64_set_vec_u32 (cpu, vd, i,
5325 copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5326 aarch64_set_vec_u32 (cpu, vd, i + range,
5327 copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5328 }
5329 return;
5330
5331 case 3:
5332 if (! full)
5333 HALT_UNALLOC;
5334 aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5335 aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5336 return;
5337 }
5338 }
5339
5340 static void
5341 do_vec_UMOV (sim_cpu *cpu)
5342 {
5343 /* instr[31] = 0
5344 instr[30] = 32-bit(0)/64-bit(1)
5345 instr[29,21] = 00 1110 000
5346 instr[20,16] = size & index
5347 instr[15,10] = 0011 11
5348 instr[9,5] = V source
5349 instr[4,0] = R dest. */
5350
5351 unsigned vs = INSTR (9, 5);
5352 unsigned rd = INSTR (4, 0);
5353 unsigned index;
5354
5355 NYI_assert (29, 21, 0x070);
5356 NYI_assert (15, 10, 0x0F);
5357
5358 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
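/* In the size & index (imm5) field the position of the lowest set bit
selects the element size and the bits above it hold the element
index. */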
5359 if (INSTR (16, 16))
5360 {
5361 /* Byte transfer. */
5362 index = INSTR (20, 17);
5363 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5364 aarch64_get_vec_u8 (cpu, vs, index));
5365 }
5366 else if (INSTR (17, 17))
5367 {
5368 index = INSTR (20, 18);
5369 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5370 aarch64_get_vec_u16 (cpu, vs, index));
5371 }
5372 else if (INSTR (18, 18))
5373 {
5374 index = INSTR (20, 19);
5375 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5376 aarch64_get_vec_u32 (cpu, vs, index));
5377 }
5378 else
5379 {
5380 if (INSTR (30, 30) != 1)
5381 HALT_UNALLOC;
5382
5383 index = INSTR (20, 20);
5384 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5385 aarch64_get_vec_u64 (cpu, vs, index));
5386 }
5387 }
5388
5389 static void
5390 do_vec_FABS (sim_cpu *cpu)
5391 {
5392 /* instr[31] = 0
5393 instr[30] = half(0)/full(1)
5394 instr[29,23] = 00 1110 1
5395 instr[22] = float(0)/double(1)
5396 instr[21,16] = 10 0000
5397 instr[15,10] = 1111 10
5398 instr[9,5] = Vn
5399 instr[4,0] = Vd. */
5400
5401 unsigned vn = INSTR (9, 5);
5402 unsigned vd = INSTR (4, 0);
5403 unsigned full = INSTR (30, 30);
5404 unsigned i;
5405
5406 NYI_assert (29, 23, 0x1D);
5407 NYI_assert (21, 10, 0x83E);
5408
5409 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5410 if (INSTR (22, 22))
5411 {
5412 if (! full)
5413 HALT_NYI;
5414
5415 for (i = 0; i < 2; i++)
5416 aarch64_set_vec_double (cpu, vd, i,
5417 fabs (aarch64_get_vec_double (cpu, vn, i)));
5418 }
5419 else
5420 {
5421 for (i = 0; i < (full ? 4 : 2); i++)
5422 aarch64_set_vec_float (cpu, vd, i,
5423 fabsf (aarch64_get_vec_float (cpu, vn, i)));
5424 }
5425 }
5426
5427 static void
5428 do_vec_FCVTZS (sim_cpu *cpu)
5429 {
5430 /* instr[31] = 0
5431 instr[30] = half (0) / all (1)
5432 instr[29,23] = 00 1110 1
5433 instr[22] = single (0) / double (1)
5434 instr[21,10] = 10 0001 1011 10
5435 instr[9,5] = Rn
5436 instr[4,0] = Rd. */
5437
5438 unsigned rn = INSTR (9, 5);
5439 unsigned rd = INSTR (4, 0);
5440 unsigned full = INSTR (30, 30);
5441 unsigned i;
5442
5443 NYI_assert (31, 31, 0);
5444 NYI_assert (29, 23, 0x1D);
5445 NYI_assert (21, 10, 0x86E);
5446
5447 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5448 if (INSTR (22, 22))
5449 {
5450 if (! full)
5451 HALT_UNALLOC;
5452
5453 for (i = 0; i < 2; i++)
5454 aarch64_set_vec_s64 (cpu, rd, i,
5455 (int64_t) aarch64_get_vec_double (cpu, rn, i));
5456 }
5457 else
5458 for (i = 0; i < (full ? 4 : 2); i++)
5459 aarch64_set_vec_s32 (cpu, rd, i,
5460 (int32_t) aarch64_get_vec_float (cpu, rn, i));
5461 }
5462
5463 static void
5464 do_vec_REV64 (sim_cpu *cpu)
5465 {
5466 /* instr[31] = 0
5467 instr[30] = full/half
5468 instr[29,24] = 00 1110
5469 instr[23,22] = size
5470 instr[21,10] = 10 0000 0000 10
5471 instr[9,5] = Rn
5472 instr[4,0] = Rd. */
5473
5474 unsigned rn = INSTR (9, 5);
5475 unsigned rd = INSTR (4, 0);
5476 unsigned size = INSTR (23, 22);
5477 unsigned full = INSTR (30, 30);
5478 unsigned i;
5479 FRegister val;
5480
5481 NYI_assert (29, 24, 0x0E);
5482 NYI_assert (21, 10, 0x802);
5483
5484 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5485 switch (size)
5486 {
5487 case 0:
5488 for (i = 0; i < (full ? 16 : 8); i++)
5489 val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5490 break;
5491
5492 case 1:
5493 for (i = 0; i < (full ? 8 : 4); i++)
5494 val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5495 break;
5496
5497 case 2:
5498 for (i = 0; i < (full ? 4 : 2); i++)
5499 val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5500 break;
5501
5502 case 3:
5503 HALT_UNALLOC;
5504 }
5505
5506 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5507 if (full)
5508 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5509 }
5510
5511 static void
5512 do_vec_REV16 (sim_cpu *cpu)
5513 {
5514 /* instr[31] = 0
5515 instr[30] = full/half
5516 instr[29,24] = 00 1110
5517 instr[23,22] = size
5518 instr[21,10] = 10 0000 0001 10
5519 instr[9,5] = Rn
5520 instr[4,0] = Rd. */
5521
5522 unsigned rn = INSTR (9, 5);
5523 unsigned rd = INSTR (4, 0);
5524 unsigned size = INSTR (23, 22);
5525 unsigned full = INSTR (30, 30);
5526 unsigned i;
5527 FRegister val;
5528
5529 NYI_assert (29, 24, 0x0E);
5530 NYI_assert (21, 10, 0x806);
5531
5532 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5533 switch (size)
5534 {
5535 case 0:
5536 for (i = 0; i < (full ? 16 : 8); i++)
5537 val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5538 break;
5539
5540 default:
5541 HALT_UNALLOC;
5542 }
5543
5544 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5545 if (full)
5546 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5547 }
5548
5549 static void
5550 do_vec_op1 (sim_cpu *cpu)
5551 {
5552 /* instr[31] = 0
5553 instr[30] = half/full
5554 instr[29,24] = 00 1110
5555 instr[23,21] = ???
5556 instr[20,16] = Vm
5557 instr[15,10] = sub-opcode
5558 instr[9,5] = Vn
5559 instr[4,0] = Vd */
5560 NYI_assert (29, 24, 0x0E);
5561
5562 if (INSTR (21, 21) == 0)
5563 {
5564 if (INSTR (23, 22) == 0)
5565 {
5566 if (INSTR (30, 30) == 1
5567 && INSTR (17, 14) == 0
5568 && INSTR (12, 10) == 7)
5569 return do_vec_ins_2 (cpu);
5570
5571 switch (INSTR (15, 10))
5572 {
5573 case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5574 case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5575 case 0x07: do_vec_INS (cpu); return;
5576 case 0x0A: do_vec_TRN (cpu); return;
5577
5578 case 0x0F:
5579 if (INSTR (17, 16) == 0)
5580 {
5581 do_vec_MOV_into_scalar (cpu);
5582 return;
5583 }
5584 break;
5585
5586 case 0x00:
5587 case 0x08:
5588 case 0x10:
5589 case 0x18:
5590 do_vec_TBL (cpu); return;
5591
5592 case 0x06:
5593 case 0x16:
5594 do_vec_UZP (cpu); return;
5595
5596 case 0x0E:
5597 case 0x1E:
5598 do_vec_ZIP (cpu); return;
5599
5600 default:
5601 HALT_NYI;
5602 }
5603 }
5604
5605 switch (INSTR (13, 10))
5606 {
5607 case 0x6: do_vec_UZP (cpu); return;
5608 case 0xE: do_vec_ZIP (cpu); return;
5609 case 0xA: do_vec_TRN (cpu); return;
5610 case 0xF: do_vec_UMOV (cpu); return;
5611 default: HALT_NYI;
5612 }
5613 }
5614
5615 switch (INSTR (15, 10))
5616 {
5617 case 0x02: do_vec_REV64 (cpu); return;
5618 case 0x06: do_vec_REV16 (cpu); return;
5619
5620 case 0x07:
5621 switch (INSTR (23, 21))
5622 {
5623 case 1: do_vec_AND (cpu); return;
5624 case 3: do_vec_BIC (cpu); return;
5625 case 5: do_vec_ORR (cpu); return;
5626 case 7: do_vec_ORN (cpu); return;
5627 default: HALT_NYI;
5628 }
5629
5630 case 0x08: do_vec_sub_long (cpu); return;
5631 case 0x0a: do_vec_XTN (cpu); return;
5632 case 0x11: do_vec_SSHL (cpu); return;
5633 case 0x19: do_vec_max (cpu); return;
5634 case 0x1B: do_vec_min (cpu); return;
5635 case 0x21: do_vec_add (cpu); return;
5636 case 0x25: do_vec_MLA (cpu); return;
5637 case 0x27: do_vec_mul (cpu); return;
5638 case 0x2F: do_vec_ADDP (cpu); return;
5639 case 0x30: do_vec_mull (cpu); return;
5640 case 0x33: do_vec_FMLA (cpu); return;
5641 case 0x35: do_vec_fadd (cpu); return;
5642
5643 case 0x2E:
5644 switch (INSTR (20, 16))
5645 {
5646 case 0x00: do_vec_ABS (cpu); return;
5647 case 0x01: do_vec_FCVTZS (cpu); return;
5648 case 0x11: do_vec_ADDV (cpu); return;
5649 default: HALT_NYI;
5650 }
5651
5652 case 0x31:
5653 case 0x3B:
5654 do_vec_Fminmax (cpu); return;
5655
5656 case 0x0D:
5657 case 0x0F:
5658 case 0x22:
5659 case 0x23:
5660 case 0x26:
5661 case 0x2A:
5662 case 0x32:
5663 case 0x36:
5664 case 0x39:
5665 case 0x3A:
5666 do_vec_compare (cpu); return;
5667
5668 case 0x3E:
5669 do_vec_FABS (cpu); return;
5670
5671 default:
5672 HALT_NYI;
5673 }
5674 }
5675
5676 static void
5677 do_vec_xtl (sim_cpu *cpu)
5678 {
5679 /* instr[31] = 0
5680 instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5681 instr[28,22] = 0 1111 00
5682 instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5683 instr[15,10] = 1010 01
5684 instr[9,5] = V source
5685 instr[4,0] = V dest. */
5686
5687 unsigned vs = INSTR (9, 5);
5688 unsigned vd = INSTR (4, 0);
5689 unsigned i, shift, bias = 0;
5690
5691 NYI_assert (28, 22, 0x3C);
5692 NYI_assert (15, 10, 0x29);
5693
5694 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
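/* Bits [21,16] hold immh:immb: the position of the leading one in
bits [21,19] selects the source element size and the bits below it
hold the left shift amount. The XTL aliases are simply SHLL with a
zero shift. */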
5695 switch (INSTR (30, 29))
5696 {
5697 case 2: /* SXTL2, SSHLL2. */
5698 bias = 2;
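/* Fall through. */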
5699 case 0: /* SXTL, SSHLL. */
5700 if (INSTR (21, 21))
5701 {
5702 int64_t val1, val2;
5703
5704 shift = INSTR (20, 16);
5705 /* Get the source values before setting the destination values
5706 in case the source and destination are the same. */
5707 val1 = aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5708 val2 = aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5709 aarch64_set_vec_s64 (cpu, vd, 0, val1);
5710 aarch64_set_vec_s64 (cpu, vd, 1, val2);
5711 }
5712 else if (INSTR (20, 20))
5713 {
5714 int32_t v[4];
5716
5717 shift = INSTR (19, 16);
5718 bias *= 2;
5719 for (i = 0; i < 4; i++)
5720 v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5721 for (i = 0; i < 4; i++)
5722 aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5723 }
5724 else
5725 {
5726 int16_t v[8];
5727 NYI_assert (19, 19, 1);
5728
5729 shift = INSTR (18, 16);
5730 bias *= 4;
5731 for (i = 0; i < 8; i++)
5732 v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5733 for (i = 0; i < 8; i++)
5734 aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5735 }
5736 return;
5737
5738 case 3: /* UXTL2, USHLL2. */
5739 bias = 2;
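/* Fall through. */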
5740 case 1: /* UXTL, USHLL. */
5741 if (INSTR (21, 21))
5742 {
5743 uint64_t v1, v2;
5744 shift = INSTR (20, 16);
5745 v1 = aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5746 v2 = aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5747 aarch64_set_vec_u64 (cpu, vd, 0, v1);
5748 aarch64_set_vec_u64 (cpu, vd, 1, v2);
5749 }
5750 else if (INSTR (20, 20))
5751 {
5752 uint32_t v[4];
5753 shift = INSTR (19, 16);
5754 bias *= 2;
5755 for (i = 0; i < 4; i++)
5756 v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5757 for (i = 0; i < 4; i++)
5758 aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5759 }
5760 else
5761 {
5762 uint16_t v[8];
5763 NYI_assert (19, 19, 1);
5764
5765 shift = INSTR (18, 16);
5766 bias *= 4;
5767 for (i = 0; i < 8; i++)
5768 v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5769 for (i = 0; i < 8; i++)
5770 aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5771 }
5772 return;
5773 }
5774 }
5775
5776 static void
5777 do_vec_SHL (sim_cpu *cpu)
5778 {
5779 /* instr [31] = 0
5780 instr [30] = half(0)/full(1)
5781 instr [29,23] = 001 1110
5782 instr [22,16] = size and shift amount
5783 instr [15,10] = 01 0101
5784 instr [9, 5] = Vs
5785 instr [4, 0] = Vd. */
5786
5787 int shift;
5788 int full = INSTR (30, 30);
5789 unsigned vs = INSTR (9, 5);
5790 unsigned vd = INSTR (4, 0);
5791 unsigned i;
5792
5793 NYI_assert (29, 23, 0x1E);
5794 NYI_assert (15, 10, 0x15);
5795
5796 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
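/* Bits [22,16] hold immh:immb: the position of the leading one selects
the element size and shift = immh:immb - esize, which is why the
shift amount is read from progressively fewer low bits below. */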
5797 if (INSTR (22, 22))
5798 {
5799 shift = INSTR (21, 16);
5800
5801 if (full == 0)
5802 HALT_UNALLOC;
5803
5804 for (i = 0; i < 2; i++)
5805 {
5806 uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5807 aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5808 }
5809
5810 return;
5811 }
5812
5813 if (INSTR (21, 21))
5814 {
5815 shift = INSTR (20, 16);
5816
5817 for (i = 0; i < (full ? 4 : 2); i++)
5818 {
5819 uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5820 aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5821 }
5822
5823 return;
5824 }
5825
5826 if (INSTR (20, 20))
5827 {
5828 shift = INSTR (19, 16);
5829
5830 for (i = 0; i < (full ? 8 : 4); i++)
5831 {
5832 uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5833 aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5834 }
5835
5836 return;
5837 }
5838
5839 if (INSTR (19, 19) == 0)
5840 HALT_UNALLOC;
5841
5842 shift = INSTR (18, 16);
5843
5844 for (i = 0; i < (full ? 16 : 8); i++)
5845 {
5846 uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5847 aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5848 }
5849 }
5850
5851 static void
5852 do_vec_SSHR_USHR (sim_cpu *cpu)
5853 {
5854 /* instr [31] = 0
5855 instr [30] = half(0)/full(1)
5856 instr [29] = signed(0)/unsigned(1)
5857 instr [28,23] = 0 1111 0
5858 instr [22,16] = size and shift amount
5859 instr [15,10] = 0000 01
5860 instr [9, 5] = Vs
5861 instr [4, 0] = Vd. */
5862
5863 int full = INSTR (30, 30);
5864 int sign = ! INSTR (29, 29);
5865 unsigned shift = INSTR (22, 16);
5866 unsigned vs = INSTR (9, 5);
5867 unsigned vd = INSTR (4, 0);
5868 unsigned i;
5869
5870 NYI_assert (28, 23, 0x1E);
5871 NYI_assert (15, 10, 0x01);
5872
5873 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
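/* Right shifts encode the amount as (2 * esize) - immh:immb, hence
the 128/64/32/16 corrections applied below. */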
5874 if (INSTR (22, 22))
5875 {
5876 shift = 128 - shift;
5877
5878 if (full == 0)
5879 HALT_UNALLOC;
5880
5881 if (sign)
5882 for (i = 0; i < 2; i++)
5883 {
5884 int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
5885 aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
5886 }
5887 else
5888 for (i = 0; i < 2; i++)
5889 {
5890 uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5891 aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
5892 }
5893
5894 return;
5895 }
5896
5897 if (INSTR (21, 21))
5898 {
5899 shift = 64 - shift;
5900
5901 if (sign)
5902 for (i = 0; i < (full ? 4 : 2); i++)
5903 {
5904 int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
5905 aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
5906 }
5907 else
5908 for (i = 0; i < (full ? 4 : 2); i++)
5909 {
5910 uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5911 aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
5912 }
5913
5914 return;
5915 }
5916
5917 if (INSTR (20, 20))
5918 {
5919 shift = 32 - shift;
5920
5921 if (sign)
5922 for (i = 0; i < (full ? 8 : 4); i++)
5923 {
5924 int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
5925 aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
5926 }
5927 else
5928 for (i = 0; i < (full ? 8 : 4); i++)
5929 {
5930 uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5931 aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
5932 }
5933
5934 return;
5935 }
5936
5937 if (INSTR (19, 19) == 0)
5938 HALT_UNALLOC;
5939
5940 shift = 16 - shift;
5941
5942 if (sign)
5943 for (i = 0; i < (full ? 16 : 8); i++)
5944 {
5945 int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
5946 aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
5947 }
5948 else
5949 for (i = 0; i < (full ? 16 : 8); i++)
5950 {
5951 uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5952 aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
5953 }
5954 }
5955
5956 static void
5957 do_vec_MUL_by_element (sim_cpu *cpu)
5958 {
5959 /* instr[31] = 0
5960 instr[30] = half/full
5961 instr[29,24] = 00 1111
5962 instr[23,22] = size
5963 instr[21] = L
5964 instr[20] = M
5965 instr[19,16] = m
5966 instr[15,12] = 1000
5967 instr[11] = H
5968 instr[10] = 0
5969 instr[9,5] = Vn
5970 instr[4,0] = Vd */
5971
5972 unsigned full = INSTR (30, 30);
5973 unsigned L = INSTR (21, 21);
5974 unsigned H = INSTR (11, 11);
5975 unsigned vn = INSTR (9, 5);
5976 unsigned vd = INSTR (4, 0);
5977 unsigned size = INSTR (23, 22);
5978 unsigned index;
5979 unsigned vm;
5980 unsigned e;
5981
5982 NYI_assert (29, 24, 0x0F);
5983 NYI_assert (15, 12, 0x8);
5984 NYI_assert (10, 10, 0);
5985
5986 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
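/* The element index is built from the H, L and M bits: H:L:M for
16-bit elements (with Vm restricted to V0-V15) and H:L for 32-bit
elements, where M instead extends the register number. */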
5987 switch (size)
5988 {
5989 case 1:
5990 {
5991 /* 16 bit products. */
5992 uint16_t product;
5993 uint16_t element1;
5994 uint16_t element2;
5995
5996 index = (H << 2) | (L << 1) | INSTR (20, 20);
5997 vm = INSTR (19, 16);
5998 element2 = aarch64_get_vec_u16 (cpu, vm, index);
5999
6000 for (e = 0; e < (full ? 8 : 4); e ++)
6001 {
6002 element1 = aarch64_get_vec_u16 (cpu, vn, e);
6003 product = element1 * element2;
6004 aarch64_set_vec_u16 (cpu, vd, e, product);
6005 }
6006 }
6007 break;
6008
6009 case 2:
6010 {
6011 /* 32 bit products. */
6012 uint32_t product;
6013 uint32_t element1;
6014 uint32_t element2;
6015
6016 index = (H << 1) | L;
6017 vm = INSTR (20, 16);
6018 element2 = aarch64_get_vec_u32 (cpu, vm, index);
6019
6020 for (e = 0; e < (full ? 4 : 2); e ++)
6021 {
6022 element1 = aarch64_get_vec_u32 (cpu, vn, e);
6023 product = element1 * element2;
6024 aarch64_set_vec_u32 (cpu, vd, e, product);
6025 }
6026 }
6027 break;
6028
6029 default:
6030 HALT_UNALLOC;
6031 }
6032 }
6033
6034 static void
6035 do_FMLA_by_element (sim_cpu *cpu)
6036 {
6037 /* instr[31] = 0
6038 instr[30] = half/full
6039 instr[29,23] = 00 1111 1
6040 instr[22] = size
6041 instr[21] = L
6042 instr[20,16] = m
6043 instr[15,12] = 0001
6044 instr[11] = H
6045 instr[10] = 0
6046 instr[9,5] = Vn
6047 instr[4,0] = Vd */
6048
6049 unsigned full = INSTR (30, 30);
6050 unsigned size = INSTR (22, 22);
6051 unsigned L = INSTR (21, 21);
6052 unsigned vm = INSTR (20, 16);
6053 unsigned H = INSTR (11, 11);
6054 unsigned vn = INSTR (9, 5);
6055 unsigned vd = INSTR (4, 0);
6056 unsigned e;
6057
6058 NYI_assert (29, 23, 0x1F);
6059 NYI_assert (15, 12, 0x1);
6060 NYI_assert (10, 10, 0);
6061
6062 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
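/* The element index is H for the double variant and H:L for the float
variant; L must be zero when double elements are selected. */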
6063 if (size)
6064 {
6065 double element1, element2;
6066
6067 if (! full || L)
6068 HALT_UNALLOC;
6069
6070 element2 = aarch64_get_vec_double (cpu, vm, H);
6071
6072 for (e = 0; e < 2; e++)
6073 {
6074 element1 = aarch64_get_vec_double (cpu, vn, e);
6075 element1 *= element2;
6076 element1 += aarch64_get_vec_double (cpu, vd, e);
6077 aarch64_set_vec_double (cpu, vd, e, element1);
6078 }
6079 }
6080 else
6081 {
6082 float element1;
6083 float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6084
6085 for (e = 0; e < (full ? 4 : 2); e++)
6086 {
6087 element1 = aarch64_get_vec_float (cpu, vn, e);
6088 element1 *= element2;
6089 element1 += aarch64_get_vec_float (cpu, vd, e);
6090 aarch64_set_vec_float (cpu, vd, e, element1);
6091 }
6092 }
6093 }
6094
6095 static void
6096 do_vec_op2 (sim_cpu *cpu)
6097 {
6098 /* instr[31] = 0
6099 instr[30] = half/full
6100 instr[29,24] = 00 1111
6101 instr[23] = ?
6102 instr[22,16] = element size & index
6103 instr[15,10] = sub-opcode
6104 instr[9,5] = Vn
6105 instr[4,0] = Vd */
6106
6107 NYI_assert (29, 24, 0x0F);
6108
6109 if (INSTR (23, 23) != 0)
6110 {
6111 switch (INSTR (15, 10))
6112 {
6113 case 0x04:
6114 case 0x06:
6115 do_FMLA_by_element (cpu);
6116 return;
6117
6118 case 0x20:
6119 case 0x22:
6120 do_vec_MUL_by_element (cpu);
6121 return;
6122
6123 default:
6124 HALT_NYI;
6125 }
6126 }
6127 else
6128 {
6129 switch (INSTR (15, 10))
6130 {
6131 case 0x01: do_vec_SSHR_USHR (cpu); return;
6132 case 0x15: do_vec_SHL (cpu); return;
6133 case 0x20:
6134 case 0x22: do_vec_MUL_by_element (cpu); return;
6135 case 0x29: do_vec_xtl (cpu); return;
6136 default: HALT_NYI;
6137 }
6138 }
6139 }
6140
6141 static void
6142 do_vec_neg (sim_cpu *cpu)
6143 {
6144 /* instr[31] = 0
6145 instr[30] = full(1)/half(0)
6146 instr[29,24] = 10 1110
6147 instr[23,22] = size: byte(00), half (01), word (10), long (11)
6148 instr[21,10] = 1000 0010 1110
6149 instr[9,5] = Vs
6150 instr[4,0] = Vd */
6151
6152 int full = INSTR (30, 30);
6153 unsigned vs = INSTR (9, 5);
6154 unsigned vd = INSTR (4, 0);
6155 unsigned i;
6156
6157 NYI_assert (29, 24, 0x2E);
6158 NYI_assert (21, 10, 0x82E);
6159
6160 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6161 switch (INSTR (23, 22))
6162 {
6163 case 0:
6164 for (i = 0; i < (full ? 16 : 8); i++)
6165 aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6166 return;
6167
6168 case 1:
6169 for (i = 0; i < (full ? 8 : 4); i++)
6170 aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6171 return;
6172
6173 case 2:
6174 for (i = 0; i < (full ? 4 : 2); i++)
6175 aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6176 return;
6177
6178 case 3:
6179 if (! full)
6180 HALT_NYI;
6181 for (i = 0; i < 2; i++)
6182 aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6183 return;
6184 }
6185 }
6186
6187 static void
6188 do_vec_sqrt (sim_cpu *cpu)
6189 {
6190 /* instr[31] = 0
6191 instr[30] = full(1)/half(0)
6192 instr[29,23] = 101 1101
6193 instr[22] = single(0)/double(1)
6194 instr[21,10] = 1000 0111 1110
6195 instr[9,5] = Vs
6196 instr[4,0] = Vd. */
6197
6198 int full = INSTR (30, 30);
6199 unsigned vs = INSTR (9, 5);
6200 unsigned vd = INSTR (4, 0);
6201 unsigned i;
6202
6203 NYI_assert (29, 23, 0x5D);
6204 NYI_assert (21, 10, 0x87E);
6205
6206 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6207 if (INSTR (22, 22) == 0)
6208 for (i = 0; i < (full ? 4 : 2); i++)
6209 aarch64_set_vec_float (cpu, vd, i,
6210 sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6211 else
6212 for (i = 0; i < 2; i++)
6213 aarch64_set_vec_double (cpu, vd, i,
6214 sqrt (aarch64_get_vec_double (cpu, vs, i)));
6215 }
6216
6217 static void
6218 do_vec_mls_indexed (sim_cpu *cpu)
6219 {
6220 /* instr[31] = 0
6221 instr[30] = half(0)/full(1)
6222 instr[29,24] = 10 1111
6223 instr[23,22] = 16-bit(01)/32-bit(10)
6224 instr[11],instr[21,20] = index H:L:M (if 16-bit)
6225 instr[11],instr[21] = index H:L (if 32-bit)
6226 instr[20,16] = Vm
6227 instr[15,12] = 0100
6228 instr[11] = part of index
6229 instr[10] = 0
6230 instr[9,5] = Vs
6231 instr[4,0] = Vd. */
6232
6233 int full = INSTR (30, 30);
6234 unsigned vs = INSTR (9, 5);
6235 unsigned vd = INSTR (4, 0);
6236 unsigned vm = INSTR (20, 16);
6237 unsigned i;
6238
6239 NYI_assert (15, 12, 4);
6240 NYI_assert (10, 10, 0);
6241
6242 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6243 switch (INSTR (23, 22))
6244 {
6245 case 1:
6246 {
6247 unsigned elem;
6248 uint32_t val;
6249
6250 if (vm > 15)
6251 HALT_NYI;
6252
6253 elem = (INSTR (21, 20) << 1) | INSTR (11, 11);
6254 val = aarch64_get_vec_u16 (cpu, vm, elem);
6255
6256 for (i = 0; i < (full ? 8 : 4); i++)
6257 aarch64_set_vec_u32 (cpu, vd, i,
6258 aarch64_get_vec_u32 (cpu, vd, i) -
6259 (aarch64_get_vec_u32 (cpu, vs, i) * val));
6260 return;
6261 }
6262
6263 case 2:
6264 {
6265 unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11);
6266 uint64_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6267
6268 for (i = 0; i < (full ? 4 : 2); i++)
6269 aarch64_set_vec_u64 (cpu, vd, i,
6270 aarch64_get_vec_u64 (cpu, vd, i) -
6271 (aarch64_get_vec_u64 (cpu, vs, i) * val));
6272 return;
6273 }
6274
6275 case 0:
6276 case 3:
6277 default:
6278 HALT_NYI;
6279 }
6280 }
6281
6282 static void
6283 do_vec_SUB (sim_cpu *cpu)
6284 {
6285 /* instr [31] = 0
6286 instr [30] = half(0)/full(1)
6287 instr [29,24] = 10 1110
6288 instr [23,22] = size: byte(00), half(01), word (10), long (11)
6289 instr [21] = 1
6290 instr [20,16] = Vm
6291 instr [15,10] = 10 0001
6292 instr [9, 5] = Vn
6293 instr [4, 0] = Vd. */
6294
6295 unsigned full = INSTR (30, 30);
6296 unsigned vm = INSTR (20, 16);
6297 unsigned vn = INSTR (9, 5);
6298 unsigned vd = INSTR (4, 0);
6299 unsigned i;
6300
6301 NYI_assert (29, 24, 0x2E);
6302 NYI_assert (21, 21, 1);
6303 NYI_assert (15, 10, 0x21);
6304
6305 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6306 switch (INSTR (23, 22))
6307 {
6308 case 0:
6309 for (i = 0; i < (full ? 16 : 8); i++)
6310 aarch64_set_vec_s8 (cpu, vd, i,
6311 aarch64_get_vec_s8 (cpu, vn, i)
6312 - aarch64_get_vec_s8 (cpu, vm, i));
6313 return;
6314
6315 case 1:
6316 for (i = 0; i < (full ? 8 : 4); i++)
6317 aarch64_set_vec_s16 (cpu, vd, i,
6318 aarch64_get_vec_s16 (cpu, vn, i)
6319 - aarch64_get_vec_s16 (cpu, vm, i));
6320 return;
6321
6322 case 2:
6323 for (i = 0; i < (full ? 4 : 2); i++)
6324 aarch64_set_vec_s32 (cpu, vd, i,
6325 aarch64_get_vec_s32 (cpu, vn, i)
6326 - aarch64_get_vec_s32 (cpu, vm, i));
6327 return;
6328
6329 case 3:
6330 if (full == 0)
6331 HALT_UNALLOC;
6332
6333 for (i = 0; i < 2; i++)
6334 aarch64_set_vec_s64 (cpu, vd, i,
6335 aarch64_get_vec_s64 (cpu, vn, i)
6336 - aarch64_get_vec_s64 (cpu, vm, i));
6337 return;
6338 }
6339 }
6340
6341 static void
6342 do_vec_MLS (sim_cpu *cpu)
6343 {
6344 /* instr [31] = 0
6345 instr [30] = half(0)/full(1)
6346 instr [29,24] = 10 1110
6347 instr [23,22] = size: byte(00), half(01), word (10)
6348 instr [21] = 1
6349 instr [20,16] = Vm
6350 instr [15,10] = 10 0101
6351 instr [9, 5] = Vn
6352 instr [4, 0] = Vd. */
6353
6354 unsigned full = INSTR (30, 30);
6355 unsigned vm = INSTR (20, 16);
6356 unsigned vn = INSTR (9, 5);
6357 unsigned vd = INSTR (4, 0);
6358 unsigned i;
6359
6360 NYI_assert (29, 24, 0x2E);
6361 NYI_assert (21, 21, 1);
6362 NYI_assert (15, 10, 0x25);
6363
6364 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6365 switch (INSTR (23, 22))
6366 {
6367 case 0:
6368 for (i = 0; i < (full ? 16 : 8); i++)
6369 aarch64_set_vec_u8 (cpu, vd, i,
6370 aarch64_get_vec_u8 (cpu, vd, i)
6371 - (aarch64_get_vec_u8 (cpu, vn, i)
6372 * aarch64_get_vec_u8 (cpu, vm, i)));
6373 return;
6374
6375 case 1:
6376 for (i = 0; i < (full ? 8 : 4); i++)
6377 aarch64_set_vec_u16 (cpu, vd, i,
6378 aarch64_get_vec_u16 (cpu, vd, i)
6379 - (aarch64_get_vec_u16 (cpu, vn, i)
6380 * aarch64_get_vec_u16 (cpu, vm, i)));
6381 return;
6382
6383 case 2:
6384 for (i = 0; i < (full ? 4 : 2); i++)
6385 aarch64_set_vec_u32 (cpu, vd, i,
6386 aarch64_get_vec_u32 (cpu, vd, i)
6387 - (aarch64_get_vec_u32 (cpu, vn, i)
6388 * aarch64_get_vec_u32 (cpu, vm, i)));
6389 return;
6390
6391 default:
6392 HALT_UNALLOC;
6393 }
6394 }
6395
6396 static void
6397 do_vec_FDIV (sim_cpu *cpu)
6398 {
6399 /* instr [31] = 0
6400 instr [30] = half(0)/full(1)
6401 instr [29,23] = 10 1110 0
6402 instr [22] = float(0)/double(1)
6403 instr [21] = 1
6404 instr [20,16] = Vm
6405 instr [15,10] = 1111 11
6406 instr [9, 5] = Vn
6407 instr [4, 0] = Vd. */
6408
6409 unsigned full = INSTR (30, 30);
6410 unsigned vm = INSTR (20, 16);
6411 unsigned vn = INSTR (9, 5);
6412 unsigned vd = INSTR (4, 0);
6413 unsigned i;
6414
6415 NYI_assert (29, 23, 0x5C);
6416 NYI_assert (21, 21, 1);
6417 NYI_assert (15, 10, 0x3F);
6418
6419 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6420 if (INSTR (22, 22))
6421 {
6422 if (! full)
6423 HALT_UNALLOC;
6424
6425 for (i = 0; i < 2; i++)
6426 aarch64_set_vec_double (cpu, vd, i,
6427 aarch64_get_vec_double (cpu, vn, i)
6428 / aarch64_get_vec_double (cpu, vm, i));
6429 }
6430 else
6431 for (i = 0; i < (full ? 4 : 2); i++)
6432 aarch64_set_vec_float (cpu, vd, i,
6433 aarch64_get_vec_float (cpu, vn, i)
6434 / aarch64_get_vec_float (cpu, vm, i));
6435 }
6436
6437 static void
6438 do_vec_FMUL (sim_cpu *cpu)
6439 {
6440 /* instr [31] = 0
6441 instr [30] = half(0)/full(1)
6442 instr [29,23] = 10 1110 0
6443 instr [22] = float(0)/double(1)
6444 instr [21] = 1
6445 instr [20,16] = Vm
6446 instr [15,10] = 1101 11
6447 instr [9, 5] = Vn
6448 instr [4, 0] = Vd. */
6449
6450 unsigned full = INSTR (30, 30);
6451 unsigned vm = INSTR (20, 16);
6452 unsigned vn = INSTR (9, 5);
6453 unsigned vd = INSTR (4, 0);
6454 unsigned i;
6455
6456 NYI_assert (29, 23, 0x5C);
6457 NYI_assert (21, 21, 1);
6458 NYI_assert (15, 10, 0x37);
6459
6460 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6461 if (INSTR (22, 22))
6462 {
6463 if (! full)
6464 HALT_UNALLOC;
6465
6466 for (i = 0; i < 2; i++)
6467 aarch64_set_vec_double (cpu, vd, i,
6468 aarch64_get_vec_double (cpu, vn, i)
6469 * aarch64_get_vec_double (cpu, vm, i));
6470 }
6471 else
6472 for (i = 0; i < (full ? 4 : 2); i++)
6473 aarch64_set_vec_float (cpu, vd, i,
6474 aarch64_get_vec_float (cpu, vn, i)
6475 * aarch64_get_vec_float (cpu, vm, i));
6476 }
6477
6478 static void
6479 do_vec_FADDP (sim_cpu *cpu)
6480 {
6481 /* instr [31] = 0
6482 instr [30] = half(0)/full(1)
6483 instr [29,23] = 10 1110 0
6484 instr [22] = float(0)/double(1)
6485 instr [21] = 1
6486 instr [20,16] = Vm
6487 instr [15,10] = 1101 01
6488 instr [9, 5] = Vn
6489 instr [4, 0] = Vd. */
6490
6491 unsigned full = INSTR (30, 30);
6492 unsigned vm = INSTR (20, 16);
6493 unsigned vn = INSTR (9, 5);
6494 unsigned vd = INSTR (4, 0);
6495
6496 NYI_assert (29, 23, 0x5C);
6497 NYI_assert (21, 21, 1);
6498 NYI_assert (15, 10, 0x35);
6499
6500 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6501 if (INSTR (22, 22))
6502 {
6503 /* Extract values before adding them in case vd == vn/vm. */
6504 double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6505 double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6506 double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6507 double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6508
6509 if (! full)
6510 HALT_UNALLOC;
6511
6512 aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6513 aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6514 }
6515 else
6516 {
6517 /* Extract values before adding them in case vd == vn/vm. */
6518 float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6519 float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6520 float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6521 float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6522
6523 if (full)
6524 {
6525 float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6526 float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6527 float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6528 float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6529
6530 aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6531 aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6532 aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6533 aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6534 }
6535 else
6536 {
6537 aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6538 aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6539 }
6540 }
6541 }
6542
6543 static void
6544 do_vec_FSQRT (sim_cpu *cpu)
6545 {
6546 /* instr[31] = 0
6547 instr[30] = half(0)/full(1)
6548 instr[29,23] = 10 1110 1
6549 instr[22] = single(0)/double(1)
6550 instr[21,10] = 10 0001 1111 10
6551 instr[9,5] = Vsrc
6552 instr[4,0] = Vdest. */
6553
6554 unsigned vn = INSTR (9, 5);
6555 unsigned vd = INSTR (4, 0);
6556 unsigned full = INSTR (30, 30);
6557 int i;
6558
6559 NYI_assert (29, 23, 0x5D);
6560 NYI_assert (21, 10, 0x87E);
6561
6562 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6563 if (INSTR (22, 22))
6564 {
6565 if (! full)
6566 HALT_UNALLOC;
6567
6568 for (i = 0; i < 2; i++)
6569 aarch64_set_vec_double (cpu, vd, i,
6570 sqrt (aarch64_get_vec_double (cpu, vn, i)));
6571 }
6572 else
6573 {
6574 for (i = 0; i < (full ? 4 : 2); i++)
6575 aarch64_set_vec_float (cpu, vd, i,
6576 sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6577 }
6578 }
6579
6580 static void
6581 do_vec_FNEG (sim_cpu *cpu)
6582 {
6583 /* instr[31] = 0
6584 instr[30] = half (0)/full (1)
6585 instr[29,23] = 10 1110 1
6586 instr[22] = single (0)/double (1)
6587 instr[21,10] = 10 0000 1111 10
6588 instr[9,5] = Vsrc
6589 instr[4,0] = Vdest. */
6590
6591 unsigned vn = INSTR (9, 5);
6592 unsigned vd = INSTR (4, 0);
6593 unsigned full = INSTR (30, 30);
6594 int i;
6595
6596 NYI_assert (29, 23, 0x5D);
6597 NYI_assert (21, 10, 0x83E);
6598
6599 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6600 if (INSTR (22, 22))
6601 {
6602 if (! full)
6603 HALT_UNALLOC;
6604
6605 for (i = 0; i < 2; i++)
6606 aarch64_set_vec_double (cpu, vd, i,
6607 - aarch64_get_vec_double (cpu, vn, i));
6608 }
6609 else
6610 {
6611 for (i = 0; i < (full ? 4 : 2); i++)
6612 aarch64_set_vec_float (cpu, vd, i,
6613 - aarch64_get_vec_float (cpu, vn, i));
6614 }
6615 }
6616
6617 static void
6618 do_vec_NOT (sim_cpu *cpu)
6619 {
6620 /* instr[31] = 0
6621 instr[30] = half (0)/full (1)
6622 instr[29,10] = 10 1110 0010 0000 0101 10
6623 instr[9,5] = Vn
6624 instr[4,0] = Vd. */
6625
6626 unsigned vn = INSTR (9, 5);
6627 unsigned vd = INSTR (4, 0);
6628 unsigned i;
6629 int full = INSTR (30, 30);
6630
6631 NYI_assert (29, 10, 0xB8816);
6632
6633 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6634 for (i = 0; i < (full ? 16 : 8); i++)
6635 aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6636 }
6637
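/* Count the leading zero bits in the least significant SIZE bits of VAL.
   Returns SIZE when VAL is zero, since the mask is exhausted without
   finding a set bit.  */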
6638 static unsigned int
6639 clz (uint64_t val, unsigned size)
6640 {
6641 uint64_t mask = 1;
6642 int count;
6643
6644 mask <<= (size - 1);
6645 count = 0;
6646 do
6647 {
6648 if (val & mask)
6649 break;
6650 mask >>= 1;
6651 count ++;
6652 }
6653 while (mask);
6654
6655 return count;
6656 }
6657
6658 static void
6659 do_vec_CLZ (sim_cpu *cpu)
6660 {
6661 /* instr[31] = 0
6662 instr[30] = half (0)/full (1)
6663 instr[29,24] = 10 1110
6664 instr[23,22] = size
6665 instr[21,10] = 10 0000 0100 10
6666 instr[9,5] = Vn
6667 instr[4,0] = Vd. */
6668
6669 unsigned vn = INSTR (9, 5);
6670 unsigned vd = INSTR (4, 0);
6671 unsigned i;
6672 int full = INSTR (30,30);
6673
6674 NYI_assert (29, 24, 0x2E);
6675 NYI_assert (21, 10, 0x812);
6676
6677 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6678 switch (INSTR (23, 22))
6679 {
6680 case 0:
6681 for (i = 0; i < (full ? 16 : 8); i++)
6682 aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6683 break;
6684 case 1:
6685 for (i = 0; i < (full ? 8 : 4); i++)
6686 aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6687 break;
6688 case 2:
6689 for (i = 0; i < (full ? 4 : 2); i++)
6690 aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6691 break;
6692 case 3:
6693 if (! full)
6694 HALT_UNALLOC;
6695 aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6696 aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6697 break;
6698 }
6699 }
6700
6701 static void
6702 do_vec_MOV_element (sim_cpu *cpu)
6703 {
6704 /* instr[31,21] = 0110 1110 000
6705 instr[20,16] = size & dest index
6706 instr[15] = 0
6707 instr[14,11] = source index
6708 instr[10] = 1
6709 instr[9,5] = Vs
6710 instr[4,0] = Vd. */
6711
6712 unsigned vs = INSTR (9, 5);
6713 unsigned vd = INSTR (4, 0);
6714 unsigned src_index;
6715 unsigned dst_index;
6716
6717 NYI_assert (31, 21, 0x370);
6718 NYI_assert (15, 15, 0);
6719 NYI_assert (10, 10, 1);
6720
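/* imm5 (instr[20,16]) is a priority encoding, as in the INS (element)
   instruction: the position of its lowest set bit selects the element
   size, and the bits above it supply the destination index.  */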
6721 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6722 if (INSTR (16, 16))
6723 {
6724 /* Move a byte. */
6725 src_index = INSTR (14, 11);
6726 dst_index = INSTR (20, 17);
6727 aarch64_set_vec_u8 (cpu, vd, dst_index,
6728 aarch64_get_vec_u8 (cpu, vs, src_index));
6729 }
6730 else if (INSTR (17, 17))
6731 {
6732 /* Move 16-bits. */
6733 NYI_assert (11, 11, 0);
6734 src_index = INSTR (14, 12);
6735 dst_index = INSTR (20, 18);
6736 aarch64_set_vec_u16 (cpu, vd, dst_index,
6737 aarch64_get_vec_u16 (cpu, vs, src_index));
6738 }
6739 else if (INSTR (18, 18))
6740 {
6741 /* Move 32-bits. */
6742 NYI_assert (12, 11, 0);
6743 src_index = INSTR (14, 13);
6744 dst_index = INSTR (20, 19);
6745 aarch64_set_vec_u32 (cpu, vd, dst_index,
6746 aarch64_get_vec_u32 (cpu, vs, src_index));
6747 }
6748 else
6749 {
6750 NYI_assert (19, 19, 1);
6751 NYI_assert (13, 11, 0);
6752 src_index = INSTR (14, 14);
6753 dst_index = INSTR (20, 20);
6754 aarch64_set_vec_u64 (cpu, vd, dst_index,
6755 aarch64_get_vec_u64 (cpu, vs, src_index));
6756 }
6757 }
6758
6759 static void
6760 do_vec_REV32 (sim_cpu *cpu)
6761 {
6762 /* instr[31] = 0
6763 instr[30] = full/half
6764 instr[29,24] = 10 1110
6765 instr[23,22] = size
6766 instr[21,10] = 10 0000 0000 10
6767 instr[9,5] = Rn
6768 instr[4,0] = Rd. */
6769
6770 unsigned rn = INSTR (9, 5);
6771 unsigned rd = INSTR (4, 0);
6772 unsigned size = INSTR (23, 22);
6773 unsigned full = INSTR (30, 30);
6774 unsigned i;
6775 FRegister val;
6776
6777 NYI_assert (29, 24, 0x2E);
6778 NYI_assert (21, 10, 0x802);
6779
6780 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6781 switch (size)
6782 {
6783 case 0:
6784 for (i = 0; i < (full ? 16 : 8); i++)
6785 val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6786 break;
6787
6788 case 1:
6789 for (i = 0; i < (full ? 8 : 4); i++)
6790 val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6791 break;
6792
6793 default:
6794 HALT_UNALLOC;
6795 }
6796
6797 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6798 if (full)
6799 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6800 }
6801
6802 static void
6803 do_vec_EXT (sim_cpu *cpu)
6804 {
6805 /* instr[31] = 0
6806 instr[30] = full/half
6807 instr[29,21] = 10 1110 000
6808 instr[20,16] = Vm
6809 instr[15] = 0
6810 instr[14,11] = source index
6811 instr[10] = 0
6812 instr[9,5] = Vn
6813 instr[4,0] = Vd. */
6814
6815 unsigned vm = INSTR (20, 16);
6816 unsigned vn = INSTR (9, 5);
6817 unsigned vd = INSTR (4, 0);
6818 unsigned src_index = INSTR (14, 11);
6819 unsigned full = INSTR (30, 30);
6820 unsigned i;
6821 unsigned j;
6822 FRegister val;
6823
6824 NYI_assert (31, 21, 0x370);
6825 NYI_assert (15, 15, 0);
6826 NYI_assert (10, 10, 0);
6827
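/* EXT copies the bytes of Vn from src_index upwards and then fills the
   remaining destination bytes from the bottom of Vm, i.e. it performs
   a byte-wise extract from the concatenation Vm:Vn.  */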
6828 if (!full && (src_index & 0x8))
6829 HALT_UNALLOC;
6830
6831 j = 0;
6832
6833 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6834 for (i = src_index; i < (full ? 16 : 8); i++)
6835 val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6836 for (i = 0; i < src_index; i++)
6837 val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6838
6839 aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6840 if (full)
6841 aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6842 }
6843
6844 static void
6845 dexAdvSIMD0 (sim_cpu *cpu)
6846 {
6847 /* instr [28,25] = 0 111. */
6848 if (INSTR (15, 10) == 0x07
6849 && (INSTR (9, 5) ==
6850 INSTR (20, 16)))
6851 {
6852 if (INSTR (31, 21) == 0x075
6853 || INSTR (31, 21) == 0x275)
6854 {
6855 do_vec_MOV_whole_vector (cpu);
6856 return;
6857 }
6858 }
6859
6860 if (INSTR (29, 19) == 0x1E0)
6861 {
6862 do_vec_MOV_immediate (cpu);
6863 return;
6864 }
6865
6866 if (INSTR (29, 19) == 0x5E0)
6867 {
6868 do_vec_MVNI (cpu);
6869 return;
6870 }
6871
6872 if (INSTR (29, 19) == 0x1C0
6873 || INSTR (29, 19) == 0x1C1)
6874 {
6875 if (INSTR (15, 10) == 0x03)
6876 {
6877 do_vec_DUP_scalar_into_vector (cpu);
6878 return;
6879 }
6880 }
6881
6882 switch (INSTR (29, 24))
6883 {
6884 case 0x0E: do_vec_op1 (cpu); return;
6885 case 0x0F: do_vec_op2 (cpu); return;
6886
6887 case 0x2E:
6888 if (INSTR (21, 21) == 1)
6889 {
6890 switch (INSTR (15, 10))
6891 {
6892 case 0x02:
6893 do_vec_REV32 (cpu);
6894 return;
6895
6896 case 0x07:
6897 switch (INSTR (23, 22))
6898 {
6899 case 0: do_vec_EOR (cpu); return;
6900 case 1: do_vec_BSL (cpu); return;
6901 case 2:
6902 case 3: do_vec_bit (cpu); return;
6903 }
6904 break;
6905
6906 case 0x08: do_vec_sub_long (cpu); return;
6907 case 0x11: do_vec_USHL (cpu); return;
6908 case 0x12: do_vec_CLZ (cpu); return;
6909 case 0x16: do_vec_NOT (cpu); return;
6910 case 0x19: do_vec_max (cpu); return;
6911 case 0x1B: do_vec_min (cpu); return;
6912 case 0x21: do_vec_SUB (cpu); return;
6913 case 0x25: do_vec_MLS (cpu); return;
6914 case 0x31: do_vec_FminmaxNMP (cpu); return;
6915 case 0x35: do_vec_FADDP (cpu); return;
6916 case 0x37: do_vec_FMUL (cpu); return;
6917 case 0x3F: do_vec_FDIV (cpu); return;
6918
6919 case 0x3E:
6920 switch (INSTR (20, 16))
6921 {
6922 case 0x00: do_vec_FNEG (cpu); return;
6923 case 0x01: do_vec_FSQRT (cpu); return;
6924 default: HALT_NYI;
6925 }
6926
6927 case 0x0D:
6928 case 0x0F:
6929 case 0x22:
6930 case 0x23:
6931 case 0x26:
6932 case 0x2A:
6933 case 0x32:
6934 case 0x36:
6935 case 0x39:
6936 case 0x3A:
6937 do_vec_compare (cpu); return;
6938
6939 default:
6940 break;
6941 }
6942 }
6943
6944 if (INSTR (31, 21) == 0x370)
6945 {
6946 if (INSTR (10, 10))
6947 do_vec_MOV_element (cpu);
6948 else
6949 do_vec_EXT (cpu);
6950 return;
6951 }
6952
6953 switch (INSTR (21, 10))
6954 {
6955 case 0x82E: do_vec_neg (cpu); return;
6956 case 0x87E: do_vec_sqrt (cpu); return;
6957 default:
6958 if (INSTR (15, 10) == 0x30)
6959 {
6960 do_vec_mull (cpu);
6961 return;
6962 }
6963 break;
6964 }
6965 break;
6966
6967 case 0x2F:
6968 switch (INSTR (15, 10))
6969 {
6970 case 0x01: do_vec_SSHR_USHR (cpu); return;
6971 case 0x10:
6972 case 0x12: do_vec_mls_indexed (cpu); return;
6973 case 0x29: do_vec_xtl (cpu); return;
6974 default:
6975 HALT_NYI;
6976 }
6977
6978 default:
6979 break;
6980 }
6981
6982 HALT_NYI;
6983 }
6984
6985 /* 3 sources. */
6986
6987 /* Float multiply add. */
6988 static void
6989 fmadds (sim_cpu *cpu)
6990 {
6991 unsigned sa = INSTR (14, 10);
6992 unsigned sm = INSTR (20, 16);
6993 unsigned sn = INSTR ( 9, 5);
6994 unsigned sd = INSTR ( 4, 0);
6995
6996 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6997 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
6998 + aarch64_get_FP_float (cpu, sn)
6999 * aarch64_get_FP_float (cpu, sm));
7000 }
7001
7002 /* Double multiply add. */
7003 static void
7004 fmaddd (sim_cpu *cpu)
7005 {
7006 unsigned sa = INSTR (14, 10);
7007 unsigned sm = INSTR (20, 16);
7008 unsigned sn = INSTR ( 9, 5);
7009 unsigned sd = INSTR ( 4, 0);
7010
7011 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7012 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7013 + aarch64_get_FP_double (cpu, sn)
7014 * aarch64_get_FP_double (cpu, sm));
7015 }
7016
7017 /* Float multiply subtract. */
7018 static void
7019 fmsubs (sim_cpu *cpu)
7020 {
7021 unsigned sa = INSTR (14, 10);
7022 unsigned sm = INSTR (20, 16);
7023 unsigned sn = INSTR ( 9, 5);
7024 unsigned sd = INSTR ( 4, 0);
7025
7026 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7027 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7028 - aarch64_get_FP_float (cpu, sn)
7029 * aarch64_get_FP_float (cpu, sm));
7030 }
7031
7032 /* Double multiply subtract. */
7033 static void
7034 fmsubd (sim_cpu *cpu)
7035 {
7036 unsigned sa = INSTR (14, 10);
7037 unsigned sm = INSTR (20, 16);
7038 unsigned sn = INSTR ( 9, 5);
7039 unsigned sd = INSTR ( 4, 0);
7040
7041 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7042 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7043 - aarch64_get_FP_double (cpu, sn)
7044 * aarch64_get_FP_double (cpu, sm));
7045 }
7046
7047 /* Float negative multiply add. */
7048 static void
7049 fnmadds (sim_cpu *cpu)
7050 {
7051 unsigned sa = INSTR (14, 10);
7052 unsigned sm = INSTR (20, 16);
7053 unsigned sn = INSTR ( 9, 5);
7054 unsigned sd = INSTR ( 4, 0);
7055
7056 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7057 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7058 + (- aarch64_get_FP_float (cpu, sn))
7059 * aarch64_get_FP_float (cpu, sm));
7060 }
7061
7062 /* Double negative multiply add. */
7063 static void
7064 fnmaddd (sim_cpu *cpu)
7065 {
7066 unsigned sa = INSTR (14, 10);
7067 unsigned sm = INSTR (20, 16);
7068 unsigned sn = INSTR ( 9, 5);
7069 unsigned sd = INSTR ( 4, 0);
7070
7071 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7072 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7073 + (- aarch64_get_FP_double (cpu, sn))
7074 * aarch64_get_FP_double (cpu, sm));
7075 }
7076
7077 /* Float negative multiply subtract. */
7078 static void
7079 fnmsubs (sim_cpu *cpu)
7080 {
7081 unsigned sa = INSTR (14, 10);
7082 unsigned sm = INSTR (20, 16);
7083 unsigned sn = INSTR ( 9, 5);
7084 unsigned sd = INSTR ( 4, 0);
7085
7086 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7087 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7088 + aarch64_get_FP_float (cpu, sn)
7089 * aarch64_get_FP_float (cpu, sm));
7090 }
7091
7092 /* Double negative multiply subtract. */
7093 static void
7094 fnmsubd (sim_cpu *cpu)
7095 {
7096 unsigned sa = INSTR (14, 10);
7097 unsigned sm = INSTR (20, 16);
7098 unsigned sn = INSTR ( 9, 5);
7099 unsigned sd = INSTR ( 4, 0);
7100
7101 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7102 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7103 + aarch64_get_FP_double (cpu, sn)
7104 * aarch64_get_FP_double (cpu, sm));
7105 }
7106
7107 static void
7108 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7109 {
7110 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7111 instr[30] = 0
7112 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7113 instr[28,25] = 1111
7114 instr[24] = 1
7115 instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7116 instr[21] ==> o1 : 0 ==> unnegated, 1 ==> negated
7117 instr[15] ==> o2 : 0 ==> ADD, 1 ==> SUB */
7118
7119 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7120 /* dispatch on combined type:o1:o2. */
7121 uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
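/* For example, dispatch == 5 is type 01 (double), o1 == 0, o2 == 1,
   i.e. FMSUB (double), handled by fmsubd below.  */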
7122
7123 if (M_S != 0)
7124 HALT_UNALLOC;
7125
7126 switch (dispatch)
7127 {
7128 case 0: fmadds (cpu); return;
7129 case 1: fmsubs (cpu); return;
7130 case 2: fnmadds (cpu); return;
7131 case 3: fnmsubs (cpu); return;
7132 case 4: fmaddd (cpu); return;
7133 case 5: fmsubd (cpu); return;
7134 case 6: fnmaddd (cpu); return;
7135 case 7: fnmsubd (cpu); return;
7136 default:
7137 /* type > 1 is currently unallocated. */
7138 HALT_UNALLOC;
7139 }
7140 }
7141
7142 static void
7143 dexSimpleFPFixedConvert (sim_cpu *cpu)
7144 {
7145 HALT_NYI;
7146 }
7147
7148 static void
7149 dexSimpleFPCondCompare (sim_cpu *cpu)
7150 {
7151 /* instr [31,23] = 0001 1110 0
7152 instr [22] = type
7153 instr [21] = 1
7154 instr [20,16] = Rm
7155 instr [15,12] = condition
7156 instr [11,10] = 01
7157 instr [9,5] = Rn
7158 instr [4] = 0
7159 instr [3,0] = nzcv */
7160
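/* This implements FCCMP: if the condition does not hold, NZCV is
   loaded directly from the nzcv immediate field; otherwise the two
   registers are compared as for FCMP.  */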
7161 unsigned rm = INSTR (20, 16);
7162 unsigned rn = INSTR (9, 5);
7163
7164 NYI_assert (31, 23, 0x3C);
7165 NYI_assert (11, 10, 0x1);
7166 NYI_assert (4, 4, 0);
7167
7168 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7169 if (! testConditionCode (cpu, INSTR (15, 12)))
7170 {
7171 aarch64_set_CPSR (cpu, INSTR (3, 0));
7172 return;
7173 }
7174
7175 if (INSTR (22, 22))
7176 {
7177 /* Double precision. */
7178 double val1 = aarch64_get_vec_double (cpu, rn, 0);
7179 double val2 = aarch64_get_vec_double (cpu, rm, 0);
7180
7181 /* FIXME: Check for NaNs. */
7182 if (val1 == val2)
7183 aarch64_set_CPSR (cpu, (Z | C));
7184 else if (val1 < val2)
7185 aarch64_set_CPSR (cpu, N);
7186 else /* val1 > val2 */
7187 aarch64_set_CPSR (cpu, C);
7188 }
7189 else
7190 {
7191 /* Single precision. */
7192 float val1 = aarch64_get_vec_float (cpu, rn, 0);
7193 float val2 = aarch64_get_vec_float (cpu, rm, 0);
7194
7195 /* FIXME: Check for NaNs. */
7196 if (val1 == val2)
7197 aarch64_set_CPSR (cpu, (Z | C));
7198 else if (val1 < val2)
7199 aarch64_set_CPSR (cpu, N);
7200 else /* val1 > val2 */
7201 aarch64_set_CPSR (cpu, C);
7202 }
7203 }
7204
7205 /* 2 sources. */
7206
7207 /* Float add. */
7208 static void
7209 fadds (sim_cpu *cpu)
7210 {
7211 unsigned sm = INSTR (20, 16);
7212 unsigned sn = INSTR ( 9, 5);
7213 unsigned sd = INSTR ( 4, 0);
7214
7215 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7216 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7217 + aarch64_get_FP_float (cpu, sm));
7218 }
7219
7220 /* Double add. */
7221 static void
7222 faddd (sim_cpu *cpu)
7223 {
7224 unsigned sm = INSTR (20, 16);
7225 unsigned sn = INSTR ( 9, 5);
7226 unsigned sd = INSTR ( 4, 0);
7227
7228 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7229 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7230 + aarch64_get_FP_double (cpu, sm));
7231 }
7232
7233 /* Float divide. */
7234 static void
7235 fdivs (sim_cpu *cpu)
7236 {
7237 unsigned sm = INSTR (20, 16);
7238 unsigned sn = INSTR ( 9, 5);
7239 unsigned sd = INSTR ( 4, 0);
7240
7241 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7242 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7243 / aarch64_get_FP_float (cpu, sm));
7244 }
7245
7246 /* Double divide. */
7247 static void
7248 fdivd (sim_cpu *cpu)
7249 {
7250 unsigned sm = INSTR (20, 16);
7251 unsigned sn = INSTR ( 9, 5);
7252 unsigned sd = INSTR ( 4, 0);
7253
7254 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7255 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7256 / aarch64_get_FP_double (cpu, sm));
7257 }
7258
7259 /* Float multiply. */
7260 static void
7261 fmuls (sim_cpu *cpu)
7262 {
7263 unsigned sm = INSTR (20, 16);
7264 unsigned sn = INSTR ( 9, 5);
7265 unsigned sd = INSTR ( 4, 0);
7266
7267 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7268 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7269 * aarch64_get_FP_float (cpu, sm));
7270 }
7271
7272 /* Double multiply. */
7273 static void
7274 fmuld (sim_cpu *cpu)
7275 {
7276 unsigned sm = INSTR (20, 16);
7277 unsigned sn = INSTR ( 9, 5);
7278 unsigned sd = INSTR ( 4, 0);
7279
7280 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7281 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7282 * aarch64_get_FP_double (cpu, sm));
7283 }
7284
7285 /* Float negate and multiply. */
7286 static void
7287 fnmuls (sim_cpu *cpu)
7288 {
7289 unsigned sm = INSTR (20, 16);
7290 unsigned sn = INSTR ( 9, 5);
7291 unsigned sd = INSTR ( 4, 0);
7292
7293 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7294 aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7295 * aarch64_get_FP_float (cpu, sm)));
7296 }
7297
7298 /* Double negate and multiply. */
7299 static void
7300 fnmuld (sim_cpu *cpu)
7301 {
7302 unsigned sm = INSTR (20, 16);
7303 unsigned sn = INSTR ( 9, 5);
7304 unsigned sd = INSTR ( 4, 0);
7305
7306 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7307 aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7308 * aarch64_get_FP_double (cpu, sm)));
7309 }
7310
7311 /* Float subtract. */
7312 static void
7313 fsubs (sim_cpu *cpu)
7314 {
7315 unsigned sm = INSTR (20, 16);
7316 unsigned sn = INSTR ( 9, 5);
7317 unsigned sd = INSTR ( 4, 0);
7318
7319 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7320 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7321 - aarch64_get_FP_float (cpu, sm));
7322 }
7323
7324 /* Double subtract. */
7325 static void
7326 fsubd (sim_cpu *cpu)
7327 {
7328 unsigned sm = INSTR (20, 16);
7329 unsigned sn = INSTR ( 9, 5);
7330 unsigned sd = INSTR ( 4, 0);
7331
7332 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7333 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7334 - aarch64_get_FP_double (cpu, sm));
7335 }
7336
7337 static void
7338 do_FMINNM (sim_cpu *cpu)
7339 {
7340 /* instr[31,23] = 0 0011 1100
7341 instr[22] = float(0)/double(1)
7342 instr[21] = 1
7343 instr[20,16] = Sm
7344 instr[15,10] = 01 1110
7345 instr[9,5] = Sn
7346 instr[4,0] = Sd */
7347
7348 unsigned sm = INSTR (20, 16);
7349 unsigned sn = INSTR ( 9, 5);
7350 unsigned sd = INSTR ( 4, 0);
7351
7352 NYI_assert (31, 23, 0x03C);
7353 NYI_assert (15, 10, 0x1E);
7354
7355 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7356 if (INSTR (22, 22))
7357 aarch64_set_FP_double (cpu, sd,
7358 dminnm (aarch64_get_FP_double (cpu, sn),
7359 aarch64_get_FP_double (cpu, sm)));
7360 else
7361 aarch64_set_FP_float (cpu, sd,
7362 fminnm (aarch64_get_FP_float (cpu, sn),
7363 aarch64_get_FP_float (cpu, sm)));
7364 }
7365
7366 static void
7367 do_FMAXNM (sim_cpu *cpu)
7368 {
7369 /* instr[31,23] = 0 0011 1100
7370 instr[22] = float(0)/double(1)
7371 instr[21] = 1
7372 instr[20,16] = Sm
7373 instr[15,10] = 01 1010
7374 instr[9,5] = Sn
7375 instr[4,0] = Sd */
7376
7377 unsigned sm = INSTR (20, 16);
7378 unsigned sn = INSTR ( 9, 5);
7379 unsigned sd = INSTR ( 4, 0);
7380
7381 NYI_assert (31, 23, 0x03C);
7382 NYI_assert (15, 10, 0x1A);
7383
7384 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7385 if (INSTR (22, 22))
7386 aarch64_set_FP_double (cpu, sd,
7387 dmaxnm (aarch64_get_FP_double (cpu, sn),
7388 aarch64_get_FP_double (cpu, sm)));
7389 else
7390 aarch64_set_FP_float (cpu, sd,
7391 fmaxnm (aarch64_get_FP_float (cpu, sn),
7392 aarch64_get_FP_float (cpu, sm)));
7393 }
7394
7395 static void
7396 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7397 {
7398 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7399 instr[30] = 0
7400 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7401 instr[28,25] = 1111
7402 instr[24] = 0
7403 instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7404 instr[21] = 1
7405 instr[20,16] = Vm
7406 instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7407 0010 ==> FADD, 0011 ==> FSUB,
7408 0100 ==> FMAX, 0101 ==> FMIN
7409 0110 ==> FMAXNM, 0111 ==> FMINNM
7410 1000 ==> FNMUL, ow ==> UNALLOC
7411 instr[11,10] = 10
7412 instr[9,5] = Vn
7413 instr[4,0] = Vd */
7414
7415 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7416 uint32_t type = INSTR (23, 22);
7417 /* Dispatch on opcode. */
7418 uint32_t dispatch = INSTR (15, 12);
7419
7420 if (type > 1)
7421 HALT_UNALLOC;
7422
7423 if (M_S != 0)
7424 HALT_UNALLOC;
7425
7426 if (type)
7427 switch (dispatch)
7428 {
7429 case 0: fmuld (cpu); return;
7430 case 1: fdivd (cpu); return;
7431 case 2: faddd (cpu); return;
7432 case 3: fsubd (cpu); return;
7433 case 6: do_FMAXNM (cpu); return;
7434 case 7: do_FMINNM (cpu); return;
7435 case 8: fnmuld (cpu); return;
7436
7437 /* Have not yet implemented fmax and fmin. */
7438 case 4:
7439 case 5:
7440 HALT_NYI;
7441
7442 default:
7443 HALT_UNALLOC;
7444 }
7445 else /* type == 0 => floats. */
7446 switch (dispatch)
7447 {
7448 case 0: fmuls (cpu); return;
7449 case 1: fdivs (cpu); return;
7450 case 2: fadds (cpu); return;
7451 case 3: fsubs (cpu); return;
7452 case 6: do_FMAXNM (cpu); return;
7453 case 7: do_FMINNM (cpu); return;
7454 case 8: fnmuls (cpu); return;
7455
7456 case 4:
7457 case 5:
7458 HALT_NYI;
7459
7460 default:
7461 HALT_UNALLOC;
7462 }
7463 }
7464
7465 static void
7466 dexSimpleFPCondSelect (sim_cpu *cpu)
7467 {
7468 /* FCSEL
7469 instr[31,23] = 0 0011 1100
7470 instr[22] = 0=>single 1=>double
7471 instr[21] = 1
7472 instr[20,16] = Sm
7473 instr[15,12] = cond
7474 instr[11,10] = 11
7475 instr[9,5] = Sn
7476 instr[4,0] = Sd */
7477 unsigned sm = INSTR (20, 16);
7478 unsigned sn = INSTR ( 9, 5);
7479 unsigned sd = INSTR ( 4, 0);
7480 uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7481
7482 NYI_assert (31, 23, 0x03C);
7483 NYI_assert (11, 10, 0x3);
7484
7485 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7486 if (INSTR (22, 22))
7487 aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7488 : aarch64_get_FP_double (cpu, sm)));
7489 else
7490 aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7491 : aarch64_get_FP_float (cpu, sm)));
7492 }
7493
7494 /* Store 32 bit unscaled signed 9 bit. */
7495 static void
7496 fsturs (sim_cpu *cpu, int32_t offset)
7497 {
7498 unsigned int rn = INSTR (9, 5);
7499 unsigned int st = INSTR (4, 0);
7500
7501 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7502 aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7503 aarch64_get_vec_u32 (cpu, st, 0));
7504 }
7505
7506 /* Store 64 bit unscaled signed 9 bit. */
7507 static void
7508 fsturd (sim_cpu *cpu, int32_t offset)
7509 {
7510 unsigned int rn = INSTR (9, 5);
7511 unsigned int st = INSTR (4, 0);
7512
7513 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7514 aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7515 aarch64_get_vec_u64 (cpu, st, 0));
7516 }
7517
7518 /* Store 128 bit unscaled signed 9 bit. */
7519 static void
7520 fsturq (sim_cpu *cpu, int32_t offset)
7521 {
7522 unsigned int rn = INSTR (9, 5);
7523 unsigned int st = INSTR (4, 0);
7524 FRegister a;
7525
7526 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7527 aarch64_get_FP_long_double (cpu, st, & a);
7528 aarch64_set_mem_long_double (cpu,
7529 aarch64_get_reg_u64 (cpu, rn, SP_OK)
7530 + offset, a);
7531 }
7532
7533 /* TODO FP move register. */
7534
7535 /* 32 bit fp to fp move register. */
7536 static void
7537 ffmovs (sim_cpu *cpu)
7538 {
7539 unsigned int rn = INSTR (9, 5);
7540 unsigned int st = INSTR (4, 0);
7541
7542 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7543 aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7544 }
7545
7546 /* 64 bit fp to fp move register. */
7547 static void
7548 ffmovd (sim_cpu *cpu)
7549 {
7550 unsigned int rn = INSTR (9, 5);
7551 unsigned int st = INSTR (4, 0);
7552
7553 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7554 aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7555 }
7556
7557 /* 32 bit GReg to Vec move register. */
7558 static void
7559 fgmovs (sim_cpu *cpu)
7560 {
7561 unsigned int rn = INSTR (9, 5);
7562 unsigned int st = INSTR (4, 0);
7563
7564 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7565 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7566 }
7567
7568 /* 64 bit g to fp move register. */
7569 static void
7570 fgmovd (sim_cpu *cpu)
7571 {
7572 unsigned int rn = INSTR (9, 5);
7573 unsigned int st = INSTR (4, 0);
7574
7575 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7576 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7577 }
7578
7579 /* 32 bit fp to g move register. */
7580 static void
7581 gfmovs (sim_cpu *cpu)
7582 {
7583 unsigned int rn = INSTR (9, 5);
7584 unsigned int st = INSTR (4, 0);
7585
7586 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7587 aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7588 }
7589
7590 /* 64 bit fp to g move register. */
7591 static void
7592 gfmovd (sim_cpu *cpu)
7593 {
7594 unsigned int rn = INSTR (9, 5);
7595 unsigned int st = INSTR (4, 0);
7596
7597 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7598 aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7599 }
7600
7601 /* FP move immediate
7602
7603 These install an immediate 8 bit value in the target register
7604 where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7605 bit exponent. */
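
/* For example, an imm8 of 0x70 expands to 1.0 and 0xF0 to -1.0.  */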
7606
7607 static void
7608 fmovs (sim_cpu *cpu)
7609 {
7610 unsigned int sd = INSTR (4, 0);
7611 uint32_t imm = INSTR (20, 13);
7612 float f = fp_immediate_for_encoding_32 (imm);
7613
7614 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7615 aarch64_set_FP_float (cpu, sd, f);
7616 }
7617
7618 static void
7619 fmovd (sim_cpu *cpu)
7620 {
7621 unsigned int sd = INSTR (4, 0);
7622 uint32_t imm = INSTR (20, 13);
7623 double d = fp_immediate_for_encoding_64 (imm);
7624
7625 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7626 aarch64_set_FP_double (cpu, sd, d);
7627 }
7628
7629 static void
7630 dexSimpleFPImmediate (sim_cpu *cpu)
7631 {
7632 /* instr[31,23] == 00111100
7633 instr[22] == type : single(0)/double(1)
7634 instr[21] == 1
7635 instr[20,13] == imm8
7636 instr[12,10] == 100
7637 instr[9,5] == imm5 : 00000 ==> OK, ow ==> UNALLOC
7638 instr[4,0] == Rd */
7639 uint32_t imm5 = INSTR (9, 5);
7640
7641 NYI_assert (31, 23, 0x3C);
7642
7643 if (imm5 != 0)
7644 HALT_UNALLOC;
7645
7646 if (INSTR (22, 22))
7647 fmovd (cpu);
7648 else
7649 fmovs (cpu);
7650 }
7651
7652 /* TODO specific decode and execute for group Load Store. */
7653
7654 /* TODO FP load/store single register (unscaled offset). */
7655
7656 /* TODO load 8 bit unscaled signed 9 bit. */
7657 /* TODO load 16 bit unscaled signed 9 bit. */
7658
7659 /* Load 32 bit unscaled signed 9 bit. */
7660 static void
7661 fldurs (sim_cpu *cpu, int32_t offset)
7662 {
7663 unsigned int rn = INSTR (9, 5);
7664 unsigned int st = INSTR (4, 0);
7665
7666 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7667 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7668 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7669 }
7670
7671 /* Load 64 bit unscaled signed 9 bit. */
7672 static void
7673 fldurd (sim_cpu *cpu, int32_t offset)
7674 {
7675 unsigned int rn = INSTR (9, 5);
7676 unsigned int st = INSTR (4, 0);
7677
7678 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7679 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7680 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7681 }
7682
7683 /* Load 128 bit unscaled signed 9 bit. */
7684 static void
7685 fldurq (sim_cpu *cpu, int32_t offset)
7686 {
7687 unsigned int rn = INSTR (9, 5);
7688 unsigned int st = INSTR (4, 0);
7689 FRegister a;
7690 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7691
7692 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7693 aarch64_get_mem_long_double (cpu, addr, & a);
7694 aarch64_set_FP_long_double (cpu, st, a);
7695 }
7696
7697 /* TODO store 8 bit unscaled signed 9 bit. */
7698 /* TODO store 16 bit unscaled signed 9 bit. */
7699
7700
7701 /* 1 source. */
7702
7703 /* Float absolute value. */
7704 static void
7705 fabss (sim_cpu *cpu)
7706 {
7707 unsigned sn = INSTR (9, 5);
7708 unsigned sd = INSTR (4, 0);
7709 float value = aarch64_get_FP_float (cpu, sn);
7710
7711 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7712 aarch64_set_FP_float (cpu, sd, fabsf (value));
7713 }
7714
7715 /* Double absolute value. */
7716 static void
7717 fabcpu (sim_cpu *cpu)
7718 {
7719 unsigned sn = INSTR (9, 5);
7720 unsigned sd = INSTR (4, 0);
7721 double value = aarch64_get_FP_double (cpu, sn);
7722
7723 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7724 aarch64_set_FP_double (cpu, sd, fabs (value));
7725 }
7726
7727 /* Float negative value. */
7728 static void
7729 fnegs (sim_cpu *cpu)
7730 {
7731 unsigned sn = INSTR (9, 5);
7732 unsigned sd = INSTR (4, 0);
7733
7734 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7735 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7736 }
7737
7738 /* Double negative value. */
7739 static void
7740 fnegd (sim_cpu *cpu)
7741 {
7742 unsigned sn = INSTR (9, 5);
7743 unsigned sd = INSTR (4, 0);
7744
7745 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7746 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7747 }
7748
7749 /* Float square root. */
7750 static void
7751 fsqrts (sim_cpu *cpu)
7752 {
7753 unsigned sn = INSTR (9, 5);
7754 unsigned sd = INSTR (4, 0);
7755
7756 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7757 aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7758 }
7759
7760 /* Double square root. */
7761 static void
7762 fsqrtd (sim_cpu *cpu)
7763 {
7764 unsigned sn = INSTR (9, 5);
7765 unsigned sd = INSTR (4, 0);
7766
7767 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7768 aarch64_set_FP_double (cpu, sd,
7769 sqrt (aarch64_get_FP_double (cpu, sn)));
7770 }
7771
7772 /* Convert double to float. */
7773 static void
7774 fcvtds (sim_cpu *cpu)
7775 {
7776 unsigned sn = INSTR (9, 5);
7777 unsigned sd = INSTR (4, 0);
7778
7779 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7780 aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7781 }
7782
7783 /* Convert float to double. */
7784 static void
7785 fcvtcpu (sim_cpu *cpu)
7786 {
7787 unsigned sn = INSTR (9, 5);
7788 unsigned sd = INSTR (4, 0);
7789
7790 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7791 aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7792 }
7793
7794 static void
7795 do_FRINT (sim_cpu *cpu)
7796 {
7797 /* instr[31,23] = 0001 1110 0
7798 instr[22] = single(0)/double(1)
7799 instr[21,18] = 1001
7800 instr[17,15] = rounding mode
7801 instr[14,10] = 10000
7802 instr[9,5] = source
7803 instr[4,0] = dest */
7804
7805 float val;
7806 unsigned rs = INSTR (9, 5);
7807 unsigned rd = INSTR (4, 0);
7808 unsigned int rmode = INSTR (17, 15);
7809
7810 NYI_assert (31, 23, 0x03C);
7811 NYI_assert (21, 18, 0x9);
7812 NYI_assert (14, 10, 0x10);
7813
7814 if (rmode == 6 || rmode == 7)
7815 /* FIXME: Add support for rmode == 6 exactness check. */
7816 rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7817
7818 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7819 if (INSTR (22, 22))
7820 {
7821 double val = aarch64_get_FP_double (cpu, rs);
7822
7823 switch (rmode)
7824 {
7825 case 0: /* mode N: nearest or even. */
7826 {
7827 double rval = round (val);
7828
7829 /* round () rounds ties away from zero, so steer exact halfway
7830    cases back to the even neighbour by hand.  */
7831 if (fabs (val - rval) == 0.5 && fmod (rval, 2.0) != 0.0)
7832 rval -= copysign (1.0, rval);
7833
7834 aarch64_set_FP_double (cpu, rd, rval);
7835 return;
7837 }
7838
7839 case 1: /* mode P: towards +inf. */
7840 aarch64_set_FP_double (cpu, rd, ceil (val));
7841 return;
7842
7843 case 2: /* mode M: towards -inf. */
7844 aarch64_set_FP_double (cpu, rd, floor (val));
7845 return;
7852
7853 case 3: /* mode Z: towards 0. */
7854 aarch64_set_FP_double (cpu, rd, trunc (val));
7855 return;
7856
7857 case 4: /* mode A: away from 0. */
7858 aarch64_set_FP_double (cpu, rd, round (val));
7859 return;
7860
7861 case 6: /* mode X: use FPCR with exactness check. */
7862 case 7: /* mode I: use FPCR mode. */
7863 HALT_NYI;
7864
7865 default:
7866 HALT_UNALLOC;
7867 }
7868 }
7869
7870 val = aarch64_get_FP_float (cpu, rs);
7871
7872 switch (rmode)
7873 {
7874 case 0: /* mode N: nearest or even. */
7875 {
7876 float rval = roundf (val);
7877
7878 /* roundf () rounds ties away from zero, so steer exact halfway
7879    cases back to the even neighbour by hand.  */
7880 if (fabsf (val - rval) == 0.5f && fmodf (rval, 2.0f) != 0.0f)
7881 rval -= copysignf (1.0f, rval);
7883
7884 aarch64_set_FP_float (cpu, rd, rval);
7885 return;
7886 }
7887
7888 case 1: /* mode P: towards +inf. */
7889 aarch64_set_FP_float (cpu, rd, ceilf (val));
7890 return;
7891
7892 case 2: /* mode M: towards -inf. */
7893 aarch64_set_FP_float (cpu, rd, floorf (val));
7894 return;
7901
7902 case 3: /* mode Z: towards 0. */
7903 aarch64_set_FP_float (cpu, rd, truncf (val));
7904 return;
7905
7906 case 4: /* mode A: away from 0. */
7907 aarch64_set_FP_float (cpu, rd, roundf (val));
7908 return;
7909
7910 case 6: /* mode X: use FPCR with exactness check. */
7911 case 7: /* mode I: use FPCR mode. */
7912 HALT_NYI;
7913
7914 default:
7915 HALT_UNALLOC;
7916 }
7917 }
7918
7919 /* Convert half to float. */
7920 static void
7921 do_FCVT_half_to_single (sim_cpu *cpu)
7922 {
7923 unsigned rn = INSTR (9, 5);
7924 unsigned rd = INSTR (4, 0);
7925
7926 NYI_assert (31, 10, 0x7B890);
7927
7928 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7929 aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half (cpu, rn));
7930 }
7931
7932 /* Convert half to double. */
7933 static void
7934 do_FCVT_half_to_double (sim_cpu *cpu)
7935 {
7936 unsigned rn = INSTR (9, 5);
7937 unsigned rd = INSTR (4, 0);
7938
7939 NYI_assert (31, 10, 0x7B8B0);
7940
7941 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7942 aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half (cpu, rn));
7943 }
7944
7945 static void
7946 do_FCVT_single_to_half (sim_cpu *cpu)
7947 {
7948 unsigned rn = INSTR (9, 5);
7949 unsigned rd = INSTR (4, 0);
7950
7951 NYI_assert (31, 10, 0x788F0);
7952
7953 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7954 aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float (cpu, rn));
7955 }
7956
7957 /* Convert double to half. */
7958 static void
7959 do_FCVT_double_to_half (sim_cpu *cpu)
7960 {
7961 unsigned rn = INSTR (9, 5);
7962 unsigned rd = INSTR (4, 0);
7963
7964 NYI_assert (31, 10, 0x798F0);
7965
7966 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7967 aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double (cpu, rn));
7968 }
7969
7970 static void
7971 dexSimpleFPDataProc1Source (sim_cpu *cpu)
7972 {
7973 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7974 instr[30] = 0
7975 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7976 instr[28,25] = 1111
7977 instr[24] = 0
7978 instr[23,22] ==> type : 00 ==> source is single,
7979 01 ==> source is double
7980 10 ==> UNALLOC
7981 11 ==> UNALLOC or source is half
7982 instr[21] = 1
7983 instr[20,15] ==> opcode : with type 00 or 01
7984 000000 ==> FMOV, 000001 ==> FABS,
7985 000010 ==> FNEG, 000011 ==> FSQRT,
7986 000100 ==> UNALLOC, 000101 ==> FCVT,(to single/double)
7987 000110 ==> UNALLOC, 000111 ==> FCVT (to half)
7988 001000 ==> FRINTN, 001001 ==> FRINTP,
7989 001010 ==> FRINTM, 001011 ==> FRINTZ,
7990 001100 ==> FRINTA, 001101 ==> UNALLOC
7991 001110 ==> FRINTX, 001111 ==> FRINTI
7992 with type 11
7993 000100 ==> FCVT (half-to-single)
7994 000101 ==> FCVT (half-to-double)
7995 instr[14,10] = 10000. */
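
/* For example, opcode 000011 with type 01 selects fsqrtd below, and
   with type 00 selects fsqrts.  */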
7996
7997 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7998 uint32_t type = INSTR (23, 22);
7999 uint32_t opcode = INSTR (20, 15);
8000
8001 if (M_S != 0)
8002 HALT_UNALLOC;
8003
8004 if (type == 3)
8005 {
8006 if (opcode == 4)
8007 do_FCVT_half_to_single (cpu);
8008 else if (opcode == 5)
8009 do_FCVT_half_to_double (cpu);
8010 else
8011 HALT_UNALLOC;
8012 return;
8013 }
8014
8015 if (type == 2)
8016 HALT_UNALLOC;
8017
8018 switch (opcode)
8019 {
8020 case 0:
8021 if (type)
8022 ffmovd (cpu);
8023 else
8024 ffmovs (cpu);
8025 return;
8026
8027 case 1:
8028 if (type)
8029 fabcpu (cpu);
8030 else
8031 fabss (cpu);
8032 return;
8033
8034 case 2:
8035 if (type)
8036 fnegd (cpu);
8037 else
8038 fnegs (cpu);
8039 return;
8040
8041 case 3:
8042 if (type)
8043 fsqrtd (cpu);
8044 else
8045 fsqrts (cpu);
8046 return;
8047
8048 case 4:
8049 if (type)
8050 fcvtds (cpu);
8051 else
8052 HALT_UNALLOC;
8053 return;
8054
8055 case 5:
8056 if (type)
8057 HALT_UNALLOC;
8058 fcvtcpu (cpu);
8059 return;
8060
8061 case 8: /* FRINTN etc. */
8062 case 9:
8063 case 10:
8064 case 11:
8065 case 12:
8066 case 14:
8067 case 15:
8068 do_FRINT (cpu);
8069 return;
8070
8071 case 7:
8072 if (INSTR (22, 22))
8073 do_FCVT_double_to_half (cpu);
8074 else
8075 do_FCVT_single_to_half (cpu);
8076 return;
8077
8078 case 13:
8079 HALT_NYI;
8080
8081 default:
8082 HALT_UNALLOC;
8083 }
8084 }
8085
8086 /* 32 bit signed int to float. */
8087 static void
8088 scvtf32 (sim_cpu *cpu)
8089 {
8090 unsigned rn = INSTR (9, 5);
8091 unsigned sd = INSTR (4, 0);
8092
8093 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8094 aarch64_set_FP_float
8095 (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8096 }
8097
8098 /* 64 bit signed int to float. */
8099 static void
8100 scvtf (sim_cpu *cpu)
8101 {
8102 unsigned rn = INSTR (9, 5);
8103 unsigned sd = INSTR (4, 0);
8104
8105 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8106 aarch64_set_FP_float
8107 (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8108 }
8109
8110 /* 32 bit signed int to double. */
8111 static void
8112 scvtd32 (sim_cpu *cpu)
8113 {
8114 unsigned rn = INSTR (9, 5);
8115 unsigned sd = INSTR (4, 0);
8116
8117 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8118 aarch64_set_FP_double
8119 (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8120 }
8121
8122 /* 64 bit signed int to double. */
8123 static void
8124 scvtd (sim_cpu *cpu)
8125 {
8126 unsigned rn = INSTR (9, 5);
8127 unsigned sd = INSTR (4, 0);
8128
8129 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8130 aarch64_set_FP_double
8131 (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8132 }
8133
8134 static const float FLOAT_INT_MAX = (float) INT_MAX;
8135 static const float FLOAT_INT_MIN = (float) INT_MIN;
8136 static const double DOUBLE_INT_MAX = (double) INT_MAX;
8137 static const double DOUBLE_INT_MIN = (double) INT_MIN;
8138 static const float FLOAT_LONG_MAX = (float) LONG_MAX;
8139 static const float FLOAT_LONG_MIN = (float) LONG_MIN;
8140 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8141 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
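
/* Note: the LONG and ULONG bounds in this table are derived from the
   host's long type, so they assume an LP64 host; on an ILP32 host they
   would be too narrow for the 64-bit conversions.  */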
8142
8143 #define UINT_MIN 0
8144 #define ULONG_MIN 0
8145 static const float FLOAT_UINT_MAX = (float) UINT_MAX;
8146 static const float FLOAT_UINT_MIN = (float) UINT_MIN;
8147 static const double DOUBLE_UINT_MAX = (double) UINT_MAX;
8148 static const double DOUBLE_UINT_MIN = (double) UINT_MIN;
8149 static const float FLOAT_ULONG_MAX = (float) ULONG_MAX;
8150 static const float FLOAT_ULONG_MIN = (float) ULONG_MIN;
8151 static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8152 static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
8153
8154 /* Check for FP exception conditions:
8155 NaN raises IO
8156 Infinity raises IO
8157 Out of Range raises IO and IX and saturates value
8158 Denormal raises ID and IX and sets to zero. */
8159 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE) \
8160 do \
8161 { \
8162 switch (fpclassify (F)) \
8163 { \
8164 case FP_INFINITE: \
8165 case FP_NAN: \
8166 aarch64_set_FPSR (cpu, IO); \
8167 if (signbit (F)) \
8168 VALUE = ITYPE##_MIN; \
8169 else \
8170 VALUE = ITYPE##_MAX; \
8171 break; \
8172 \
8173 case FP_NORMAL: \
8174 if (F >= FTYPE##_##ITYPE##_MAX) \
8175 { \
8176 aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX); \
8177 VALUE = ITYPE##_MAX; \
8178 } \
8179 else if (F <= FTYPE##_##ITYPE##_MIN) \
8180 { \
8181 aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX); \
8182 VALUE = ITYPE##_MIN; \
8183 } \
8184 break; \
8185 \
8186 case FP_SUBNORMAL: \
8187 aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID); \
8188 VALUE = 0; \
8189 break; \
8190 \
8191 default: \
8192 case FP_ZERO: \
8193 VALUE = 0; \
8194 break; \
8195 } \
8196 } \
8197 while (0)
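
/* For example, converting 3.0e38f to int32_t takes the FP_NORMAL arm:
   3.0e38f >= FLOAT_INT_MAX, so IO and IX are raised and the result
   saturates to INT_MAX.  */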
8198
8199 /* 32 bit convert float to signed int truncate towards zero. */
8200 static void
8201 fcvtszs32 (sim_cpu *cpu)
8202 {
8203 unsigned sn = INSTR (9, 5);
8204 unsigned rd = INSTR (4, 0);
8205 /* C converts float to int by truncating toward zero (C99 6.3.1.4). */
8206 float f = aarch64_get_FP_float (cpu, sn);
8207 int32_t value = (int32_t) f;
8208
8209 RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8210
8211 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8212 /* Avoid sign extension to 64 bit. */
8213 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8214 }
8215
8216 /* 64 bit convert float to signed int truncate towards zero. */
8217 static void
8218 fcvtszs (sim_cpu *cpu)
8219 {
8220 unsigned sn = INSTR (9, 5);
8221 unsigned rd = INSTR (4, 0);
8222 float f = aarch64_get_FP_float (cpu, sn);
8223 int64_t value = (int64_t) f;
8224
8225 RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8226
8227 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8228 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8229 }
8230
8231 /* 32 bit convert double to signed int truncate towards zero. */
8232 static void
8233 fcvtszd32 (sim_cpu *cpu)
8234 {
8235 unsigned sn = INSTR (9, 5);
8236 unsigned rd = INSTR (4, 0);
8237 /* C converts double to int by truncating toward zero (C99 6.3.1.4). */
8238 double d = aarch64_get_FP_double (cpu, sn);
8239 int32_t value = (int32_t) d;
8240
8241 RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8242
8243 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8244 /* Avoid sign extension to 64 bit. */
8245 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8246 }
8247
8248 /* 64 bit convert double to signed int truncate towards zero. */
8249 static void
8250 fcvtszd (sim_cpu *cpu)
8251 {
8252 unsigned sn = INSTR (9, 5);
8253 unsigned rd = INSTR (4, 0);
8254 /* C converts double to int by truncating toward zero (C99 6.3.1.4). */
8255 double d = aarch64_get_FP_double (cpu, sn);
8256 int64_t value;
8257
8258 value = (int64_t) d;
8259
8260 RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8261
8262 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8263 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8264 }
8265
8266 static void
8267 do_fcvtzu (sim_cpu *cpu)
8268 {
8269 /* instr[31] = size: 32-bit (0), 64-bit (1)
8270 instr[30,23] = 00111100
8271 instr[22] = type: single (0)/ double (1)
8272 instr[21] = 1 ==> integer form, 0 ==> fixed-point form (scale below)
8273 instr[20,16] = 11001
8274 instr[15,10] = scale (fixed-point form only)
8275 instr[9,5] = Rs
8276 instr[4,0] = Rd. */
8277
8278 unsigned rs = INSTR (9, 5);
8279 unsigned rd = INSTR (4, 0);
8280
8281 NYI_assert (30, 23, 0x3C);
8282 NYI_assert (20, 16, 0x19);
8283
8284 if (INSTR (21, 21) != 1)
8285 /* Convert to fixed point. */
8286 HALT_NYI;
8287
8288 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8289 if (INSTR (31, 31))
8290 {
8291 /* Convert to unsigned 64-bit integer. */
8292 if (INSTR (22, 22))
8293 {
8294 double d = aarch64_get_FP_double (cpu, rs);
8295 uint64_t value = (uint64_t) d;
8296
8297 /* Do not raise an exception if we have reached ULONG_MAX. */
8298 if (value != (1UL << 63))
8299 RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8300
8301 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8302 }
8303 else
8304 {
8305 float f = aarch64_get_FP_float (cpu, rs);
8306 uint64_t value = (uint64_t) f;
8307
8308 /* Do not raise an exception if we have reached ULONG_MAX. */
8309 if (value != (1UL << 63))
8310 RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8311
8312 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8313 }
8314 }
8315 else
8316 {
8317 uint32_t value;
8318
8319 /* Convert to unsigned 32-bit integer. */
8320 if (INSTR (22, 22))
8321 {
8322 double d = aarch64_get_FP_double (cpu, rs);
8323
8324 value = (uint32_t) d;
8325 /* Do not raise an exception if we have reached UINT_MAX. */
8326 if (value != (1UL << 31))
8327 RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8328 }
8329 else
8330 {
8331 float f = aarch64_get_FP_float (cpu, rs);
8332
8333 value = (uint32_t) f;
8334 /* Do not raise an exception if we have reached UINT_MAX. */
8335 if (value != (1UL << 31))
8336 RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8337 }
8338
8339 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8340 }
8341 }
8342
8343 static void
8344 do_UCVTF (sim_cpu *cpu)
8345 {
8346 /* instr[31] = size: 32-bit (0), 64-bit (1)
8347 instr[30,23] = 001 1110 0
8348 instr[22] = type: single (0)/ double (1)
8349 instr[21] = 1 ==> integer form, 0 ==> fixed-point form (scale below)
8350 instr[20,16] = 0 0011
8351 instr[15,10] = scale (fixed-point form only)
8352 instr[9,5] = Rs
8353 instr[4,0] = Rd. */
8354
8355 unsigned rs = INSTR (9, 5);
8356 unsigned rd = INSTR (4, 0);
8357
8358 NYI_assert (30, 23, 0x3C);
8359 NYI_assert (20, 16, 0x03);
8360
8361 if (INSTR (21, 21) != 1)
8362 HALT_NYI;
8363
8364 /* FIXME: Add exception raising. */
8365 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8366 if (INSTR (31, 31))
8367 {
8368 uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8369
8370 if (INSTR (22, 22))
8371 aarch64_set_FP_double (cpu, rd, (double) value);
8372 else
8373 aarch64_set_FP_float (cpu, rd, (float) value);
8374 }
8375 else
8376 {
8377 uint32_t value = aarch64_get_reg_u32 (cpu, rs, NO_SP);
8378
8379 if (INSTR (22, 22))
8380 aarch64_set_FP_double (cpu, rd, (double) value);
8381 else
8382 aarch64_set_FP_float (cpu, rd, (float) value);
8383 }
8384 }
8385
8386 static void
8387 float_vector_move (sim_cpu *cpu)
8388 {
8389 /* instr[31,17] == 100 1111 0101 0111
8390 instr[16] ==> direction 0=> to GR, 1=> from GR
8391 instr[15,10] ==> must be 00 0000, else UNALLOC
8392 instr[9,5] ==> source
8393 instr[4,0] ==> dest. */
8394
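/* This is the FMOV variant that accesses the upper 64 bits of the
   vector register, i.e. FMOV Vd.D[1], Xn and FMOV Xd, Vn.D[1], hence
   the element index of 1 below.  */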
8395 unsigned rn = INSTR (9, 5);
8396 unsigned rd = INSTR (4, 0);
8397
8398 NYI_assert (31, 17, 0x4F57);
8399
8400 if (INSTR (15, 10) != 0)
8401 HALT_UNALLOC;
8402
8403 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8404 if (INSTR (16, 16))
8405 aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8406 else
8407 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8408 }
8409
8410 static void
8411 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8412 {
8413 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
8414	   instr[30]    = 0
8415 instr[29] = S : 0 ==> OK, 1 ==> UNALLOC
8416 instr[28,25] = 1111
8417 instr[24] = 0
8418 instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8419 instr[21] = 1
8420 instr[20,19] = rmode
8421 instr[18,16] = opcode
8422	   instr[15,10] = 00 0000  */
8423
8424 uint32_t rmode_opcode;
8425 uint32_t size_type;
8426 uint32_t type;
8427 uint32_t size;
8428 uint32_t S;
8429
8430 if (INSTR (31, 17) == 0x4F57)
8431 {
8432 float_vector_move (cpu);
8433 return;
8434 }
8435
8436 size = INSTR (31, 31);
8437 S = INSTR (29, 29);
8438 if (S != 0)
8439 HALT_UNALLOC;
8440
8441 type = INSTR (23, 22);
8442 if (type > 1)
8443 HALT_UNALLOC;
8444
8445 rmode_opcode = INSTR (20, 16);
8446 size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d. */
8447
8448 switch (rmode_opcode)
8449 {
8450 case 2: /* SCVTF. */
8451 switch (size_type)
8452 {
8453 case 0: scvtf32 (cpu); return;
8454 case 1: scvtd32 (cpu); return;
8455 case 2: scvtf (cpu); return;
8456 case 3: scvtd (cpu); return;
8457 }
8458
8459 case 6: /* FMOV GR, Vec. */
8460 switch (size_type)
8461 {
8462 case 0: gfmovs (cpu); return;
8463 case 3: gfmovd (cpu); return;
8464 default: HALT_UNALLOC;
8465 }
8466
8467 case 7: /* FMOV vec, GR. */
8468 switch (size_type)
8469 {
8470 case 0: fgmovs (cpu); return;
8471 case 3: fgmovd (cpu); return;
8472 default: HALT_UNALLOC;
8473 }
8474
8475 case 24: /* FCVTZS. */
8476 switch (size_type)
8477 {
8478 case 0: fcvtszs32 (cpu); return;
8479 case 1: fcvtszd32 (cpu); return;
8480 case 2: fcvtszs (cpu); return;
8481 case 3: fcvtszd (cpu); return;
8482 }
8483
8484 case 25: do_fcvtzu (cpu); return;
8485 case 3: do_UCVTF (cpu); return;
8486
8487 case 0: /* FCVTNS. */
8488 case 1: /* FCVTNU. */
8489 case 4: /* FCVTAS. */
8490 case 5: /* FCVTAU. */
8491	    case 8:	/* FCVTPS. */
8492 case 9: /* FCVTPU. */
8493 case 16: /* FCVTMS. */
8494 case 17: /* FCVTMU. */
8495 default:
8496 HALT_NYI;
8497 }
8498 }
8499
8500 static void
8501 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8502 {
8503 uint32_t flags;
8504
8505 /* FIXME: Add exception raising. */
8506 if (isnan (fvalue1) || isnan (fvalue2))
8507 flags = C|V;
8508 else if (isinf (fvalue1) && isinf (fvalue2))
8509 {
8510	      /* Subtracting two infinities may give a NaN.  We only need to compare
8511		 the signs, which we can get from isinf (+1 or -1 under glibc).  */
8512 int result = isinf (fvalue1) - isinf (fvalue2);
8513
8514 if (result == 0)
8515 flags = Z|C;
8516 else if (result < 0)
8517 flags = N;
8518 else /* (result > 0). */
8519 flags = C;
8520 }
8521 else
8522 {
8523 float result = fvalue1 - fvalue2;
8524
8525 if (result == 0.0)
8526 flags = Z|C;
8527 else if (result < 0)
8528 flags = N;
8529 else /* (result > 0). */
8530 flags = C;
8531 }
8532
8533 aarch64_set_CPSR (cpu, flags);
8534 }
8535
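/* Illustrative flag outcomes for the comparison helper above
   (operand values are editorial examples):
     1.0f vs 2.0f  ==>  N    (less than)
     2.0f vs 2.0f  ==>  Z|C  (equal)
     3.0f vs 2.0f  ==>  C    (greater than)
     NaN  vs 2.0f  ==>  C|V  (unordered).  */
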
8536 static void
8537 fcmps (sim_cpu *cpu)
8538 {
8539 unsigned sm = INSTR (20, 16);
8540 unsigned sn = INSTR ( 9, 5);
8541
8542 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8543 float fvalue2 = aarch64_get_FP_float (cpu, sm);
8544
8545 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8546 set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8547 }
8548
8549 /* Float compare to zero -- Invalid Operation exception
8550 only on signaling NaNs. */
8551 static void
8552 fcmpzs (sim_cpu *cpu)
8553 {
8554 unsigned sn = INSTR ( 9, 5);
8555 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8556
8557 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8558 set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8559 }
8560
8561 /* Float compare -- Invalid Operation exception on all NaNs. */
8562 static void
8563 fcmpes (sim_cpu *cpu)
8564 {
8565 unsigned sm = INSTR (20, 16);
8566 unsigned sn = INSTR ( 9, 5);
8567
8568 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8569 float fvalue2 = aarch64_get_FP_float (cpu, sm);
8570
8571 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8572 set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8573 }
8574
8575 /* Float compare to zero -- Invalid Operation exception on all NaNs. */
8576 static void
8577 fcmpzes (sim_cpu *cpu)
8578 {
8579 unsigned sn = INSTR ( 9, 5);
8580 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8581
8582 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8583 set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8584 }
8585
8586 static void
8587 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8588 {
8589 uint32_t flags;
8590
8591 /* FIXME: Add exception raising. */
8592 if (isnan (dval1) || isnan (dval2))
8593 flags = C|V;
8594 else if (isinf (dval1) && isinf (dval2))
8595 {
8596	      /* Subtracting two infinities may give a NaN.  We only need to compare
8597		 the signs, which we can get from isinf (+1 or -1 under glibc).  */
8598 int result = isinf (dval1) - isinf (dval2);
8599
8600 if (result == 0)
8601 flags = Z|C;
8602 else if (result < 0)
8603 flags = N;
8604 else /* (result > 0). */
8605 flags = C;
8606 }
8607 else
8608 {
8609 double result = dval1 - dval2;
8610
8611 if (result == 0.0)
8612 flags = Z|C;
8613 else if (result < 0)
8614 flags = N;
8615 else /* (result > 0). */
8616 flags = C;
8617 }
8618
8619 aarch64_set_CPSR (cpu, flags);
8620 }
8621
8622 /* Double compare -- Invalid Operation exception only on signaling NaNs. */
8623 static void
8624 fcmpd (sim_cpu *cpu)
8625 {
8626 unsigned sm = INSTR (20, 16);
8627 unsigned sn = INSTR ( 9, 5);
8628
8629 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8630 double dvalue2 = aarch64_get_FP_double (cpu, sm);
8631
8632 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8633 set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8634 }
8635
8636 /* Double compare to zero -- Invalid Operation exception
8637 only on signaling NaNs. */
8638 static void
8639 fcmpzd (sim_cpu *cpu)
8640 {
8641 unsigned sn = INSTR ( 9, 5);
8642 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8643
8644 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8645 set_flags_for_double_compare (cpu, dvalue1, 0.0);
8646 }
8647
8648 /* Double compare -- Invalid Operation exception on all NaNs. */
8649 static void
8650 fcmped (sim_cpu *cpu)
8651 {
8652 unsigned sm = INSTR (20, 16);
8653 unsigned sn = INSTR ( 9, 5);
8654
8655 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8656 double dvalue2 = aarch64_get_FP_double (cpu, sm);
8657
8658 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8659 set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8660 }
8661
8662 /* Double compare to zero -- Invalid Operation exception on all NaNs. */
8663 static void
8664 fcmpzed (sim_cpu *cpu)
8665 {
8666 unsigned sn = INSTR ( 9, 5);
8667 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8668
8669 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8670 set_flags_for_double_compare (cpu, dvalue1, 0.0);
8671 }
8672
8673 static void
8674 dexSimpleFPCompare (sim_cpu *cpu)
8675 {
8676 /* assert instr[28,25] == 1111
8677 instr[30:24:21:13,10] = 0011000
8678 instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8679 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
8680	     instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8681 instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8682 instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8683 01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8684 ow ==> UNALLOC */
8685 uint32_t dispatch;
8686 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8687 uint32_t type = INSTR (23, 22);
8688 uint32_t op = INSTR (15, 14);
8689 uint32_t op2_2_0 = INSTR (2, 0);
8690
8691 if (op2_2_0 != 0)
8692 HALT_UNALLOC;
8693
8694 if (M_S != 0)
8695 HALT_UNALLOC;
8696
8697 if (type > 1)
8698 HALT_UNALLOC;
8699
8700 if (op != 0)
8701 HALT_UNALLOC;
8702
8703 /* dispatch on type and top 2 bits of opcode. */
8704 dispatch = (type << 2) | INSTR (4, 3);
8705
8706 switch (dispatch)
8707 {
8708 case 0: fcmps (cpu); return;
8709 case 1: fcmpzs (cpu); return;
8710 case 2: fcmpes (cpu); return;
8711 case 3: fcmpzes (cpu); return;
8712 case 4: fcmpd (cpu); return;
8713 case 5: fcmpzd (cpu); return;
8714 case 6: fcmped (cpu); return;
8715 case 7: fcmpzed (cpu); return;
8716 }
8717 }
8718
8719 static void
8720 do_scalar_FADDP (sim_cpu *cpu)
8721 {
8722 /* instr [31,23] = 0111 1110 0
8723 instr [22] = single(0)/double(1)
8724 instr [21,10] = 11 0000 1101 10
8725 instr [9,5] = Fn
8726 instr [4,0] = Fd. */
8727
8728 unsigned Fn = INSTR (9, 5);
8729 unsigned Fd = INSTR (4, 0);
8730
8731 NYI_assert (31, 23, 0x0FC);
8732 NYI_assert (21, 10, 0xC36);
8733
8734 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8735 if (INSTR (22, 22))
8736 {
8737 double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8738 double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8739
8740 aarch64_set_FP_double (cpu, Fd, val1 + val2);
8741 }
8742 else
8743 {
8744 float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8745 float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8746
8747 aarch64_set_FP_float (cpu, Fd, val1 + val2);
8748 }
8749 }
8750
8751 /* Floating point absolute difference. */
8752
8753 static void
8754 do_scalar_FABD (sim_cpu *cpu)
8755 {
8756 /* instr [31,23] = 0111 1110 1
8757 instr [22] = float(0)/double(1)
8758 instr [21] = 1
8759 instr [20,16] = Rm
8760 instr [15,10] = 1101 01
8761 instr [9, 5] = Rn
8762 instr [4, 0] = Rd. */
8763
8764 unsigned rm = INSTR (20, 16);
8765 unsigned rn = INSTR (9, 5);
8766 unsigned rd = INSTR (4, 0);
8767
8768 NYI_assert (31, 23, 0x0FD);
8769 NYI_assert (21, 21, 1);
8770 NYI_assert (15, 10, 0x35);
8771
8772 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8773 if (INSTR (22, 22))
8774 aarch64_set_FP_double (cpu, rd,
8775 fabs (aarch64_get_FP_double (cpu, rn)
8776 - aarch64_get_FP_double (cpu, rm)));
8777 else
8778 aarch64_set_FP_float (cpu, rd,
8779 fabsf (aarch64_get_FP_float (cpu, rn)
8780 - aarch64_get_FP_float (cpu, rm)));
8781 }
8782
8783 static void
8784 do_scalar_CMGT (sim_cpu *cpu)
8785 {
8786 /* instr [31,21] = 0101 1110 111
8787 instr [20,16] = Rm
8788 instr [15,10] = 00 1101
8789 instr [9, 5] = Rn
8790 instr [4, 0] = Rd. */
8791
8792 unsigned rm = INSTR (20, 16);
8793 unsigned rn = INSTR (9, 5);
8794 unsigned rd = INSTR (4, 0);
8795
8796 NYI_assert (31, 21, 0x2F7);
8797 NYI_assert (15, 10, 0x0D);
8798
8799 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8800 aarch64_set_vec_u64 (cpu, rd, 0,
8801 aarch64_get_vec_u64 (cpu, rn, 0) >
8802 aarch64_get_vec_u64 (cpu, rm, 0) ? -1L : 0L);
8803 }
8804
8805 static void
8806 do_scalar_USHR (sim_cpu *cpu)
8807 {
8808 /* instr [31,23] = 0111 1111 0
8809 instr [22,16] = shift amount
8810 instr [15,10] = 0000 01
8811 instr [9, 5] = Rn
8812 instr [4, 0] = Rd. */
8813
8814 unsigned amount = 128 - INSTR (22, 16);
8815 unsigned rn = INSTR (9, 5);
8816 unsigned rd = INSTR (4, 0);
8817
8818 NYI_assert (31, 23, 0x0FE);
8819 NYI_assert (15, 10, 0x01);
8820
8821 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8822 aarch64_set_vec_u64 (cpu, rd, 0,
8823 aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
8824 }
8825
8826 static void
8827 do_scalar_SSHL (sim_cpu *cpu)
8828 {
8829 /* instr [31,21] = 0101 1110 111
8830 instr [20,16] = Rm
8831 instr [15,10] = 0100 01
8832 instr [9, 5] = Rn
8833 instr [4, 0] = Rd. */
8834
8835 unsigned rm = INSTR (20, 16);
8836 unsigned rn = INSTR (9, 5);
8837 unsigned rd = INSTR (4, 0);
8838 signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
8839
8840 NYI_assert (31, 21, 0x2F7);
8841 NYI_assert (15, 10, 0x11);
8842
8843 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8844 if (shift >= 0)
8845 aarch64_set_vec_s64 (cpu, rd, 0,
8846 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8847 else
8848 aarch64_set_vec_s64 (cpu, rd, 0,
8849 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8850 }
8851
8852 static void
8853 do_scalar_shift (sim_cpu *cpu)
8854 {
8855 /* instr [31,23] = 0101 1111 0
8856 instr [22,16] = shift amount
8857 instr [15,10] = 0101 01 [SHL]
8858 instr [15,10] = 0000 01 [SSHR]
8859 instr [9, 5] = Rn
8860 instr [4, 0] = Rd. */
8861
8862 unsigned rn = INSTR (9, 5);
8863 unsigned rd = INSTR (4, 0);
8864 unsigned amount;
8865
8866 NYI_assert (31, 23, 0x0BE);
8867
8868 if (INSTR (22, 22) == 0)
8869 HALT_UNALLOC;
8870
8871 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8872 switch (INSTR (15, 10))
8873 {
8874 case 0x01: /* SSHR */
8875 amount = 128 - INSTR (22, 16);
8876 aarch64_set_vec_s64 (cpu, rd, 0,
8877 aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
8878 return;
8879 case 0x15: /* SHL */
8880 amount = INSTR (22, 16) - 64;
8881 aarch64_set_vec_u64 (cpu, rd, 0,
8882 aarch64_get_vec_u64 (cpu, rn, 0) << amount);
8883 return;
8884 default:
8885 HALT_NYI;
8886 }
8887 }
8888
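/* Worked decode example for do_scalar_shift above (editorial): for
   SSHR D0, D1, #4 the immh:immb field instr[22,16] holds
   128 - 4 = 0x7C, so the code recovers amount = 128 - 0x7C = 4.
   For SHL D0, D1, #4 the field holds 64 + 4 = 0x44 and
   amount = 0x44 - 64 = 4.  */
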
8889 /* FCMEQ FCMGT FCMGE. */
8890 static void
8891 do_scalar_FCM (sim_cpu *cpu)
8892 {
8893 /* instr [31,30] = 01
8894 instr [29] = U
8895 instr [28,24] = 1 1110
8896 instr [23] = E
8897 instr [22] = size
8898 instr [21] = 1
8899 instr [20,16] = Rm
8900 instr [15,12] = 1110
8901 instr [11] = AC
8902 instr [10] = 1
8903 instr [9, 5] = Rn
8904 instr [4, 0] = Rd. */
8905
8906 unsigned rm = INSTR (20, 16);
8907 unsigned rn = INSTR (9, 5);
8908 unsigned rd = INSTR (4, 0);
8909 unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
8910 unsigned result;
8911 float val1;
8912 float val2;
8913
8914 NYI_assert (31, 30, 1);
8915 NYI_assert (28, 24, 0x1E);
8916 NYI_assert (21, 21, 1);
8917 NYI_assert (15, 12, 0xE);
8918 NYI_assert (10, 10, 1);
8919
8920 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8921 if (INSTR (22, 22))
8922 {
8923 double val1 = aarch64_get_FP_double (cpu, rn);
8924 double val2 = aarch64_get_FP_double (cpu, rm);
8925
8926 switch (EUac)
8927 {
8928 case 0: /* 000 */
8929 result = val1 == val2;
8930 break;
8931
8932 case 3: /* 011 */
8933 val1 = fabs (val1);
8934 val2 = fabs (val2);
8935 /* Fall through. */
8936 case 2: /* 010 */
8937 result = val1 >= val2;
8938 break;
8939
8940 case 7: /* 111 */
8941 val1 = fabs (val1);
8942 val2 = fabs (val2);
8943 /* Fall through. */
8944 case 6: /* 110 */
8945 result = val1 > val2;
8946 break;
8947
8948 default:
8949 HALT_UNALLOC;
8950 }
8951
8952 aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
8953 return;
8954 }
8955
8956 val1 = aarch64_get_FP_float (cpu, rn);
8957 val2 = aarch64_get_FP_float (cpu, rm);
8958
8959 switch (EUac)
8960 {
8961 case 0: /* 000 */
8962 result = val1 == val2;
8963 break;
8964
8965 case 3: /* 011 */
8966 val1 = fabsf (val1);
8967 val2 = fabsf (val2);
8968 /* Fall through. */
8969 case 2: /* 010 */
8970 result = val1 >= val2;
8971 break;
8972
8973 case 7: /* 111 */
8974 val1 = fabsf (val1);
8975 val2 = fabsf (val2);
8976 /* Fall through. */
8977 case 6: /* 110 */
8978 result = val1 > val2;
8979 break;
8980
8981 default:
8982 HALT_UNALLOC;
8983 }
8984
8985 aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
8986 }
8987
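/* The EUac value computed in do_scalar_FCM above maps onto the
   scalar mnemonics as follows (E = instr[23], U = instr[29],
   ac = instr[11]):
     EUac 0 (E=0 U=0 ac=0)  ==>  FCMEQ
     EUac 2 (E=0 U=1 ac=0)  ==>  FCMGE
     EUac 3 (E=0 U=1 ac=1)  ==>  FACGE
     EUac 6 (E=1 U=1 ac=0)  ==>  FCMGT
     EUac 7 (E=1 U=1 ac=1)  ==>  FACGT.  */
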
8988 /* An alias of DUP. */
8989 static void
8990 do_scalar_MOV (sim_cpu *cpu)
8991 {
8992 /* instr [31,21] = 0101 1110 000
8993 instr [20,16] = imm5
8994 instr [15,10] = 0000 01
8995 instr [9, 5] = Rn
8996 instr [4, 0] = Rd. */
8997
8998 unsigned rn = INSTR (9, 5);
8999 unsigned rd = INSTR (4, 0);
9000 unsigned index;
9001
9002 NYI_assert (31, 21, 0x2F0);
9003 NYI_assert (15, 10, 0x01);
9004
9005 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9006 if (INSTR (16, 16))
9007 {
9008 /* 8-bit. */
9009 index = INSTR (20, 17);
9010 aarch64_set_vec_u8
9011 (cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
9012 }
9013 else if (INSTR (17, 17))
9014 {
9015 /* 16-bit. */
9016 index = INSTR (20, 18);
9017 aarch64_set_vec_u16
9018 (cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
9019 }
9020 else if (INSTR (18, 18))
9021 {
9022 /* 32-bit. */
9023 index = INSTR (20, 19);
9024 aarch64_set_vec_u32
9025 (cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
9026 }
9027 else if (INSTR (19, 19))
9028 {
9029 /* 64-bit. */
9030 index = INSTR (20, 20);
9031 aarch64_set_vec_u64
9032 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
9033 }
9034 else
9035 HALT_UNALLOC;
9036 }
9037
9038 static void
9039 do_scalar_NEG (sim_cpu *cpu)
9040 {
9041 /* instr [31,10] = 0111 1110 1110 0000 1011 10
9042 instr [9, 5] = Rn
9043 instr [4, 0] = Rd. */
9044
9045 unsigned rn = INSTR (9, 5);
9046 unsigned rd = INSTR (4, 0);
9047
9048 NYI_assert (31, 10, 0x1FB82E);
9049
9050 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9051 aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9052 }
9053
9054 static void
9055 do_scalar_USHL (sim_cpu *cpu)
9056 {
9057 /* instr [31,21] = 0111 1110 111
9058 instr [20,16] = Rm
9059 instr [15,10] = 0100 01
9060 instr [9, 5] = Rn
9061 instr [4, 0] = Rd. */
9062
9063 unsigned rm = INSTR (20, 16);
9064 unsigned rn = INSTR (9, 5);
9065 unsigned rd = INSTR (4, 0);
9066 signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9067
9068 NYI_assert (31, 21, 0x3F7);
9069 NYI_assert (15, 10, 0x11);
9070
9071 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9072 if (shift >= 0)
9073 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9074 else
9075 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9076 }
9077
9078 static void
9079 do_double_add (sim_cpu *cpu)
9080 {
9081 /* instr [31,21] = 0101 1110 111
9082 instr [20,16] = Fn
9083 instr [15,10] = 1000 01
9084 instr [9,5] = Fm
9085 instr [4,0] = Fd. */
9086 unsigned Fd;
9087 unsigned Fm;
9088 unsigned Fn;
9089 double val1;
9090 double val2;
9091
9092 NYI_assert (31, 21, 0x2F7);
9093 NYI_assert (15, 10, 0x21);
9094
9095 Fd = INSTR (4, 0);
9096 Fm = INSTR (9, 5);
9097 Fn = INSTR (20, 16);
9098
9099 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9100 val1 = aarch64_get_FP_double (cpu, Fm);
9101 val2 = aarch64_get_FP_double (cpu, Fn);
9102
9103 aarch64_set_FP_double (cpu, Fd, val1 + val2);
9104 }
9105
9106 static void
9107 do_scalar_UCVTF (sim_cpu *cpu)
9108 {
9109 /* instr [31,23] = 0111 1110 0
9110 instr [22] = single(0)/double(1)
9111 instr [21,10] = 10 0001 1101 10
9112 instr [9,5] = rn
9113 instr [4,0] = rd. */
9114
9115 unsigned rn = INSTR (9, 5);
9116 unsigned rd = INSTR (4, 0);
9117
9118 NYI_assert (31, 23, 0x0FC);
9119 NYI_assert (21, 10, 0x876);
9120
9121 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9122 if (INSTR (22, 22))
9123 {
9124 uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9125
9126 aarch64_set_vec_double (cpu, rd, 0, (double) val);
9127 }
9128 else
9129 {
9130 uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9131
9132 aarch64_set_vec_float (cpu, rd, 0, (float) val);
9133 }
9134 }
9135
9136 static void
9137 do_scalar_vec (sim_cpu *cpu)
9138 {
9139 /* instr [30] = 1. */
9140 /* instr [28,25] = 1111. */
9141 switch (INSTR (31, 23))
9142 {
9143 case 0xBC:
9144 switch (INSTR (15, 10))
9145 {
9146 case 0x01: do_scalar_MOV (cpu); return;
9147 case 0x39: do_scalar_FCM (cpu); return;
9148 case 0x3B: do_scalar_FCM (cpu); return;
9149 }
9150 break;
9151
9152 case 0xBE: do_scalar_shift (cpu); return;
9153
9154 case 0xFC:
9155 switch (INSTR (15, 10))
9156 {
9157 case 0x36:
9158 switch (INSTR (21, 16))
9159 {
9160 case 0x30: do_scalar_FADDP (cpu); return;
9161 case 0x21: do_scalar_UCVTF (cpu); return;
9162 }
9163 HALT_NYI;
9164 case 0x39: do_scalar_FCM (cpu); return;
9165 case 0x3B: do_scalar_FCM (cpu); return;
9166 }
9167 break;
9168
9169 case 0xFD:
9170 switch (INSTR (15, 10))
9171 {
9172 case 0x0D: do_scalar_CMGT (cpu); return;
9173 case 0x11: do_scalar_USHL (cpu); return;
9174 case 0x2E: do_scalar_NEG (cpu); return;
9175 case 0x35: do_scalar_FABD (cpu); return;
9176 case 0x39: do_scalar_FCM (cpu); return;
9177 case 0x3B: do_scalar_FCM (cpu); return;
9178 default:
9179 HALT_NYI;
9180 }
9181
9182 case 0xFE: do_scalar_USHR (cpu); return;
9183
9184 case 0xBD:
9185 switch (INSTR (15, 10))
9186 {
9187 case 0x21: do_double_add (cpu); return;
9188 case 0x11: do_scalar_SSHL (cpu); return;
9189 default:
9190 HALT_NYI;
9191 }
9192
9193 default:
9194 HALT_NYI;
9195 }
9196 }
9197
9198 static void
9199 dexAdvSIMD1 (sim_cpu *cpu)
9200 {
9201 /* instr [28,25] = 1 111. */
9202
9203 /* We are currently only interested in the basic
9204 scalar fp routines which all have bit 30 = 0. */
9205 if (INSTR (30, 30))
9206 do_scalar_vec (cpu);
9207
9208 /* instr[24] is set for FP data processing 3-source and clear for
9209 all other basic scalar fp instruction groups. */
9210 else if (INSTR (24, 24))
9211 dexSimpleFPDataProc3Source (cpu);
9212
9213 /* instr[21] is clear for floating <-> fixed conversions and set for
9214 all other basic scalar fp instruction groups. */
9215 else if (!INSTR (21, 21))
9216 dexSimpleFPFixedConvert (cpu);
9217
9218 /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9219 11 ==> cond select, 00 ==> other. */
9220 else
9221 switch (INSTR (11, 10))
9222 {
9223 case 1: dexSimpleFPCondCompare (cpu); return;
9224 case 2: dexSimpleFPDataProc2Source (cpu); return;
9225 case 3: dexSimpleFPCondSelect (cpu); return;
9226
9227 default:
9228 /* Now an ordered cascade of tests.
9229 FP immediate has instr [12] == 1.
9230 FP compare has instr [13] == 1.
9231 FP Data Proc 1 Source has instr [14] == 1.
9232 FP floating <--> integer conversions has instr [15] == 0. */
9233 if (INSTR (12, 12))
9234 dexSimpleFPImmediate (cpu);
9235
9236 else if (INSTR (13, 13))
9237 dexSimpleFPCompare (cpu);
9238
9239 else if (INSTR (14, 14))
9240 dexSimpleFPDataProc1Source (cpu);
9241
9242 else if (!INSTR (15, 15))
9243 dexSimpleFPIntegerConvert (cpu);
9244
9245 else
9246 /* If we get here then instr[15] == 1 which means UNALLOC. */
9247 HALT_UNALLOC;
9248 }
9249 }
9250
9251 /* PC relative addressing. */
9252
9253 static void
9254 pcadr (sim_cpu *cpu)
9255 {
9256 /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9257 instr[30,29] = immlo
9258 instr[23,5] = immhi. */
9259 uint64_t address;
9260 unsigned rd = INSTR (4, 0);
9261 uint32_t isPage = INSTR (31, 31);
9262	  union { int64_t s64; uint64_t u64; } imm;
9263 uint64_t offset;
9264
9265 imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9266 offset = imm.u64;
9267 offset = (offset << 2) | INSTR (30, 29);
9268
9269 address = aarch64_get_PC (cpu);
9270
9271 if (isPage)
9272 {
9273 offset <<= 12;
9274 address &= ~0xfff;
9275 }
9276
9277 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9278 aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9279 }
9280
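/* Worked example for pcadr above (editorial values): with
   PC = 0x400653 and an assembled immhi:immlo of 1, an ADR yields
   0x400653 + 1 = 0x400654, while an ADRP with the same encoded
   immediate yields (0x400653 & ~0xfff) + (1 << 12) = 0x401000,
   since ADRP clears the low 12 bits of the PC and shifts the
   immediate left by 12.  */
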
9281 /* Specific decode and execute for group Data Processing Immediate. */
9282
9283 static void
9284 dexPCRelAddressing (sim_cpu *cpu)
9285 {
9286 /* assert instr[28,24] = 10000. */
9287 pcadr (cpu);
9288 }
9289
9290 /* Immediate logical.
9291 The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9292 16, 32 or 64 bit sequence pulled out at decode and possibly
9293	   inverting it.
9294	
9295	   N.B. the output register (dest) can normally be Xn or SP;
9296	   the exception occurs for flag setting instructions which may
9297 only use Xn for the output (dest). The input register can
9298 never be SP. */
9299
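/* Worked decode example (editorial): for a 32 bit operation with
   N = 0, immr = 0 and imms = 0b111100, the expansion used to build
   LITable selects a 2 bit element containing 0b01, rotates it right
   by 0 and replicates it 16 times, giving bimm32 = 0x55555555.  */
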
9300 /* 32 bit and immediate. */
9301 static void
9302 and32 (sim_cpu *cpu, uint32_t bimm)
9303 {
9304 unsigned rn = INSTR (9, 5);
9305 unsigned rd = INSTR (4, 0);
9306
9307 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9308 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9309 aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9310 }
9311
9312 /* 64 bit and immediate. */
9313 static void
9314 and64 (sim_cpu *cpu, uint64_t bimm)
9315 {
9316 unsigned rn = INSTR (9, 5);
9317 unsigned rd = INSTR (4, 0);
9318
9319 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9320 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9321 aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9322 }
9323
9324 /* 32 bit and immediate set flags. */
9325 static void
9326 ands32 (sim_cpu *cpu, uint32_t bimm)
9327 {
9328 unsigned rn = INSTR (9, 5);
9329 unsigned rd = INSTR (4, 0);
9330
9331 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9332 uint32_t value2 = bimm;
9333
9334 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9335 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9336 set_flags_for_binop32 (cpu, value1 & value2);
9337 }
9338
9339 /* 64 bit and immediate set flags. */
9340 static void
9341 ands64 (sim_cpu *cpu, uint64_t bimm)
9342 {
9343 unsigned rn = INSTR (9, 5);
9344 unsigned rd = INSTR (4, 0);
9345
9346 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9347 uint64_t value2 = bimm;
9348
9349 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9350 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9351 set_flags_for_binop64 (cpu, value1 & value2);
9352 }
9353
9354 /* 32 bit exclusive or immediate. */
9355 static void
9356 eor32 (sim_cpu *cpu, uint32_t bimm)
9357 {
9358 unsigned rn = INSTR (9, 5);
9359 unsigned rd = INSTR (4, 0);
9360
9361 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9362 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9363 aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9364 }
9365
9366 /* 64 bit exclusive or immediate. */
9367 static void
9368 eor64 (sim_cpu *cpu, uint64_t bimm)
9369 {
9370 unsigned rn = INSTR (9, 5);
9371 unsigned rd = INSTR (4, 0);
9372
9373 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9374 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9375 aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9376 }
9377
9378 /* 32 bit or immediate. */
9379 static void
9380 orr32 (sim_cpu *cpu, uint32_t bimm)
9381 {
9382 unsigned rn = INSTR (9, 5);
9383 unsigned rd = INSTR (4, 0);
9384
9385 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9386 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9387 aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9388 }
9389
9390 /* 64 bit or immediate. */
9391 static void
9392 orr64 (sim_cpu *cpu, uint64_t bimm)
9393 {
9394 unsigned rn = INSTR (9, 5);
9395 unsigned rd = INSTR (4, 0);
9396
9397 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9398 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9399 aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9400 }
9401
9402 /* Logical shifted register.
9403 These allow an optional LSL, ASR, LSR or ROR to the second source
9404 register with a count up to the register bit count.
9405	   N.B. register args may not be SP.  */
9406
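/* For example (editorial), AND X0, X1, X2, LSL #4 reaches
   and64_shift below with shift = LSL and count = 4 and computes
   X0 = X1 & (X2 << 4).  */
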
9407 /* 32 bit AND shifted register. */
9408 static void
9409 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9410 {
9411 unsigned rm = INSTR (20, 16);
9412 unsigned rn = INSTR (9, 5);
9413 unsigned rd = INSTR (4, 0);
9414
9415 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9416 aarch64_set_reg_u64
9417 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9418 & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9419 }
9420
9421 /* 64 bit AND shifted register. */
9422 static void
9423 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9424 {
9425 unsigned rm = INSTR (20, 16);
9426 unsigned rn = INSTR (9, 5);
9427 unsigned rd = INSTR (4, 0);
9428
9429 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9430 aarch64_set_reg_u64
9431 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9432 & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9433 }
9434
9435 /* 32 bit AND shifted register setting flags. */
9436 static void
9437 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9438 {
9439 unsigned rm = INSTR (20, 16);
9440 unsigned rn = INSTR (9, 5);
9441 unsigned rd = INSTR (4, 0);
9442
9443 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9444 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9445 shift, count);
9446
9447 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9448 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9449 set_flags_for_binop32 (cpu, value1 & value2);
9450 }
9451
9452 /* 64 bit AND shifted register setting flags. */
9453 static void
9454 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9455 {
9456 unsigned rm = INSTR (20, 16);
9457 unsigned rn = INSTR (9, 5);
9458 unsigned rd = INSTR (4, 0);
9459
9460 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9461 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9462 shift, count);
9463
9464 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9465 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9466 set_flags_for_binop64 (cpu, value1 & value2);
9467 }
9468
9469 /* 32 bit BIC shifted register. */
9470 static void
9471 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9472 {
9473 unsigned rm = INSTR (20, 16);
9474 unsigned rn = INSTR (9, 5);
9475 unsigned rd = INSTR (4, 0);
9476
9477 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9478 aarch64_set_reg_u64
9479 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9480 & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9481 }
9482
9483 /* 64 bit BIC shifted register. */
9484 static void
9485 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9486 {
9487 unsigned rm = INSTR (20, 16);
9488 unsigned rn = INSTR (9, 5);
9489 unsigned rd = INSTR (4, 0);
9490
9491 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9492 aarch64_set_reg_u64
9493 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9494 & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9495 }
9496
9497 /* 32 bit BIC shifted register setting flags. */
9498 static void
9499 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9500 {
9501 unsigned rm = INSTR (20, 16);
9502 unsigned rn = INSTR (9, 5);
9503 unsigned rd = INSTR (4, 0);
9504
9505 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9506 uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9507 shift, count);
9508
9509 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9510 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9511 set_flags_for_binop32 (cpu, value1 & value2);
9512 }
9513
9514 /* 64 bit BIC shifted register setting flags. */
9515 static void
9516 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9517 {
9518 unsigned rm = INSTR (20, 16);
9519 unsigned rn = INSTR (9, 5);
9520 unsigned rd = INSTR (4, 0);
9521
9522 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9523 uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9524 shift, count);
9525
9526 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9527 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9528 set_flags_for_binop64 (cpu, value1 & value2);
9529 }
9530
9531 /* 32 bit EON shifted register. */
9532 static void
9533 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9534 {
9535 unsigned rm = INSTR (20, 16);
9536 unsigned rn = INSTR (9, 5);
9537 unsigned rd = INSTR (4, 0);
9538
9539 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9540 aarch64_set_reg_u64
9541 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9542 ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9543 }
9544
9545 /* 64 bit EON shifted register. */
9546 static void
9547 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9548 {
9549 unsigned rm = INSTR (20, 16);
9550 unsigned rn = INSTR (9, 5);
9551 unsigned rd = INSTR (4, 0);
9552
9553 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9554 aarch64_set_reg_u64
9555 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9556 ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9557 }
9558
9559 /* 32 bit EOR shifted register. */
9560 static void
9561 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9562 {
9563 unsigned rm = INSTR (20, 16);
9564 unsigned rn = INSTR (9, 5);
9565 unsigned rd = INSTR (4, 0);
9566
9567 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9568 aarch64_set_reg_u64
9569 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9570 ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9571 }
9572
9573 /* 64 bit EOR shifted register. */
9574 static void
9575 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9576 {
9577 unsigned rm = INSTR (20, 16);
9578 unsigned rn = INSTR (9, 5);
9579 unsigned rd = INSTR (4, 0);
9580
9581 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9582 aarch64_set_reg_u64
9583 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9584 ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9585 }
9586
9587 /* 32 bit ORR shifted register. */
9588 static void
9589 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9590 {
9591 unsigned rm = INSTR (20, 16);
9592 unsigned rn = INSTR (9, 5);
9593 unsigned rd = INSTR (4, 0);
9594
9595 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9596 aarch64_set_reg_u64
9597 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9598 | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9599 }
9600
9601 /* 64 bit ORR shifted register. */
9602 static void
9603 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9604 {
9605 unsigned rm = INSTR (20, 16);
9606 unsigned rn = INSTR (9, 5);
9607 unsigned rd = INSTR (4, 0);
9608
9609 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9610 aarch64_set_reg_u64
9611 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9612 | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9613 }
9614
9615 /* 32 bit ORN shifted register. */
9616 static void
9617 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9618 {
9619 unsigned rm = INSTR (20, 16);
9620 unsigned rn = INSTR (9, 5);
9621 unsigned rd = INSTR (4, 0);
9622
9623 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9624 aarch64_set_reg_u64
9625 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9626 | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9627 }
9628
9629 /* 64 bit ORN shifted register. */
9630 static void
9631 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9632 {
9633 unsigned rm = INSTR (20, 16);
9634 unsigned rn = INSTR (9, 5);
9635 unsigned rd = INSTR (4, 0);
9636
9637 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9638 aarch64_set_reg_u64
9639 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9640 | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9641 }
9642
9643 static void
9644 dexLogicalImmediate (sim_cpu *cpu)
9645 {
9646	  /* assert instr[28,23] = 100100
9647 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9648 instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9649 instr[22] = N : used to construct immediate mask
9650 instr[21,16] = immr
9651 instr[15,10] = imms
9652 instr[9,5] = Rn
9653 instr[4,0] = Rd */
9654
9655 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
9656 uint32_t size = INSTR (31, 31);
9657 uint32_t N = INSTR (22, 22);
9658	  /* uint32_t immr = INSTR (21, 16); */
9659	  /* uint32_t imms = INSTR (15, 10); */
9660 uint32_t index = INSTR (22, 10);
9661 uint64_t bimm64 = LITable [index];
9662 uint32_t dispatch = INSTR (30, 29);
9663
9664 if (~size & N)
9665 HALT_UNALLOC;
9666
9667 if (!bimm64)
9668 HALT_UNALLOC;
9669
9670 if (size == 0)
9671 {
9672 uint32_t bimm = (uint32_t) bimm64;
9673
9674 switch (dispatch)
9675 {
9676 case 0: and32 (cpu, bimm); return;
9677 case 1: orr32 (cpu, bimm); return;
9678 case 2: eor32 (cpu, bimm); return;
9679 case 3: ands32 (cpu, bimm); return;
9680 }
9681 }
9682 else
9683 {
9684 switch (dispatch)
9685 {
9686 case 0: and64 (cpu, bimm64); return;
9687 case 1: orr64 (cpu, bimm64); return;
9688 case 2: eor64 (cpu, bimm64); return;
9689 case 3: ands64 (cpu, bimm64); return;
9690 }
9691 }
9692 HALT_UNALLOC;
9693 }
9694
9695 /* Immediate move.
9696	   The uimm argument is a 16 bit value to be inserted into the
9697	   target register.  The pos argument locates the 16 bit word in the
9698	   dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
9699	   3} for 64 bit.
9700	   N.B. the register arg may not be SP so it should be
9701	   accessed using the setGZRegisterXXX accessors.  */
9702
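/* Worked example (editorial): the 32 bit constant 0x12345678 can be
   materialised as
     MOVZ W0, #0x5678            ==>  movz32 (cpu, 0x5678, 0)
     MOVK W0, #0x1234, LSL #16   ==>  movk32 (cpu, 0x1234, 1)
   leaving W0 = 0x12345678.  MOVN instead writes the bitwise NOT of
   the shifted immediate.  */
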
9703 /* 32 bit move 16 bit immediate zero remaining shorts. */
9704 static void
9705 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9706 {
9707 unsigned rd = INSTR (4, 0);
9708
9709 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9710 aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9711 }
9712
9713 /* 64 bit move 16 bit immediate zero remaining shorts. */
9714 static void
9715 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9716 {
9717 unsigned rd = INSTR (4, 0);
9718
9719 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9720 aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9721 }
9722
9723 /* 32 bit move 16 bit immediate negated. */
9724 static void
9725 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9726 {
9727 unsigned rd = INSTR (4, 0);
9728
9729 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9730 aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
9731 }
9732
9733 /* 64 bit move 16 bit immediate negated. */
9734 static void
9735 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9736 {
9737 unsigned rd = INSTR (4, 0);
9738
9739 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9740 aarch64_set_reg_u64
9741 (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
9742 ^ 0xffffffffffffffffULL));
9743 }
9744
9745 /* 32 bit move 16 bit immediate keep remaining shorts. */
9746 static void
9747 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9748 {
9749 unsigned rd = INSTR (4, 0);
9750 uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9751 uint32_t value = val << (pos * 16);
9752 uint32_t mask = ~(0xffffU << (pos * 16));
9753
9754 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9755 aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9756 }
9757
9758	/* 64 bit move 16 bit immediate keep remaining shorts. */
9759 static void
9760 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9761 {
9762 unsigned rd = INSTR (4, 0);
9763 uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
9764 uint64_t value = (uint64_t) val << (pos * 16);
9765 uint64_t mask = ~(0xffffULL << (pos * 16));
9766
9767 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9768 aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9769 }
9770
9771 static void
9772 dexMoveWideImmediate (sim_cpu *cpu)
9773 {
9774 /* assert instr[28:23] = 100101
9775 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9776 instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
9777 instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
9778 instr[20,5] = uimm16
9779 instr[4,0] = Rd */
9780
9781 /* N.B. the (multiple of 16) shift is applied by the called routine,
9782 we just pass the multiplier. */
9783
9784 uint32_t imm;
9785 uint32_t size = INSTR (31, 31);
9786 uint32_t op = INSTR (30, 29);
9787 uint32_t shift = INSTR (22, 21);
9788
9789 /* 32 bit can only shift 0 or 1 lot of 16.
9790	     Anything else is an unallocated instruction. */
9791 if (size == 0 && (shift > 1))
9792 HALT_UNALLOC;
9793
9794 if (op == 1)
9795 HALT_UNALLOC;
9796
9797 imm = INSTR (20, 5);
9798
9799 if (size == 0)
9800 {
9801 if (op == 0)
9802 movn32 (cpu, imm, shift);
9803 else if (op == 2)
9804 movz32 (cpu, imm, shift);
9805 else
9806 movk32 (cpu, imm, shift);
9807 }
9808 else
9809 {
9810 if (op == 0)
9811 movn64 (cpu, imm, shift);
9812 else if (op == 2)
9813 movz64 (cpu, imm, shift);
9814 else
9815 movk64 (cpu, imm, shift);
9816 }
9817 }
9818
9819 /* Bitfield operations.
9820 These take a pair of bit positions r and s which are in {0..31}
9821 or {0..63} depending on the instruction word size.
9822 N.B register args may not be SP. */
9823
9824	/* OK, we start with ubfm which just needs to pick
9825	   some bits out of the source, zero the rest, and write
9826	   the result to dest.  Just two logical shifts are needed. */
9827
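/* Worked example (editorial): LSR W0, W1, #8 is an alias of
   UBFM W0, W1, #8, #31, i.e. ubfm32 below with r = 8 and s = 31.
   Since r <= s the code computes
     value = (value << (31 - 31)) >> (31 + 8 - 31)  ==>  value >> 8.  */
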
9828 /* 32 bit bitfield move, left and right of affected zeroed
9829 if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9830 static void
9831 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9832 {
9833 unsigned rd;
9834 unsigned rn = INSTR (9, 5);
9835 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9836
9837 /* Pick either s+1-r or s+1 consecutive bits out of the original word. */
9838 if (r <= s)
9839 {
9840 /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9841 We want only bits s:xxx:r at the bottom of the word
9842 so we LSL bit s up to bit 31 i.e. by 31 - s
9843 and then we LSR to bring bit 31 down to bit s - r
9844 i.e. by 31 + r - s. */
9845 value <<= 31 - s;
9846 value >>= 31 + r - s;
9847 }
9848 else
9849 {
9850 /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
9851	 We want only bits s:xxx:0 starting at bit 31-(r-1)
9852 so we LSL bit s up to bit 31 i.e. by 31 - s
9853 and then we LSL to bring bit 31 down to 31-(r-1)+s
9854 i.e. by r - (s + 1). */
9855 value <<= 31 - s;
9856 value >>= r - (s + 1);
9857 }
9858
9859 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9860 rd = INSTR (4, 0);
9861 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9862 }
9863
9864 /* 64 bit bitfield move, left and right of affected zeroed
9865 if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9866 static void
9867 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9868 {
9869 unsigned rd;
9870 unsigned rn = INSTR (9, 5);
9871 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9872
9873 if (r <= s)
9874 {
9875 /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9876 We want only bits s:xxx:r at the bottom of the word.
9877 So we LSL bit s up to bit 63 i.e. by 63 - s
9878 and then we LSR to bring bit 63 down to bit s - r
9879 i.e. by 63 + r - s. */
9880 value <<= 63 - s;
9881 value >>= 63 + r - s;
9882 }
9883 else
9884 {
9885 /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
9886	 We want only bits s:xxx:0 starting at bit 63-(r-1).
9887 So we LSL bit s up to bit 63 i.e. by 63 - s
9888 and then we LSL to bring bit 63 down to 63-(r-1)+s
9889 i.e. by r - (s + 1). */
9890 value <<= 63 - s;
9891 value >>= r - (s + 1);
9892 }
9893
9894 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9895 rd = INSTR (4, 0);
9896 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9897 }
9898
9899 /* The signed versions need to insert sign bits
9900 on the left of the inserted bit field. so we do
9901 much the same as the unsigned version except we
9902 use an arithmetic shift right -- this just means
9903 we need to operate on signed values. */
9904
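/* For example (editorial), ASR W0, W1, #16 is an alias of
   SBFM W0, W1, #16, #31, i.e. sbfm32 below with r = 16 and s = 31;
   applied to W1 = 0x80000000 the arithmetic shifts give
     (0x80000000 << 0) >> 16  ==>  0xffff8000.  */
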
9905 /* 32 bit bitfield move, left of affected sign-extended, right zeroed. */
9906 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9907 static void
9908 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9909 {
9910 unsigned rd;
9911 unsigned rn = INSTR (9, 5);
9912 /* as per ubfm32 but use an ASR instead of an LSR. */
9913 int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
9914
9915 if (r <= s)
9916 {
9917 value <<= 31 - s;
9918 value >>= 31 + r - s;
9919 }
9920 else
9921 {
9922 value <<= 31 - s;
9923 value >>= r - (s + 1);
9924 }
9925
9926 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9927 rd = INSTR (4, 0);
9928 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
9929 }
9930
9931 /* 64 bit bitfield move, left of affected sign-extended, right zeroed. */
9932 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9933 static void
9934 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9935 {
9936 unsigned rd;
9937 unsigned rn = INSTR (9, 5);
9938	  /* as per ubfm but use an ASR instead of an LSR. */
9939 int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
9940
9941 if (r <= s)
9942 {
9943 value <<= 63 - s;
9944 value >>= 63 + r - s;
9945 }
9946 else
9947 {
9948 value <<= 63 - s;
9949 value >>= r - (s + 1);
9950 }
9951
9952 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9953 rd = INSTR (4, 0);
9954 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
9955 }
9956
9957 /* Finally, these versions leave non-affected bits
9958	   as is.  So we need to generate the bits as per
9959 ubfm and also generate a mask to pick the
9960 bits from the original and computed values. */
9961
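/* Worked example (editorial): BFXIL W0, W1, #0, #8 is an alias of
   BFM W0, W1, #0, #7, i.e. bfm32 below with r = 0 and s = 7.  The
   shifts give mask = 0xff, so only the low byte of the destination
   is replaced:  W0 = (W0 & ~0xff) | (W1 & 0xff).  */
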
9962 /* 32 bit bitfield move, non-affected bits left as is.
9963 If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9964 static void
9965 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9966 {
9967 unsigned rn = INSTR (9, 5);
9968 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9969 uint32_t mask = -1;
9970 unsigned rd;
9971 uint32_t value2;
9972
9973 /* Pick either s+1-r or s+1 consecutive bits out of the original word. */
9974 if (r <= s)
9975 {
9976 /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9977 We want only bits s:xxx:r at the bottom of the word
9978 so we LSL bit s up to bit 31 i.e. by 31 - s
9979 and then we LSR to bring bit 31 down to bit s - r
9980 i.e. by 31 + r - s. */
9981 value <<= 31 - s;
9982 value >>= 31 + r - s;
9983 /* the mask must include the same bits. */
9984 mask <<= 31 - s;
9985 mask >>= 31 + r - s;
9986 }
9987 else
9988 {
9989 /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
9990	 We want only bits s:xxx:0 starting at bit 31-(r-1)
9991 so we LSL bit s up to bit 31 i.e. by 31 - s
9992 and then we LSL to bring bit 31 down to 31-(r-1)+s
9993 i.e. by r - (s + 1). */
9994 value <<= 31 - s;
9995 value >>= r - (s + 1);
9996 /* The mask must include the same bits. */
9997 mask <<= 31 - s;
9998 mask >>= r - (s + 1);
9999 }
10000
10001 rd = INSTR (4, 0);
10002 value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
10003
10004 value2 &= ~mask;
10005 value2 |= value;
10006
10007 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10008	  aarch64_set_reg_u64 (cpu, rd, NO_SP, value2);
10010 }
10011
10012 /* 64 bit bitfield move, non-affected bits left as is.
10013 If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
10014 static void
10015 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
10016 {
10017 unsigned rd;
10018 unsigned rn = INSTR (9, 5);
10019 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10020 uint64_t mask = 0xffffffffffffffffULL;
10021
10022 if (r <= s)
10023 {
10024 /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10025 We want only bits s:xxx:r at the bottom of the word
10026 so we LSL bit s up to bit 63 i.e. by 63 - s
10027 and then we LSR to bring bit 63 down to bit s - r
10028 i.e. by 63 + r - s. */
10029 value <<= 63 - s;
10030 value >>= 63 + r - s;
10031 /* The mask must include the same bits. */
10032 mask <<= 63 - s;
10033 mask >>= 63 + r - s;
10034 }
10035 else
10036 {
10037 /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10038	 We want only bits s:xxx:0 starting at bit 63-(r-1)
10039 so we LSL bit s up to bit 63 i.e. by 63 - s
10040 and then we LSL to bring bit 63 down to 63-(r-1)+s
10041 i.e. by r - (s + 1). */
10042 value <<= 63 - s;
10043 value >>= r - (s + 1);
10044 /* The mask must include the same bits. */
10045 mask <<= 63 - s;
10046 mask >>= r - (s + 1);
10047 }
10048
10049 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10050 rd = INSTR (4, 0);
10051 aarch64_set_reg_u64
10052 (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10053 }
10054
10055 static void
10056 dexBitfieldImmediate (sim_cpu *cpu)
10057 {
10058 /* assert instr[28:23] = 100110
10059 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10060 instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10061 instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10062 instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10063 instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
10064 instr[9,5] = Rn
10065 instr[4,0] = Rd */
10066
10067 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
10068 uint32_t dispatch;
10069 uint32_t imms;
10070 uint32_t size = INSTR (31, 31);
10071 uint32_t N = INSTR (22, 22);
10072 /* 32 bit operations must have immr[5] = 0 and imms[5] = 0. */
10073 /* or else we have an UNALLOC. */
10074 uint32_t immr = INSTR (21, 16);
10075
10076 if (~size & N)
10077 HALT_UNALLOC;
10078
10079 if (!size && uimm (immr, 5, 5))
10080 HALT_UNALLOC;
10081
10082 imms = INSTR (15, 10);
10083 if (!size && uimm (imms, 5, 5))
10084 HALT_UNALLOC;
10085
10086 /* Switch on combined size and op. */
10087 dispatch = INSTR (31, 29);
10088 switch (dispatch)
10089 {
10090 case 0: sbfm32 (cpu, immr, imms); return;
10091 case 1: bfm32 (cpu, immr, imms); return;
10092 case 2: ubfm32 (cpu, immr, imms); return;
10093 case 4: sbfm (cpu, immr, imms); return;
10094 case 5: bfm (cpu, immr, imms); return;
10095 case 6: ubfm (cpu, immr, imms); return;
10096 default: HALT_UNALLOC;
10097 }
10098 }
10099
10100 static void
10101 do_EXTR_32 (sim_cpu *cpu)
10102 {
10103 /* instr[31:21] = 00010011100
10104 instr[20,16] = Rm
10105 instr[15,10] = imms : 0xxxxx for 32 bit
10106 instr[9,5] = Rn
10107 instr[4,0] = Rd */
10108 unsigned rm = INSTR (20, 16);
10109 unsigned imms = INSTR (15, 10) & 31;
10110 unsigned rn = INSTR ( 9, 5);
10111 unsigned rd = INSTR ( 4, 0);
10112 uint64_t val1;
10113 uint64_t val2;
10114
10115 val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10116 val1 >>= imms;
10117 val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10118 val2 <<= (32 - imms);
10119
10120 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10121	  aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) (val1 | val2));
10122 }
10123
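/* Worked example for do_EXTR_32 above (editorial values): with
   Wn = 0xAABBCCDD, Wm = 0x11223344 and imms = 8, EXTR extracts
   bits [39,8] of the 64 bit concatenation Wn:Wm:
     val1 = 0x11223344 >> 8   =   0x00112233
     val2 = 0xAABBCCDD << 24  ==> low 32 bits 0xDD000000
     result                   =   0xDD112233.  */
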
10124 static void
10125 do_EXTR_64 (sim_cpu *cpu)
10126 {
10127 /* instr[31:21] = 10010011100
10128 instr[20,16] = Rm
10129 instr[15,10] = imms
10130 instr[9,5] = Rn
10131 instr[4,0] = Rd */
10132 unsigned rm = INSTR (20, 16);
10133 unsigned imms = INSTR (15, 10) & 63;
10134 unsigned rn = INSTR ( 9, 5);
10135 unsigned rd = INSTR ( 4, 0);
10136 uint64_t val;
10137
10138 val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10139 val >>= imms;
10140	  /* N.B. a shift by 64 would be undefined behaviour in C;
	     EXTR with imms == 0 simply yields Rm.  */
	  if (imms)
	    val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));
10141
10142 aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10143 }
10144
10145 static void
10146 dexExtractImmediate (sim_cpu *cpu)
10147 {
10148 /* assert instr[28:23] = 100111
10149 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10150 instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10151 instr[22] = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10152 instr[21] = op0 : must be 0 or UNALLOC
10153 instr[20,16] = Rm
10154 instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
10155 instr[9,5] = Rn
10156 instr[4,0] = Rd */
10157
10158 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
10159 /* 64 bit operations must have N = 1 or else we have an UNALLOC. */
10160 uint32_t dispatch;
10161 uint32_t size = INSTR (31, 31);
10162 uint32_t N = INSTR (22, 22);
10163 /* 32 bit operations must have imms[5] = 0
10164 or else we have an UNALLOC. */
10165 uint32_t imms = INSTR (15, 10);
10166
10167 if (size ^ N)
10168 HALT_UNALLOC;
10169
10170 if (!size && uimm (imms, 5, 5))
10171 HALT_UNALLOC;
10172
10173 /* Switch on combined size and op. */
10174 dispatch = INSTR (31, 29);
10175
10176 if (dispatch == 0)
10177 do_EXTR_32 (cpu);
10178
10179 else if (dispatch == 4)
10180 do_EXTR_64 (cpu);
10181
10182 else if (dispatch == 1)
10183 HALT_NYI;
10184 else
10185 HALT_UNALLOC;
10186 }
10187
10188 static void
10189 dexDPImm (sim_cpu *cpu)
10190 {
10191 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10192	   assert group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
10193 bits [25,23] of a DPImm are the secondary dispatch vector. */
10194 uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10195
10196 switch (group2)
10197 {
10198 case DPIMM_PCADR_000:
10199 case DPIMM_PCADR_001:
10200 dexPCRelAddressing (cpu);
10201 return;
10202
10203 case DPIMM_ADDSUB_010:
10204 case DPIMM_ADDSUB_011:
10205 dexAddSubtractImmediate (cpu);
10206 return;
10207
10208 case DPIMM_LOG_100:
10209 dexLogicalImmediate (cpu);
10210 return;
10211
10212 case DPIMM_MOV_101:
10213 dexMoveWideImmediate (cpu);
10214 return;
10215
10216 case DPIMM_BITF_110:
10217 dexBitfieldImmediate (cpu);
10218 return;
10219
10220 case DPIMM_EXTR_111:
10221 dexExtractImmediate (cpu);
10222 return;
10223
10224 default:
10225 /* Should never reach here. */
10226 HALT_NYI;
10227 }
10228 }
10229
10230 static void
10231 dexLoadUnscaledImmediate (sim_cpu *cpu)
10232 {
10233 /* instr[29,24] == 111_00
10234 instr[21] == 0
10235 instr[11,10] == 00
10236 instr[31,30] = size
10237 instr[26] = V
10238 instr[23,22] = opc
10239 instr[20,12] = simm9
10240 instr[9,5] = rn may be SP. */
10241 /* unsigned rt = INSTR (4, 0); */
10242 uint32_t V = INSTR (26, 26);
10243 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10244 int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10245
10246 if (!V)
10247 {
10248 /* GReg operations. */
10249 switch (dispatch)
10250 {
10251 case 0: sturb (cpu, imm); return;
10252 case 1: ldurb32 (cpu, imm); return;
10253 case 2: ldursb64 (cpu, imm); return;
10254 case 3: ldursb32 (cpu, imm); return;
10255 case 4: sturh (cpu, imm); return;
10256 case 5: ldurh32 (cpu, imm); return;
10257 case 6: ldursh64 (cpu, imm); return;
10258 case 7: ldursh32 (cpu, imm); return;
10259 case 8: stur32 (cpu, imm); return;
10260 case 9: ldur32 (cpu, imm); return;
10261 case 10: ldursw (cpu, imm); return;
10262 case 12: stur64 (cpu, imm); return;
10263 case 13: ldur64 (cpu, imm); return;
10264
10265 case 14:
10266 /* PRFUM NYI. */
10267 HALT_NYI;
10268
10269 default:
10270 case 11:
10271 case 15:
10272 HALT_UNALLOC;
10273 }
10274 }
10275
10276 /* FReg operations. */
10277 switch (dispatch)
10278 {
10279 case 2: fsturq (cpu, imm); return;
10280 case 3: fldurq (cpu, imm); return;
10281 case 8: fsturs (cpu, imm); return;
10282 case 9: fldurs (cpu, imm); return;
10283 case 12: fsturd (cpu, imm); return;
10284 case 13: fldurd (cpu, imm); return;
10285
10286 case 0: /* STUR 8 bit FP. */
10287 case 1: /* LDUR 8 bit FP. */
10288 case 4: /* STUR 16 bit FP. */
10289	    case 5: /* LDUR 16 bit FP. */
10290 HALT_NYI;
10291
10292 default:
10293 case 6:
10294 case 7:
10295 case 10:
10296 case 11:
10297 case 14:
10298 case 15:
10299 HALT_UNALLOC;
10300 }
10301 }
10302
10303 /* N.B. A preliminary note regarding all the ldrs<x>32
10304 instructions
10305
10306	   The signed value loaded by these instructions is cast to unsigned
10307	   before being passed to aarch64_set_reg_u64 (cpu, N), i.e. to the
10308	   64 bit element of the GReg union.  This performs a 32 bit sign
10309	   extension (as required) but avoids 64 bit sign extension, thus
10310	   ensuring that the top half of the register word is zero.  This is
10311	   what the spec demands when a 32 bit load occurs. */
10312
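/* A sketch of the casting idiom described above (editorial, not part
   of the simulator): loading the byte 0x80 with an ldrsb32 variant
   should leave 0x00000000ffffff80 in the 64 bit register:
     int8_t   b = -128;           (the byte 0x80)
     uint32_t w = (uint32_t) b;   (0xffffff80 : 32 bit sign extension)
     uint64_t x = w;              (top half zero).  */
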
10313 /* 32 bit load sign-extended byte scaled unsigned 12 bit. */
10314 static void
10315 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10316 {
10317 unsigned int rn = INSTR (9, 5);
10318 unsigned int rt = INSTR (4, 0);
10319
10320	  /* The target register may not be SP but the source may be.
10321	     There is no scaling required for a byte load.  */
10322 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10323 aarch64_set_reg_u64 (cpu, rt, NO_SP,
10324		       (uint32_t) aarch64_get_mem_s8 (cpu, address));
10325 }
10326
10327 /* 32 bit load sign-extended byte scaled or unscaled zero-
10328 or sign-extended 32-bit register offset. */
10329 static void
10330 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10331 {
10332 unsigned int rm = INSTR (20, 16);
10333 unsigned int rn = INSTR (9, 5);
10334 unsigned int rt = INSTR (4, 0);
10335
10336 /* rn may reference SP, rm and rt must reference ZR. */
10337
10338 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10339 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10340 extension);
10341
10342 /* There is no scaling required for a byte load. */
10343 aarch64_set_reg_u64
10344	    (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8 (cpu, address
10345 + displacement));
10346 }
10347
10348 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10349 pre- or post-writeback. */
10350 static void
10351 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10352 {
10353 uint64_t address;
10354 unsigned int rn = INSTR (9, 5);
10355 unsigned int rt = INSTR (4, 0);
10356
10357 if (rn == rt && wb != NoWriteBack)
10358 HALT_UNALLOC;
10359
10360 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10361
10362 if (wb == Pre)
10363 address += offset;
10364
10365 aarch64_set_reg_u64 (cpu, rt, NO_SP,
10366		       (uint32_t) aarch64_get_mem_s8 (cpu, address));
10367
10368 if (wb == Post)
10369 address += offset;
10370
10371 if (wb != NoWriteBack)
10372	    aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10373 }
10374
10375 /* 8 bit store scaled. */
10376 static void
10377 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10378 {
10379 unsigned st = INSTR (4, 0);
10380 unsigned rn = INSTR (9, 5);
10381
10382 aarch64_set_mem_u8 (cpu,
10383 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10384 aarch64_get_vec_u8 (cpu, st, 0));
10385 }
10386
10387 /* 8 bit store scaled or unscaled zero- or
10388 sign-extended 8-bit register offset. */
10389 static void
10390 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10391 {
10392 unsigned rm = INSTR (20, 16);
10393 unsigned rn = INSTR (9, 5);
10394 unsigned st = INSTR (4, 0);
10395
10396 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10397 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10398 extension);
10399	  uint64_t displacement = extended;	/* No scaling for a byte store.  */
10400
10401 aarch64_set_mem_u8
10402 (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10403 }
10404
10405 /* 16 bit store scaled. */
10406 static void
10407 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10408 {
10409 unsigned st = INSTR (4, 0);
10410 unsigned rn = INSTR (9, 5);
10411
10412 aarch64_set_mem_u16
10413 (cpu,
10414 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10415 aarch64_get_vec_u16 (cpu, st, 0));
10416 }
10417
10418 /* 16 bit store scaled or unscaled zero-
10419    or sign-extended 32-bit register offset.  */
10420 static void
10421 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10422 {
10423 unsigned rm = INSTR (20, 16);
10424 unsigned rn = INSTR (9, 5);
10425 unsigned st = INSTR (4, 0);
10426
10427 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10428 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10429 extension);
10430 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
10431
10432 aarch64_set_mem_u16
10433 (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10434 }
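
/* Illustrative sketch (not built): the register-offset address arithmetic
   used by the *_scale_ext helpers.  The 32-bit index register is zero- or
   sign-extended (the extend() helper) and, for the Scaled variant, shifted
   left by log2 of the access size in bytes, which is what the OPT_SCALE
   macro used above is expected to compute.  The names below are invented
   and only the W-register extensions are shown.  */
#if 0
#include <stdint.h>

typedef enum { EX_UXTW, EX_SXTW } ex_extension;
typedef enum { EX_Unscaled, EX_Scaled } ex_scaling;

static int64_t
ex_extend (uint32_t value, ex_extension ext)
{
  return ext == EX_SXTW ? (int64_t) (int32_t) value : (int64_t) value;
}

static uint64_t
ex_reg_offset_address (uint64_t base, uint32_t rm_value, ex_extension ext,
		       ex_scaling scaling, unsigned access_bits)
{
  int64_t extended = ex_extend (rm_value, ext);
  unsigned size, shift = 0;

  /* 8-bit accesses shift by 0, 16-bit by 1, ... 128-bit by 4.  */
  for (size = access_bits; size > 8; size >>= 1)
    shift++;

  return base + (scaling == EX_Scaled ? (extended << shift) : extended);
}
#endif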
10435
10436 /* 32 bit store scaled unsigned 12 bit. */
10437 static void
10438 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10439 {
10440 unsigned st = INSTR (4, 0);
10441 unsigned rn = INSTR (9, 5);
10442
10443 aarch64_set_mem_u32
10444 (cpu,
10445 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10446 aarch64_get_vec_u32 (cpu, st, 0));
10447 }
10448
10449 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
10450 static void
10451 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10452 {
10453 unsigned rn = INSTR (9, 5);
10454 unsigned st = INSTR (4, 0);
10455
10456 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10457
10458 if (wb != Post)
10459 address += offset;
10460
10461 aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10462
10463 if (wb == Post)
10464 address += offset;
10465
10466 if (wb != NoWriteBack)
10467 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10468 }
10469
10470 /* 32 bit store scaled or unscaled zero-
10471 or sign-extended 32-bit register offset. */
10472 static void
10473 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10474 {
10475 unsigned rm = INSTR (20, 16);
10476 unsigned rn = INSTR (9, 5);
10477 unsigned st = INSTR (4, 0);
10478
10479 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10480 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10481 extension);
10482 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
10483
10484 aarch64_set_mem_u32
10485 (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10486 }
10487
10488 /* 64 bit store scaled unsigned 12 bit. */
10489 static void
10490 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10491 {
10492 unsigned st = INSTR (4, 0);
10493 unsigned rn = INSTR (9, 5);
10494
10495 aarch64_set_mem_u64
10496 (cpu,
10497 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10498 aarch64_get_vec_u64 (cpu, st, 0));
10499 }
10500
10501 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
10502 static void
10503 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10504 {
10505 unsigned rn = INSTR (9, 5);
10506 unsigned st = INSTR (4, 0);
10507
10508 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10509
10510 if (wb != Post)
10511 address += offset;
10512
10513 aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10514
10515 if (wb == Post)
10516 address += offset;
10517
10518 if (wb != NoWriteBack)
10519 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10520 }
10521
10522 /* 64 bit store scaled or unscaled zero-
10523 or sign-extended 32-bit register offset. */
10524 static void
10525 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10526 {
10527 unsigned rm = INSTR (20, 16);
10528 unsigned rn = INSTR (9, 5);
10529 unsigned st = INSTR (4, 0);
10530
10531 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10532 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10533 extension);
10534 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
10535
10536 aarch64_set_mem_u64
10537 (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10538 }
10539
10540 /* 128 bit store scaled unsigned 12 bit. */
10541 static void
10542 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10543 {
10544 FRegister a;
10545 unsigned st = INSTR (4, 0);
10546 unsigned rn = INSTR (9, 5);
10547 uint64_t addr;
10548
10549 aarch64_get_FP_long_double (cpu, st, & a);
10550
10551 addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10552 aarch64_set_mem_long_double (cpu, addr, a);
10553 }
10554
10555 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback. */
10556 static void
10557 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10558 {
10559 FRegister a;
10560 unsigned rn = INSTR (9, 5);
10561 unsigned st = INSTR (4, 0);
10562 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10563
10564 if (wb != Post)
10565 address += offset;
10566
10567 aarch64_get_FP_long_double (cpu, st, & a);
10568 aarch64_set_mem_long_double (cpu, address, a);
10569
10570 if (wb == Post)
10571 address += offset;
10572
10573 if (wb != NoWriteBack)
10574 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10575 }
10576
10577 /* 128 bit store scaled or unscaled zero-
10578 or sign-extended 32-bit register offset. */
10579 static void
10580 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10581 {
10582 unsigned rm = INSTR (20, 16);
10583 unsigned rn = INSTR (9, 5);
10584 unsigned st = INSTR (4, 0);
10585
10586 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10587 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10588 extension);
10589 uint64_t displacement = OPT_SCALE (extended, 128, scaling);
10590
10591 FRegister a;
10592
10593 aarch64_get_FP_long_double (cpu, st, & a);
10594 aarch64_set_mem_long_double (cpu, address + displacement, a);
10595 }
10596
10597 static void
10598 dexLoadImmediatePrePost (sim_cpu *cpu)
10599 {
10600 /* instr[31,30] = size
10601 instr[29,27] = 111
10602 instr[26] = V
10603 instr[25,24] = 00
10604 instr[23,22] = opc
10605 instr[21] = 0
10606 instr[20,12] = simm9
10607 instr[11] = wb : 0 ==> Post, 1 ==> Pre
10608      instr[10] = 1
10609 instr[9,5] = Rn may be SP.
10610 instr[4,0] = Rt */
10611
10612 uint32_t V = INSTR (26, 26);
10613 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10614 int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10615 WriteBack wb = INSTR (11, 11);
10616
10617 if (!V)
10618 {
10619 /* GReg operations. */
10620 switch (dispatch)
10621 {
10622 case 0: strb_wb (cpu, imm, wb); return;
10623 case 1: ldrb32_wb (cpu, imm, wb); return;
10624 case 2: ldrsb_wb (cpu, imm, wb); return;
10625 case 3: ldrsb32_wb (cpu, imm, wb); return;
10626 case 4: strh_wb (cpu, imm, wb); return;
10627 case 5: ldrh32_wb (cpu, imm, wb); return;
10628 case 6: ldrsh64_wb (cpu, imm, wb); return;
10629 case 7: ldrsh32_wb (cpu, imm, wb); return;
10630 case 8: str32_wb (cpu, imm, wb); return;
10631 case 9: ldr32_wb (cpu, imm, wb); return;
10632 case 10: ldrsw_wb (cpu, imm, wb); return;
10633 case 12: str_wb (cpu, imm, wb); return;
10634 case 13: ldr_wb (cpu, imm, wb); return;
10635
10636 default:
10637 case 11:
10638 case 14:
10639 case 15:
10640 HALT_UNALLOC;
10641 }
10642 }
10643
10644 /* FReg operations. */
10645 switch (dispatch)
10646 {
10647 case 2: fstrq_wb (cpu, imm, wb); return;
10648 case 3: fldrq_wb (cpu, imm, wb); return;
10649 case 8: fstrs_wb (cpu, imm, wb); return;
10650 case 9: fldrs_wb (cpu, imm, wb); return;
10651 case 12: fstrd_wb (cpu, imm, wb); return;
10652 case 13: fldrd_wb (cpu, imm, wb); return;
10653
10654 case 0: /* STUR 8 bit FP. */
10655 case 1: /* LDUR 8 bit FP. */
10656 case 4: /* STUR 16 bit FP. */
10657     case 5: /* LDUR 16 bit FP.  */
10658 HALT_NYI;
10659
10660 default:
10661 case 6:
10662 case 7:
10663 case 10:
10664 case 11:
10665 case 14:
10666 case 15:
10667 HALT_UNALLOC;
10668 }
10669 }
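
/* Illustrative sketch (not built): extracting the size:opc dispatch value
   used above from a raw instruction word.  For example, 0xB8404401 should
   encode "ldr w1, [x0], #4" (post-index), giving size = 2, opc = 1 and
   dispatch 9, i.e. ldr32_wb.  The helper names are invented.  */
#if 0
#include <stdint.h>

static uint32_t
ex_bits (uint32_t insn, unsigned hi, unsigned lo)
{
  return (insn >> lo) & ((1u << (hi - lo + 1)) - 1);
}

static uint32_t
ex_prepost_dispatch (uint32_t insn)
{
  /* size is instr[31,30], opc is instr[23,22]; dispatch = size:opc.  */
  return (ex_bits (insn, 31, 30) << 2) | ex_bits (insn, 23, 22);
}
#endif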
10670
10671 static void
10672 dexLoadRegisterOffset (sim_cpu *cpu)
10673 {
10674 /* instr[31,30] = size
10675 instr[29,27] = 111
10676 instr[26] = V
10677 instr[25,24] = 00
10678 instr[23,22] = opc
10679 instr[21] = 1
10680 instr[20,16] = rm
10681 instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10682 110 ==> SXTW, 111 ==> SXTX,
10683 ow ==> RESERVED
10684 instr[12] = scaled
10685 instr[11,10] = 10
10686 instr[9,5] = rn
10687 instr[4,0] = rt. */
10688
10689 uint32_t V = INSTR (26, 26);
10690 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10691 Scaling scale = INSTR (12, 12);
10692 Extension extensionType = INSTR (15, 13);
10693
10694 /* Check for illegal extension types. */
10695 if (uimm (extensionType, 1, 1) == 0)
10696 HALT_UNALLOC;
10697
10698 if (extensionType == UXTX || extensionType == SXTX)
10699 extensionType = NoExtension;
10700
10701 if (!V)
10702 {
10703 /* GReg operations. */
10704 switch (dispatch)
10705 {
10706 case 0: strb_scale_ext (cpu, scale, extensionType); return;
10707 case 1: ldrb32_scale_ext (cpu, scale, extensionType); return;
10708 case 2: ldrsb_scale_ext (cpu, scale, extensionType); return;
10709 case 3: ldrsb32_scale_ext (cpu, scale, extensionType); return;
10710 case 4: strh_scale_ext (cpu, scale, extensionType); return;
10711 case 5: ldrh32_scale_ext (cpu, scale, extensionType); return;
10712 case 6: ldrsh_scale_ext (cpu, scale, extensionType); return;
10713 case 7: ldrsh32_scale_ext (cpu, scale, extensionType); return;
10714 case 8: str32_scale_ext (cpu, scale, extensionType); return;
10715 case 9: ldr32_scale_ext (cpu, scale, extensionType); return;
10716 case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10717 case 12: str_scale_ext (cpu, scale, extensionType); return;
10718 case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10719 case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10720
10721 default:
10722 case 11:
10723 case 15:
10724 HALT_UNALLOC;
10725 }
10726 }
10727
10728 /* FReg operations. */
10729 switch (dispatch)
10730 {
10731 case 1: /* LDUR 8 bit FP. */
10732 HALT_NYI;
10733 case 3: fldrq_scale_ext (cpu, scale, extensionType); return;
10734     case 5: /* LDUR 16 bit FP.  */
10735 HALT_NYI;
10736 case 9: fldrs_scale_ext (cpu, scale, extensionType); return;
10737 case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
10738
10739 case 0: fstrb_scale_ext (cpu, scale, extensionType); return;
10740 case 2: fstrq_scale_ext (cpu, scale, extensionType); return;
10741 case 4: fstrh_scale_ext (cpu, scale, extensionType); return;
10742 case 8: fstrs_scale_ext (cpu, scale, extensionType); return;
10743 case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
10744
10745 default:
10746 case 6:
10747 case 7:
10748 case 10:
10749 case 11:
10750 case 14:
10751 case 15:
10752 HALT_UNALLOC;
10753 }
10754 }
10755
10756 static void
10757 dexLoadUnsignedImmediate (sim_cpu *cpu)
10758 {
10759 /* instr[29,24] == 111_01
10760 instr[31,30] = size
10761 instr[26] = V
10762 instr[23,22] = opc
10763 instr[21,10] = uimm12 : unsigned immediate offset
10764 instr[9,5] = rn may be SP.
10765 instr[4,0] = rt. */
10766
10767 uint32_t V = INSTR (26,26);
10768 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10769 uint32_t imm = INSTR (21, 10);
10770
10771 if (!V)
10772 {
10773 /* GReg operations. */
10774 switch (dispatch)
10775 {
10776 case 0: strb_abs (cpu, imm); return;
10777 case 1: ldrb32_abs (cpu, imm); return;
10778 case 2: ldrsb_abs (cpu, imm); return;
10779 case 3: ldrsb32_abs (cpu, imm); return;
10780 case 4: strh_abs (cpu, imm); return;
10781 case 5: ldrh32_abs (cpu, imm); return;
10782 case 6: ldrsh_abs (cpu, imm); return;
10783 case 7: ldrsh32_abs (cpu, imm); return;
10784 case 8: str32_abs (cpu, imm); return;
10785 case 9: ldr32_abs (cpu, imm); return;
10786 case 10: ldrsw_abs (cpu, imm); return;
10787 case 12: str_abs (cpu, imm); return;
10788 case 13: ldr_abs (cpu, imm); return;
10789 case 14: prfm_abs (cpu, imm); return;
10790
10791 default:
10792 case 11:
10793 case 15:
10794 HALT_UNALLOC;
10795 }
10796 }
10797
10798 /* FReg operations. */
10799 switch (dispatch)
10800 {
10801 case 0: fstrb_abs (cpu, imm); return;
10802 case 4: fstrh_abs (cpu, imm); return;
10803 case 8: fstrs_abs (cpu, imm); return;
10804 case 12: fstrd_abs (cpu, imm); return;
10805 case 2: fstrq_abs (cpu, imm); return;
10806
10807 case 1: fldrb_abs (cpu, imm); return;
10808 case 5: fldrh_abs (cpu, imm); return;
10809 case 9: fldrs_abs (cpu, imm); return;
10810 case 13: fldrd_abs (cpu, imm); return;
10811 case 3: fldrq_abs (cpu, imm); return;
10812
10813 default:
10814 case 6:
10815 case 7:
10816 case 10:
10817 case 11:
10818 case 14:
10819 case 15:
10820 HALT_UNALLOC;
10821 }
10822 }
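
/* Illustrative sketch (not built): for the unsigned-immediate forms the
   12-bit field is an element count, not a byte count, so the *_abs helpers
   scale it by the access size (the SCALE macro used above).  E.g. with a
   64-bit access, imm12 = 3 addresses base + 24.  The helper name is
   invented.  */
#if 0
#include <stdint.h>

static uint64_t
ex_abs_address (uint64_t base, uint32_t imm12, unsigned access_bits)
{
  unsigned size, shift = 0;

  for (size = access_bits; size > 8; size >>= 1)
    shift++;			/* 8->0, 16->1, 32->2, 64->3, 128->4.  */

  return base + ((uint64_t) imm12 << shift);
}
#endif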
10823
10824 static void
10825 dexLoadExclusive (sim_cpu *cpu)
10826 {
10827 /* assert instr[29:24] = 001000;
10828 instr[31,30] = size
10829 instr[23] = 0 if exclusive
10830 instr[22] = L : 1 if load, 0 if store
10831 instr[21] = 1 if pair
10832 instr[20,16] = Rs
10833 instr[15] = o0 : 1 if ordered
10834 instr[14,10] = Rt2
10835 instr[9,5] = Rn
10836      instr[4,0] = Rt.  */
10837
10838 switch (INSTR (22, 21))
10839 {
10840 case 2: ldxr (cpu); return;
10841 case 0: stxr (cpu); return;
10842 default: HALT_NYI;
10843 }
10844 }
10845
10846 static void
10847 dexLoadOther (sim_cpu *cpu)
10848 {
10849 uint32_t dispatch;
10850
10851 /* instr[29,25] = 111_0
10852 instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
10853      instr[21] and instr[11,10] form the secondary dispatch.  */
10854 if (INSTR (24, 24))
10855 {
10856 dexLoadUnsignedImmediate (cpu);
10857 return;
10858 }
10859
10860 dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
10861 switch (dispatch)
10862 {
10863 case 0: dexLoadUnscaledImmediate (cpu); return;
10864 case 1: dexLoadImmediatePrePost (cpu); return;
10865 case 3: dexLoadImmediatePrePost (cpu); return;
10866 case 6: dexLoadRegisterOffset (cpu); return;
10867
10868 default:
10869 case 2:
10870 case 4:
10871 case 5:
10872 case 7:
10873 HALT_NYI;
10874 }
10875 }
10876
10877 static void
10878 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10879 {
10880 unsigned rn = INSTR (14, 10);
10881 unsigned rd = INSTR (9, 5);
10882 unsigned rm = INSTR (4, 0);
10883 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10884
10885 if ((rn == rd || rm == rd) && wb != NoWriteBack)
10886 HALT_UNALLOC; /* ??? */
10887
10888 offset <<= 2;
10889
10890 if (wb != Post)
10891 address += offset;
10892
10893 aarch64_set_mem_u32 (cpu, address,
10894 aarch64_get_reg_u32 (cpu, rm, NO_SP));
10895 aarch64_set_mem_u32 (cpu, address + 4,
10896 aarch64_get_reg_u32 (cpu, rn, NO_SP));
10897
10898 if (wb == Post)
10899 address += offset;
10900
10901 if (wb != NoWriteBack)
10902 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10903 }
10904
10905 static void
10906 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10907 {
10908 unsigned rn = INSTR (14, 10);
10909 unsigned rd = INSTR (9, 5);
10910 unsigned rm = INSTR (4, 0);
10911 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10912
10913 if ((rn == rd || rm == rd) && wb != NoWriteBack)
10914 HALT_UNALLOC; /* ??? */
10915
10916 offset <<= 3;
10917
10918 if (wb != Post)
10919 address += offset;
10920
10921 aarch64_set_mem_u64 (cpu, address,
10922 aarch64_get_reg_u64 (cpu, rm, NO_SP));
10923 aarch64_set_mem_u64 (cpu, address + 8,
10924 aarch64_get_reg_u64 (cpu, rn, NO_SP));
10925
10926 if (wb == Post)
10927 address += offset;
10928
10929 if (wb != NoWriteBack)
10930 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10931 }
10932
10933 static void
10934 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10935 {
10936 unsigned rn = INSTR (14, 10);
10937 unsigned rd = INSTR (9, 5);
10938 unsigned rm = INSTR (4, 0);
10939 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10940
10941 /* Treat this as unalloc to make sure we don't do it. */
10942 if (rn == rm)
10943 HALT_UNALLOC;
10944
10945 offset <<= 2;
10946
10947 if (wb != Post)
10948 address += offset;
10949
10950 aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
10951 aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
10952
10953 if (wb == Post)
10954 address += offset;
10955
10956 if (wb != NoWriteBack)
10957 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10958 }
10959
10960 static void
10961 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10962 {
10963 unsigned rn = INSTR (14, 10);
10964 unsigned rd = INSTR (9, 5);
10965 unsigned rm = INSTR (4, 0);
10966 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10967
10968 /* Treat this as unalloc to make sure we don't do it. */
10969 if (rn == rm)
10970 HALT_UNALLOC;
10971
10972 offset <<= 2;
10973
10974 if (wb != Post)
10975 address += offset;
10976
10977 aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
10978 aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
10979
10980 if (wb == Post)
10981 address += offset;
10982
10983 if (wb != NoWriteBack)
10984 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10985 }
10986
10987 static void
10988 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10989 {
10990 unsigned rn = INSTR (14, 10);
10991 unsigned rd = INSTR (9, 5);
10992 unsigned rm = INSTR (4, 0);
10993 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10994
10995 /* Treat this as unalloc to make sure we don't do it. */
10996 if (rn == rm)
10997 HALT_UNALLOC;
10998
10999 offset <<= 3;
11000
11001 if (wb != Post)
11002 address += offset;
11003
11004 aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
11005 aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
11006
11007 if (wb == Post)
11008 address += offset;
11009
11010 if (wb != NoWriteBack)
11011 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11012 }
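
/* Illustrative sketch (not built): LDP/STP carry a signed 7-bit *element*
   offset in instr[21,15], which the pair helpers above turn into a byte
   offset with "offset <<= 2" (word pairs) or "offset <<= 3" (doubleword
   pairs).  So "stp x0, x1, [sp, #-16]!" encodes imm7 = -2, not -16.  The
   helper name is invented.  */
#if 0
#include <stdint.h>

static int32_t
ex_pair_byte_offset (uint32_t insn, unsigned elt_shift)
{
  int32_t imm7 = (insn >> 15) & 0x7F;	/* instr[21,15].  */

  if (imm7 & 0x40)			/* Sign-extend 7 -> 32 bits.  */
    imm7 -= 0x80;

  return imm7 << elt_shift;		/* 2 for 32-bit, 3 for 64-bit pairs.  */
}
#endif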
11013
11014 static void
11015 dex_load_store_pair_gr (sim_cpu *cpu)
11016 {
11017 /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
11018 instr[29,25] = instruction encoding: 101_0
11019      instr[26]    = V : 1 if fp, 0 if gp
11020 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11021 instr[22] = load/store (1=> load)
11022 instr[21,15] = signed, scaled, offset
11023 instr[14,10] = Rn
11024 instr[ 9, 5] = Rd
11025 instr[ 4, 0] = Rm. */
11026
11027 uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11028 int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11029
11030 switch (dispatch)
11031 {
11032 case 2: store_pair_u32 (cpu, offset, Post); return;
11033 case 3: load_pair_u32 (cpu, offset, Post); return;
11034 case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11035 case 5: load_pair_u32 (cpu, offset, NoWriteBack); return;
11036 case 6: store_pair_u32 (cpu, offset, Pre); return;
11037 case 7: load_pair_u32 (cpu, offset, Pre); return;
11038
11039 case 11: load_pair_s32 (cpu, offset, Post); return;
11040 case 13: load_pair_s32 (cpu, offset, NoWriteBack); return;
11041 case 15: load_pair_s32 (cpu, offset, Pre); return;
11042
11043 case 18: store_pair_u64 (cpu, offset, Post); return;
11044 case 19: load_pair_u64 (cpu, offset, Post); return;
11045 case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11046 case 21: load_pair_u64 (cpu, offset, NoWriteBack); return;
11047 case 22: store_pair_u64 (cpu, offset, Pre); return;
11048 case 23: load_pair_u64 (cpu, offset, Pre); return;
11049
11050 default:
11051 HALT_UNALLOC;
11052 }
11053 }
11054
11055 static void
11056 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11057 {
11058 unsigned rn = INSTR (14, 10);
11059 unsigned rd = INSTR (9, 5);
11060 unsigned rm = INSTR (4, 0);
11061 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11062
11063 offset <<= 2;
11064
11065 if (wb != Post)
11066 address += offset;
11067
11068 aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, rm, 0));
11069 aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11070
11071 if (wb == Post)
11072 address += offset;
11073
11074 if (wb != NoWriteBack)
11075 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11076 }
11077
11078 static void
11079 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11080 {
11081 unsigned rn = INSTR (14, 10);
11082 unsigned rd = INSTR (9, 5);
11083 unsigned rm = INSTR (4, 0);
11084 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11085
11086 offset <<= 3;
11087
11088 if (wb != Post)
11089 address += offset;
11090
11091 aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, rm, 0));
11092 aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11093
11094 if (wb == Post)
11095 address += offset;
11096
11097 if (wb != NoWriteBack)
11098 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11099 }
11100
11101 static void
11102 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11103 {
11104 FRegister a;
11105 unsigned rn = INSTR (14, 10);
11106 unsigned rd = INSTR (9, 5);
11107 unsigned rm = INSTR (4, 0);
11108 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11109
11110 offset <<= 4;
11111
11112 if (wb != Post)
11113 address += offset;
11114
11115 aarch64_get_FP_long_double (cpu, rm, & a);
11116 aarch64_set_mem_long_double (cpu, address, a);
11117 aarch64_get_FP_long_double (cpu, rn, & a);
11118 aarch64_set_mem_long_double (cpu, address + 16, a);
11119
11120 if (wb == Post)
11121 address += offset;
11122
11123 if (wb != NoWriteBack)
11124 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11125 }
11126
11127 static void
11128 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11129 {
11130 unsigned rn = INSTR (14, 10);
11131 unsigned rd = INSTR (9, 5);
11132 unsigned rm = INSTR (4, 0);
11133 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11134
11135 if (rm == rn)
11136 HALT_UNALLOC;
11137
11138 offset <<= 2;
11139
11140 if (wb != Post)
11141 address += offset;
11142
11143 aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11144 aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11145
11146 if (wb == Post)
11147 address += offset;
11148
11149 if (wb != NoWriteBack)
11150 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11151 }
11152
11153 static void
11154 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11155 {
11156 unsigned rn = INSTR (14, 10);
11157 unsigned rd = INSTR (9, 5);
11158 unsigned rm = INSTR (4, 0);
11159 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11160
11161 if (rm == rn)
11162 HALT_UNALLOC;
11163
11164 offset <<= 3;
11165
11166 if (wb != Post)
11167 address += offset;
11168
11169 aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11170 aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11171
11172 if (wb == Post)
11173 address += offset;
11174
11175 if (wb != NoWriteBack)
11176 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11177 }
11178
11179 static void
11180 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11181 {
11182 FRegister a;
11183 unsigned rn = INSTR (14, 10);
11184 unsigned rd = INSTR (9, 5);
11185 unsigned rm = INSTR (4, 0);
11186 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11187
11188 if (rm == rn)
11189 HALT_UNALLOC;
11190
11191 offset <<= 4;
11192
11193 if (wb != Post)
11194 address += offset;
11195
11196 aarch64_get_mem_long_double (cpu, address, & a);
11197 aarch64_set_FP_long_double (cpu, rm, a);
11198 aarch64_get_mem_long_double (cpu, address + 16, & a);
11199 aarch64_set_FP_long_double (cpu, rn, a);
11200
11201 if (wb == Post)
11202 address += offset;
11203
11204 if (wb != NoWriteBack)
11205 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11206 }
11207
11208 static void
11209 dex_load_store_pair_fp (sim_cpu *cpu)
11210 {
11211 /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11212      instr[29,25] = instruction encoding: 101_0
11213 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11214 instr[22] = load/store (1=> load)
11215 instr[21,15] = signed, scaled, offset
11216 instr[14,10] = Rn
11217 instr[ 9, 5] = Rd
11218 instr[ 4, 0] = Rm */
11219
11220 uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11221 int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11222
11223 switch (dispatch)
11224 {
11225 case 2: store_pair_float (cpu, offset, Post); return;
11226 case 3: load_pair_float (cpu, offset, Post); return;
11227 case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11228 case 5: load_pair_float (cpu, offset, NoWriteBack); return;
11229 case 6: store_pair_float (cpu, offset, Pre); return;
11230 case 7: load_pair_float (cpu, offset, Pre); return;
11231
11232 case 10: store_pair_double (cpu, offset, Post); return;
11233 case 11: load_pair_double (cpu, offset, Post); return;
11234 case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11235 case 13: load_pair_double (cpu, offset, NoWriteBack); return;
11236 case 14: store_pair_double (cpu, offset, Pre); return;
11237 case 15: load_pair_double (cpu, offset, Pre); return;
11238
11239 case 18: store_pair_long_double (cpu, offset, Post); return;
11240 case 19: load_pair_long_double (cpu, offset, Post); return;
11241 case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11242 case 21: load_pair_long_double (cpu, offset, NoWriteBack); return;
11243 case 22: store_pair_long_double (cpu, offset, Pre); return;
11244 case 23: load_pair_long_double (cpu, offset, Pre); return;
11245
11246 default:
11247 HALT_UNALLOC;
11248 }
11249 }
11250
11251 static inline unsigned
11252 vec_reg (unsigned v, unsigned o)
11253 {
11254   return (v + o) & 0x1F;	/* Vector registers wrap modulo 32.  */
11255 }
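
/* Usage example: vec_reg implements the modulo-32 register wrap-around the
   multi-register forms require, so e.g. an LD4 starting at V30 targets
   V30, V31, V0, V1.  A quick self-check (sketch, not built):  */
#if 0
#include <assert.h>

static void
ex_vec_reg_check (void)
{
  assert (vec_reg (30, 0) == 30);
  assert (vec_reg (30, 3) == 1);	/* (30 + 3) & 0x1F == 1.  */
}
#endif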
11256
11257 /* Load multiple N-element structures to N consecutive registers. */
11258 static void
11259 vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
11260 {
11261 int all = INSTR (30, 30);
11262 unsigned size = INSTR (11, 10);
11263 unsigned vd = INSTR (4, 0);
11264 unsigned i;
11265
11266 switch (size)
11267 {
11268 case 0: /* 8-bit operations. */
11269 if (all)
11270 for (i = 0; i < (16 * N); i++)
11271 aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
11272 aarch64_get_mem_u8 (cpu, address + i));
11273 else
11274 for (i = 0; i < (8 * N); i++)
11275 aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
11276 aarch64_get_mem_u8 (cpu, address + i));
11277 return;
11278
11279 case 1: /* 16-bit operations. */
11280 if (all)
11281 for (i = 0; i < (8 * N); i++)
11282 aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
11283 aarch64_get_mem_u16 (cpu, address + i * 2));
11284 else
11285 for (i = 0; i < (4 * N); i++)
11286 aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
11287 aarch64_get_mem_u16 (cpu, address + i * 2));
11288 return;
11289
11290 case 2: /* 32-bit operations. */
11291 if (all)
11292 for (i = 0; i < (4 * N); i++)
11293 aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
11294 aarch64_get_mem_u32 (cpu, address + i * 4));
11295 else
11296 for (i = 0; i < (2 * N); i++)
11297 aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
11298 aarch64_get_mem_u32 (cpu, address + i * 4));
11299 return;
11300
11301 case 3: /* 64-bit operations. */
11302 if (all)
11303 for (i = 0; i < (2 * N); i++)
11304 aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
11305 aarch64_get_mem_u64 (cpu, address + i * 8));
11306 else
11307 for (i = 0; i < N; i++)
11308 aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
11309 aarch64_get_mem_u64 (cpu, address + i * 8));
11310 return;
11311 }
11312 }
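
/* Illustrative example (not built) of the indexing scheme above: for a
   full-width (all == 1) 16-bit load with N == 2, stream element i lands in
   register vec_reg (vd, i >> 3), lane i & 7, so the first eight halfwords
   fill Vd completely before Vd+1 is touched.  This is the consecutive
   LD1-style layout; see the FIXME comments on the LD1_<n> helpers below.  */
#if 0
#include <stdio.h>

static void
ex_vec_load_layout (void)
{
  int i;

  /* Map stream index -> (register offset, lane) for all==1, 16-bit, N==2.  */
  for (i = 0; i < 16; i++)
    printf ("element %2d -> V(vd+%d)[%d]\n", i, i >> 3, i & 7);
}
#endif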
11313
11314 /* LD4: load multiple 4-element structures to four consecutive registers.  */
11315 static void
11316 LD4 (sim_cpu *cpu, uint64_t address)
11317 {
11318 vec_load (cpu, address, 4);
11319 }
11320
11321 /* LD3: load multiple 3-element structures to three consecutive registers. */
11322 static void
11323 LD3 (sim_cpu *cpu, uint64_t address)
11324 {
11325 vec_load (cpu, address, 3);
11326 }
11327
11328 /* LD2: load multiple 2-element structures to two consecutive registers. */
11329 static void
11330 LD2 (sim_cpu *cpu, uint64_t address)
11331 {
11332 vec_load (cpu, address, 2);
11333 }
11334
11335 /* Load multiple 1-element structures into one register. */
11336 static void
11337 LD1_1 (sim_cpu *cpu, uint64_t address)
11338 {
11339 int all = INSTR (30, 30);
11340 unsigned size = INSTR (11, 10);
11341 unsigned vd = INSTR (4, 0);
11342 unsigned i;
11343
11344 switch (size)
11345 {
11346 case 0:
11347 /* LD1 {Vd.16b}, addr, #16 */
11348 /* LD1 {Vd.8b}, addr, #8 */
11349 for (i = 0; i < (all ? 16 : 8); i++)
11350 aarch64_set_vec_u8 (cpu, vd, i,
11351 aarch64_get_mem_u8 (cpu, address + i));
11352 return;
11353
11354 case 1:
11355 /* LD1 {Vd.8h}, addr, #16 */
11356 /* LD1 {Vd.4h}, addr, #8 */
11357 for (i = 0; i < (all ? 8 : 4); i++)
11358 aarch64_set_vec_u16 (cpu, vd, i,
11359 aarch64_get_mem_u16 (cpu, address + i * 2));
11360 return;
11361
11362 case 2:
11363 /* LD1 {Vd.4s}, addr, #16 */
11364 /* LD1 {Vd.2s}, addr, #8 */
11365 for (i = 0; i < (all ? 4 : 2); i++)
11366 aarch64_set_vec_u32 (cpu, vd, i,
11367 aarch64_get_mem_u32 (cpu, address + i * 4));
11368 return;
11369
11370 case 3:
11371 /* LD1 {Vd.2d}, addr, #16 */
11372 /* LD1 {Vd.1d}, addr, #8 */
11373 for (i = 0; i < (all ? 2 : 1); i++)
11374 aarch64_set_vec_u64 (cpu, vd, i,
11375 aarch64_get_mem_u64 (cpu, address + i * 8));
11376 return;
11377 }
11378 }
11379
11380 /* Load multiple 1-element structures into two registers. */
11381 static void
11382 LD1_2 (sim_cpu *cpu, uint64_t address)
11383 {
11384   /* FIXME: This shares vec_load with LD2, but the two are not
11385      architecturally identical: LD2 de-interleaves elements across the
11386      two registers, while LD1 (the layout vec_load implements) does not.  */
11387 vec_load (cpu, address, 2);
11388 }
11389
11390 /* Load multiple 1-element structures into three registers. */
11391 static void
11392 LD1_3 (sim_cpu *cpu, uint64_t address)
11393 {
11394   /* FIXME: This shares vec_load with LD3, but the two are not
11395      architecturally identical: LD3 de-interleaves elements across the
11396      three registers, while LD1 (the layout vec_load implements) does not.  */
11397 vec_load (cpu, address, 3);
11398 }
11399
11400 /* Load multiple 1-element structures into four registers. */
11401 static void
11402 LD1_4 (sim_cpu *cpu, uint64_t address)
11403 {
11404   /* FIXME: This shares vec_load with LD4, but the two are not
11405      architecturally identical: LD4 de-interleaves elements across the
11406      four registers, while LD1 (the layout vec_load implements) does not.  */
11407 vec_load (cpu, address, 4);
11408 }
11409
11410 /* Store multiple N-element structures from N consecutive registers.  */
11411 static void
11412 vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
11413 {
11414 int all = INSTR (30, 30);
11415 unsigned size = INSTR (11, 10);
11416 unsigned vd = INSTR (4, 0);
11417 unsigned i;
11418
11419 switch (size)
11420 {
11421 case 0: /* 8-bit operations. */
11422 if (all)
11423 for (i = 0; i < (16 * N); i++)
11424 aarch64_set_mem_u8
11425 (cpu, address + i,
11426 aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
11427 else
11428 for (i = 0; i < (8 * N); i++)
11429 aarch64_set_mem_u8
11430 (cpu, address + i,
11431 aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
11432 return;
11433
11434 case 1: /* 16-bit operations. */
11435 if (all)
11436 for (i = 0; i < (8 * N); i++)
11437 aarch64_set_mem_u16
11438 (cpu, address + i * 2,
11439 aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
11440 else
11441 for (i = 0; i < (4 * N); i++)
11442 aarch64_set_mem_u16
11443 (cpu, address + i * 2,
11444 aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
11445 return;
11446
11447 case 2: /* 32-bit operations. */
11448 if (all)
11449 for (i = 0; i < (4 * N); i++)
11450 aarch64_set_mem_u32
11451 (cpu, address + i * 4,
11452 aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
11453 else
11454 for (i = 0; i < (2 * N); i++)
11455 aarch64_set_mem_u32
11456 (cpu, address + i * 4,
11457 aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
11458 return;
11459
11460 case 3: /* 64-bit operations. */
11461 if (all)
11462 for (i = 0; i < (2 * N); i++)
11463 aarch64_set_mem_u64
11464 (cpu, address + i * 8,
11465 aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
11466 else
11467 for (i = 0; i < N; i++)
11468 aarch64_set_mem_u64
11469 (cpu, address + i * 8,
11470 aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
11471 return;
11472 }
11473 }
11474
11475 /* Store multiple 4-element structures from four consecutive registers.  */
11476 static void
11477 ST4 (sim_cpu *cpu, uint64_t address)
11478 {
11479 vec_store (cpu, address, 4);
11480 }
11481
11482 /* Store multiple 3-element structures from three consecutive registers.  */
11483 static void
11484 ST3 (sim_cpu *cpu, uint64_t address)
11485 {
11486 vec_store (cpu, address, 3);
11487 }
11488
11489 /* Store multiple 2-element structures from two consecutive registers.  */
11490 static void
11491 ST2 (sim_cpu *cpu, uint64_t address)
11492 {
11493 vec_store (cpu, address, 2);
11494 }
11495
11496 /* Store multiple 1-element structures from one register.  */
11497 static void
11498 ST1_1 (sim_cpu *cpu, uint64_t address)
11499 {
11500 int all = INSTR (30, 30);
11501 unsigned size = INSTR (11, 10);
11502 unsigned vd = INSTR (4, 0);
11503 unsigned i;
11504
11505 switch (size)
11506 {
11507 case 0:
11508 for (i = 0; i < (all ? 16 : 8); i++)
11509 aarch64_set_mem_u8 (cpu, address + i,
11510 aarch64_get_vec_u8 (cpu, vd, i));
11511 return;
11512
11513 case 1:
11514 for (i = 0; i < (all ? 8 : 4); i++)
11515 aarch64_set_mem_u16 (cpu, address + i * 2,
11516 aarch64_get_vec_u16 (cpu, vd, i));
11517 return;
11518
11519 case 2:
11520 for (i = 0; i < (all ? 4 : 2); i++)
11521 aarch64_set_mem_u32 (cpu, address + i * 4,
11522 aarch64_get_vec_u32 (cpu, vd, i));
11523 return;
11524
11525 case 3:
11526 for (i = 0; i < (all ? 2 : 1); i++)
11527 aarch64_set_mem_u64 (cpu, address + i * 8,
11528 aarch64_get_vec_u64 (cpu, vd, i));
11529 return;
11530 }
11531 }
11532
11533 /* Store multiple 1-element structures from two registers.  */
11534 static void
11535 ST1_2 (sim_cpu *cpu, uint64_t address)
11536 {
11537   /* FIXME: This shares vec_store with ST2, but the two are not
11538      architecturally identical: ST2 interleaves elements from the two
11539      registers, while ST1 (the layout vec_store implements) does not.  */
11540 vec_store (cpu, address, 2);
11541 }
11542
11543 /* Store multiple 1-element structures from three registers.  */
11544 static void
11545 ST1_3 (sim_cpu *cpu, uint64_t address)
11546 {
11547   /* FIXME: This shares vec_store with ST3, but the two are not
11548      architecturally identical: ST3 interleaves elements from the three
11549      registers, while ST1 (the layout vec_store implements) does not.  */
11550 vec_store (cpu, address, 3);
11551 }
11552
11553 /* Store multiple 1-element structures from four registers.  */
11554 static void
11555 ST1_4 (sim_cpu *cpu, uint64_t address)
11556 {
11557   /* FIXME: This shares vec_store with ST4, but the two are not
11558      architecturally identical: ST4 interleaves elements from the four
11559      registers, while ST1 (the layout vec_store implements) does not.  */
11560 vec_store (cpu, address, 4);
11561 }
11562
11563 #define LDn_STn_SINGLE_LANE_AND_SIZE() \
11564 do \
11565 { \
11566 switch (INSTR (15, 14)) \
11567 { \
11568 case 0: \
11569 lane = (full << 3) | (s << 2) | size; \
11570 size = 0; \
11571 break; \
11572 \
11573 case 1: \
11574 if ((size & 1) == 1) \
11575 HALT_UNALLOC; \
11576 lane = (full << 2) | (s << 1) | (size >> 1); \
11577 size = 1; \
11578 break; \
11579 \
11580 case 2: \
11581 if ((size & 2) == 2) \
11582 HALT_UNALLOC; \
11583 \
11584 if ((size & 1) == 0) \
11585 { \
11586 lane = (full << 1) | s; \
11587 size = 2; \
11588 } \
11589 else \
11590 { \
11591 if (s) \
11592 HALT_UNALLOC; \
11593 lane = full; \
11594 size = 3; \
11595 } \
11596 break; \
11597 \
11598 default: \
11599 HALT_UNALLOC; \
11600 } \
11601 } \
11602 while (0)
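
/* Worked example of the macro above: "ld1 {v2.b}[11], [x0]" has
   instr[15,14] = 0 (byte element), Q = 1, S = 0, size = 3, so
   lane = (1 << 3) | (0 << 2) | 3 = 11 and size collapses to 0 (one byte).
   A standalone rendering of the byte case (name invented, not built):  */
#if 0
static unsigned
ex_byte_lane (unsigned q, unsigned s, unsigned size)
{
  return (q << 3) | (s << 2) | size;	/* The INSTR (15, 14) == 0 case.  */
}
#endif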
11603
11604 /* Load single structure into one lane of N registers. */
11605 static void
11606 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
11607 {
11608 /* instr[31] = 0
11609 instr[30] = element selector 0=>half, 1=>all elements
11610 instr[29,24] = 00 1101
11611 instr[23] = 0=>simple, 1=>post
11612 instr[22] = 1
11613 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11614 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11615 11111 (immediate post inc)
11616 instr[15,13] = opcode
11617 instr[12] = S, used for lane number
11618 instr[11,10] = size, also used for lane number
11619 instr[9,5] = address
11620 instr[4,0] = Vd */
11621
11622 unsigned full = INSTR (30, 30);
11623 unsigned vd = INSTR (4, 0);
11624 unsigned size = INSTR (11, 10);
11625 unsigned s = INSTR (12, 12);
11626 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11627 int lane = 0;
11628 int i;
11629
11630 NYI_assert (29, 24, 0x0D);
11631 NYI_assert (22, 22, 1);
11632
11633 /* Compute the lane number first (using size), and then compute size. */
11634 LDn_STn_SINGLE_LANE_AND_SIZE ();
11635
11636 for (i = 0; i < nregs; i++)
11637 switch (size)
11638 {
11639 case 0:
11640 {
11641 uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
11642 aarch64_set_vec_u8 (cpu, vd + i, lane, val);
11643 break;
11644 }
11645
11646 case 1:
11647 {
11648 uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
11649 aarch64_set_vec_u16 (cpu, vd + i, lane, val);
11650 break;
11651 }
11652
11653 case 2:
11654 {
11655 uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
11656 aarch64_set_vec_u32 (cpu, vd + i, lane, val);
11657 break;
11658 }
11659
11660 case 3:
11661 {
11662 uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
11663 aarch64_set_vec_u64 (cpu, vd + i, lane, val);
11664 break;
11665 }
11666 }
11667 }
11668
11669 /* Store single structure from one lane from N registers. */
11670 static void
11671 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
11672 {
11673 /* instr[31] = 0
11674 instr[30] = element selector 0=>half, 1=>all elements
11675 instr[29,24] = 00 1101
11676 instr[23] = 0=>simple, 1=>post
11677 instr[22] = 0
11678 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11679 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11680 11111 (immediate post inc)
11681 instr[15,13] = opcode
11682 instr[12] = S, used for lane number
11683 instr[11,10] = size, also used for lane number
11684 instr[9,5] = address
11685 instr[4,0] = Vd */
11686
11687 unsigned full = INSTR (30, 30);
11688 unsigned vd = INSTR (4, 0);
11689 unsigned size = INSTR (11, 10);
11690 unsigned s = INSTR (12, 12);
11691 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11692 int lane = 0;
11693 int i;
11694
11695 NYI_assert (29, 24, 0x0D);
11696 NYI_assert (22, 22, 0);
11697
11698 /* Compute the lane number first (using size), and then compute size. */
11699 LDn_STn_SINGLE_LANE_AND_SIZE ();
11700
11701 for (i = 0; i < nregs; i++)
11702 switch (size)
11703 {
11704 case 0:
11705 {
11706 uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
11707 aarch64_set_mem_u8 (cpu, address + i, val);
11708 break;
11709 }
11710
11711 case 1:
11712 {
11713 uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
11714 aarch64_set_mem_u16 (cpu, address + (i * 2), val);
11715 break;
11716 }
11717
11718 case 2:
11719 {
11720 uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
11721 aarch64_set_mem_u32 (cpu, address + (i * 4), val);
11722 break;
11723 }
11724
11725 case 3:
11726 {
11727 uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
11728 aarch64_set_mem_u64 (cpu, address + (i * 8), val);
11729 break;
11730 }
11731 }
11732 }
11733
11734 /* Load single structure into all lanes of N registers. */
11735 static void
11736 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11737 {
11738 /* instr[31] = 0
11739 instr[30] = element selector 0=>half, 1=>all elements
11740 instr[29,24] = 00 1101
11741 instr[23] = 0=>simple, 1=>post
11742 instr[22] = 1
11743 instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11744 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11745 11111 (immediate post inc)
11746 instr[15,14] = 11
11747 instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11748 instr[12] = 0
11749 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11750 10=> word(s), 11=> double(d)
11751 instr[9,5] = address
11752 instr[4,0] = Vd */
11753
11754 unsigned full = INSTR (30, 30);
11755 unsigned vd = INSTR (4, 0);
11756 unsigned size = INSTR (11, 10);
11757 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11758 int i, n;
11759
11760 NYI_assert (29, 24, 0x0D);
11761 NYI_assert (22, 22, 1);
11762 NYI_assert (15, 14, 3);
11763 NYI_assert (12, 12, 0);
11764
11765 for (n = 0; n < nregs; n++)
11766 switch (size)
11767 {
11768 case 0:
11769 {
11770 uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
11771 for (i = 0; i < (full ? 16 : 8); i++)
11772 aarch64_set_vec_u8 (cpu, vd + n, i, val);
11773 break;
11774 }
11775
11776 case 1:
11777 {
11778 uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
11779 for (i = 0; i < (full ? 8 : 4); i++)
11780 aarch64_set_vec_u16 (cpu, vd + n, i, val);
11781 break;
11782 }
11783
11784 case 2:
11785 {
11786 uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
11787 for (i = 0; i < (full ? 4 : 2); i++)
11788 aarch64_set_vec_u32 (cpu, vd + n, i, val);
11789 break;
11790 }
11791
11792 case 3:
11793 {
11794 uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
11795 for (i = 0; i < (full ? 2 : 1); i++)
11796 aarch64_set_vec_u64 (cpu, vd + n, i, val);
11797 break;
11798 }
11799
11800 default:
11801 HALT_UNALLOC;
11802 }
11803 }
11804
11805 static void
11806 do_vec_load_store (sim_cpu *cpu)
11807 {
11808 /* {LD|ST}<N> {Vd..Vd+N}, vaddr
11809
11810 instr[31] = 0
11811 instr[30] = element selector 0=>half, 1=>all elements
11812 instr[29,25] = 00110
11813 instr[24] = 0=>multiple struct, 1=>single struct
11814 instr[23] = 0=>simple, 1=>post
11815 instr[22] = 0=>store, 1=>load
11816 instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
11817 instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11818 11111 (immediate post inc)
11819 instr[15,12] = elements and destinations. eg for load:
11820 0000=>LD4 => load multiple 4-element to
11821 four consecutive registers
11822 0100=>LD3 => load multiple 3-element to
11823 three consecutive registers
11824 1000=>LD2 => load multiple 2-element to
11825 two consecutive registers
11826 0010=>LD1 => load multiple 1-element to
11827 four consecutive registers
11828 0110=>LD1 => load multiple 1-element to
11829 three consecutive registers
11830 1010=>LD1 => load multiple 1-element to
11831 two consecutive registers
11832 0111=>LD1 => load multiple 1-element to
11833 one register
11834                      1100=>LD1R,LD2R
11835                      1110=>LD3R,LD4R
11836 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11837 10=> word(s), 11=> double(d)
11838 instr[9,5] = Vn, can be SP
11839 instr[4,0] = Vd */
11840
11841 int single;
11842 int post;
11843 int load;
11844 unsigned vn;
11845 uint64_t address;
11846 int type;
11847
11848 if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
11849 HALT_NYI;
11850
11851 single = INSTR (24, 24);
11852 post = INSTR (23, 23);
11853 load = INSTR (22, 22);
11854 type = INSTR (15, 12);
11855 vn = INSTR (9, 5);
11856 address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
11857
11858 if (! single && INSTR (21, 21) != 0)
11859 HALT_UNALLOC;
11860
11861 if (post)
11862 {
11863 unsigned vm = INSTR (20, 16);
11864
11865 if (vm == R31)
11866 {
11867 unsigned sizeof_operation;
11868
11869 if (single)
11870 {
11871 if ((type >= 0) && (type <= 11))
11872 {
11873 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11874 switch (INSTR (15, 14))
11875 {
11876 case 0:
11877 sizeof_operation = nregs * 1;
11878 break;
11879 case 1:
11880 sizeof_operation = nregs * 2;
11881 break;
11882 case 2:
11883 if (INSTR (10, 10) == 0)
11884 sizeof_operation = nregs * 4;
11885 else
11886 sizeof_operation = nregs * 8;
11887 break;
11888 default:
11889 HALT_UNALLOC;
11890 }
11891 }
11892 else if (type == 0xC)
11893 {
11894 sizeof_operation = INSTR (21, 21) ? 2 : 1;
11895 sizeof_operation <<= INSTR (11, 10);
11896 }
11897 else if (type == 0xE)
11898 {
11899 sizeof_operation = INSTR (21, 21) ? 4 : 3;
11900 sizeof_operation <<= INSTR (11, 10);
11901 }
11902 else
11903 HALT_UNALLOC;
11904 }
11905 else
11906 {
11907 switch (type)
11908 {
11909 case 0: sizeof_operation = 32; break;
11910 case 4: sizeof_operation = 24; break;
11911 case 8: sizeof_operation = 16; break;
11912
11913 case 7:
11914 /* One register, immediate offset variant. */
11915 sizeof_operation = 8;
11916 break;
11917
11918 case 10:
11919 /* Two registers, immediate offset variant. */
11920 sizeof_operation = 16;
11921 break;
11922
11923 case 6:
11924 /* Three registers, immediate offset variant. */
11925 sizeof_operation = 24;
11926 break;
11927
11928 case 2:
11929 /* Four registers, immediate offset variant. */
11930 sizeof_operation = 32;
11931 break;
11932
11933 default:
11934 HALT_UNALLOC;
11935 }
11936
11937 if (INSTR (30, 30))
11938 sizeof_operation *= 2;
11939 }
11940
11941 aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
11942 }
11943 else
11944 aarch64_set_reg_u64 (cpu, vn, SP_OK,
11945 address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
11946 }
11947 else
11948 {
11949 NYI_assert (20, 16, 0);
11950 }
11951
11952 if (single)
11953 {
11954 if (load)
11955 {
11956 if ((type >= 0) && (type <= 11))
11957 do_vec_LDn_single (cpu, address);
11958 else if ((type == 0xC) || (type == 0xE))
11959 do_vec_LDnR (cpu, address);
11960 else
11961 HALT_UNALLOC;
11962 return;
11963 }
11964
11965 /* Stores. */
11966 if ((type >= 0) && (type <= 11))
11967 {
11968 do_vec_STn_single (cpu, address);
11969 return;
11970 }
11971
11972 HALT_UNALLOC;
11973 }
11974
11975 if (load)
11976 {
11977 switch (type)
11978 {
11979 case 0: LD4 (cpu, address); return;
11980 case 4: LD3 (cpu, address); return;
11981 case 8: LD2 (cpu, address); return;
11982 case 2: LD1_4 (cpu, address); return;
11983 case 6: LD1_3 (cpu, address); return;
11984 case 10: LD1_2 (cpu, address); return;
11985 case 7: LD1_1 (cpu, address); return;
11986
11987 default:
11988 HALT_UNALLOC;
11989 }
11990 }
11991
11992 /* Stores. */
11993 switch (type)
11994 {
11995 case 0: ST4 (cpu, address); return;
11996 case 4: ST3 (cpu, address); return;
11997 case 8: ST2 (cpu, address); return;
11998 case 2: ST1_4 (cpu, address); return;
11999 case 6: ST1_3 (cpu, address); return;
12000 case 10: ST1_2 (cpu, address); return;
12001 case 7: ST1_1 (cpu, address); return;
12002 default:
12003 HALT_UNALLOC;
12004 }
12005 }
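
/* Worked example of the post-increment sizing above: for the immediate
   post-indexed "ld4 {v0.16b-v3.16b}, [x0], #64" the multiple-structure
   branch picks sizeof_operation = 32 (type 0) and doubles it for the
   full-width form, so X0 advances by 64; for "ld1r {v0.8b}, [x0], #1"
   the LDnR branch computes 1 << 0 = 1 byte.  */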
12006
12007 static void
12008 dexLdSt (sim_cpu *cpu)
12009 {
12010 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
12011 assert group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
12012 group == GROUP_LDST_1100 || group == GROUP_LDST_1110
12013 bits [29,28:26] of a LS are the secondary dispatch vector. */
12014 uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
12015
12016 switch (group2)
12017 {
12018 case LS_EXCL_000:
12019 dexLoadExclusive (cpu); return;
12020
12021 case LS_LIT_010:
12022 case LS_LIT_011:
12023 dexLoadLiteral (cpu); return;
12024
12025 case LS_OTHER_110:
12026 case LS_OTHER_111:
12027 dexLoadOther (cpu); return;
12028
12029 case LS_ADVSIMD_001:
12030 do_vec_load_store (cpu); return;
12031
12032 case LS_PAIR_100:
12033 dex_load_store_pair_gr (cpu); return;
12034
12035 case LS_PAIR_101:
12036 dex_load_store_pair_fp (cpu); return;
12037
12038 default:
12039 /* Should never reach here. */
12040 HALT_NYI;
12041 }
12042 }
12043
12044 /* Specific decode and execute for group Data Processing Register. */
12045
12046 static void
12047 dexLogicalShiftedRegister (sim_cpu *cpu)
12048 {
12049 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12050 instr[30,29] = op
12051 instr[28:24] = 01010
12052 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12053 instr[21] = N
12054 instr[20,16] = Rm
12055 instr[15,10] = count : must be 0xxxxx for 32 bit
12056 instr[9,5] = Rn
12057 instr[4,0] = Rd */
12058
12059 uint32_t size = INSTR (31, 31);
12060 Shift shiftType = INSTR (23, 22);
12061 uint32_t count = INSTR (15, 10);
12062
12063 /* 32 bit operations must have count[5] = 0.
12064 or else we have an UNALLOC. */
12065 if (size == 0 && uimm (count, 5, 5))
12066 HALT_UNALLOC;
12067
12068 /* Dispatch on size:op:N. */
12069 switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12070 {
12071 case 0: and32_shift (cpu, shiftType, count); return;
12072 case 1: bic32_shift (cpu, shiftType, count); return;
12073 case 2: orr32_shift (cpu, shiftType, count); return;
12074 case 3: orn32_shift (cpu, shiftType, count); return;
12075 case 4: eor32_shift (cpu, shiftType, count); return;
12076 case 5: eon32_shift (cpu, shiftType, count); return;
12077 case 6: ands32_shift (cpu, shiftType, count); return;
12078 case 7: bics32_shift (cpu, shiftType, count); return;
12079 case 8: and64_shift (cpu, shiftType, count); return;
12080 case 9: bic64_shift (cpu, shiftType, count); return;
12081 case 10:orr64_shift (cpu, shiftType, count); return;
12082 case 11:orn64_shift (cpu, shiftType, count); return;
12083 case 12:eor64_shift (cpu, shiftType, count); return;
12084 case 13:eon64_shift (cpu, shiftType, count); return;
12085 case 14:ands64_shift (cpu, shiftType, count); return;
12086 case 15:bics64_shift (cpu, shiftType, count); return;
12087 }
12088 }
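
/* Illustrative sketch (not built): the size:op:N dispatch above separates
   the eight logical ops; e.g. for "bic w0, w1, w2" size = 0, op = 00 and
   N = 1, giving dispatch 1 and hence bic32_shift.  Helper names invented.  */
#if 0
#include <stdint.h>

static uint32_t
ex_logical_dispatch (uint32_t insn)
{
  uint32_t size_op = (insn >> 29) & 0x7;	/* instr[31,29].  */
  uint32_t n = (insn >> 21) & 0x1;		/* instr[21].  */

  return (size_op << 1) | n;
}
#endif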
12089
12090 /* 32 bit conditional select. */
12091 static void
12092 csel32 (sim_cpu *cpu, CondCode cc)
12093 {
12094 unsigned rm = INSTR (20, 16);
12095 unsigned rn = INSTR (9, 5);
12096 unsigned rd = INSTR (4, 0);
12097
12098 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12099 testConditionCode (cpu, cc)
12100 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12101 : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12102 }
12103
12104 /* 64 bit conditional select. */
12105 static void
12106 csel64 (sim_cpu *cpu, CondCode cc)
12107 {
12108 unsigned rm = INSTR (20, 16);
12109 unsigned rn = INSTR (9, 5);
12110 unsigned rd = INSTR (4, 0);
12111
12112 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12113 testConditionCode (cpu, cc)
12114 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12115 : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12116 }
12117
12118 /* 32 bit conditional increment. */
12119 static void
12120 csinc32 (sim_cpu *cpu, CondCode cc)
12121 {
12122 unsigned rm = INSTR (20, 16);
12123 unsigned rn = INSTR (9, 5);
12124 unsigned rd = INSTR (4, 0);
12125
12126 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12127 testConditionCode (cpu, cc)
12128 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12129 : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12130 }
12131
12132 /* 64 bit conditional increment. */
12133 static void
12134 csinc64 (sim_cpu *cpu, CondCode cc)
12135 {
12136 unsigned rm = INSTR (20, 16);
12137 unsigned rn = INSTR (9, 5);
12138 unsigned rd = INSTR (4, 0);
12139
12140 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12141 testConditionCode (cpu, cc)
12142 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12143 : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12144 }
12145
12146 /* 32 bit conditional invert. */
12147 static void
12148 csinv32 (sim_cpu *cpu, CondCode cc)
12149 {
12150 unsigned rm = INSTR (20, 16);
12151 unsigned rn = INSTR (9, 5);
12152 unsigned rd = INSTR (4, 0);
12153
12154 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12155 testConditionCode (cpu, cc)
12156 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12157 : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12158 }
12159
12160 /* 64 bit conditional invert. */
12161 static void
12162 csinv64 (sim_cpu *cpu, CondCode cc)
12163 {
12164 unsigned rm = INSTR (20, 16);
12165 unsigned rn = INSTR (9, 5);
12166 unsigned rd = INSTR (4, 0);
12167
12168 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12169 testConditionCode (cpu, cc)
12170 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12171 : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12172 }
12173
12174 /* 32 bit conditional negate. */
12175 static void
12176 csneg32 (sim_cpu *cpu, CondCode cc)
12177 {
12178 unsigned rm = INSTR (20, 16);
12179 unsigned rn = INSTR (9, 5);
12180 unsigned rd = INSTR (4, 0);
12181
12182 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12183 testConditionCode (cpu, cc)
12184 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12185 : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12186 }
12187
12188 /* 64 bit conditional negate. */
12189 static void
12190 csneg64 (sim_cpu *cpu, CondCode cc)
12191 {
12192 unsigned rm = INSTR (20, 16);
12193 unsigned rn = INSTR (9, 5);
12194 unsigned rd = INSTR (4, 0);
12195
12196 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12197 testConditionCode (cpu, cc)
12198 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12199 : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12200 }
12201
12202 static void
12203 dexCondSelect (sim_cpu *cpu)
12204 {
12205   /* instr[28,21] = 11010100
12206      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12207      instr[30],instr[11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12208                               100 ==> CSINV, 101 ==> CSNEG,
12209                               _1_ ==> UNALLOC
12210      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12211      instr[20,16] = Rm
12212      instr[15,12] = cond  */
12213
12214 CondCode cc = INSTR (15, 12);
12215 uint32_t S = INSTR (29, 29);
12216 uint32_t op2 = INSTR (11, 10);
12217
12218 if (S == 1)
12219 HALT_UNALLOC;
12220
12221 if (op2 & 0x2)
12222 HALT_UNALLOC;
12223
12224 switch ((INSTR (31, 30) << 1) | op2)
12225 {
12226 case 0: csel32 (cpu, cc); return;
12227 case 1: csinc32 (cpu, cc); return;
12228 case 2: csinv32 (cpu, cc); return;
12229 case 3: csneg32 (cpu, cc); return;
12230 case 4: csel64 (cpu, cc); return;
12231 case 5: csinc64 (cpu, cc); return;
12232 case 6: csinv64 (cpu, cc); return;
12233 case 7: csneg64 (cpu, cc); return;
12234 }
12235 }
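
/* Illustrative note: the dispatch above also covers the usual aliases,
   e.g. "cinc w0, w1, eq" is CSINC with Rm == Rn and the inverted
   condition, so it lands in csinc32 like any other CSINC.  A plain-C
   rendering of the four selects (names invented, not built):  */
#if 0
#include <stdint.h>

static uint32_t
ex_cond_select_32 (int cond_holds, uint32_t n, uint32_t m, unsigned op)
{
  if (cond_holds)
    return n;

  switch (op)
    {
    case 0: return m;		/* CSEL  */
    case 1: return m + 1;	/* CSINC */
    case 2: return ~m;		/* CSINV */
    default: return 0u - m;	/* CSNEG */
    }
}
#endif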
12236
12237 /* Some helpers for counting leading 1 or 0 bits. */
12238
12239 /* Counts the number of leading bits which are the same
12240 in a 32 bit value in the range 1 to 32. */
12241 static uint32_t
12242 leading32 (uint32_t value)
12243 {
12244   int32_t mask = 0xffff0000;
12245   uint32_t count = 16; /* Counts number of bits set in mask.  */
12246 uint32_t lo = 1; /* Lower bound for number of sign bits. */
12247 uint32_t hi = 32; /* Upper bound for number of sign bits. */
12248
12249 while (lo + 1 < hi)
12250 {
12251 int32_t test = (value & mask);
12252
12253 if (test == 0 || test == mask)
12254 {
12255 lo = count;
12256 count = (lo + hi) / 2;
12257 mask >>= (count - lo);
12258 }
12259 else
12260 {
12261 hi = count;
12262 count = (lo + hi) / 2;
12263 mask <<= hi - count;
12264 }
12265 }
12266
12267 if (lo != hi)
12268 {
12269 int32_t test;
12270
12271 mask >>= 1;
12272 test = (value & mask);
12273
12274 if (test == 0 || test == mask)
12275 count = hi;
12276 else
12277 count = lo;
12278 }
12279
12280 return count;
12281 }
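
/* Worked examples for leading32 (the count is how many of the most
   significant bits are identical, in the range 1 to 32):
     leading32 (0x00000001) == 31   (31 leading zeros, then a one)
     leading32 (0xFFFFFFFE) == 31   (31 leading ones, then a zero)
     leading32 (0x80000000) == 1    (the sign bit differs from bit 30).
   A cross-check against the obvious linear scan (sketch, not built):  */
#if 0
#include <stdint.h>
#include <assert.h>

static uint32_t
ex_leading32_ref (uint32_t v)
{
  uint32_t top = v >> 31, n = 1;
  int i;

  for (i = 30; i >= 0 && ((v >> i) & 1) == top; i--)
    n++;

  return n;
}

static void
ex_leading32_check (void)
{
  assert (ex_leading32_ref (0x00000001) == 31);
  assert (ex_leading32_ref (0x80000000) == 1);
}
#endif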
12282
12283 /* Counts the number of leading bits which are the same
12284 in a 64 bit value in the range 1 to 64. */
12285 static uint64_t
12286 leading64 (uint64_t value)
12287 {
12288   int64_t mask = 0xffffffff00000000LL;
12289 uint64_t count = 32; /* Counts number of bits set in mask. */
12290 uint64_t lo = 1; /* Lower bound for number of sign bits. */
12291 uint64_t hi = 64; /* Upper bound for number of sign bits. */
12292
12293 while (lo + 1 < hi)
12294 {
12295 int64_t test = (value & mask);
12296
12297 if (test == 0 || test == mask)
12298 {
12299 lo = count;
12300 count = (lo + hi) / 2;
12301 mask >>= (count - lo);
12302 }
12303 else
12304 {
12305 hi = count;
12306 count = (lo + hi) / 2;
12307 mask <<= hi - count;
12308 }
12309 }
12310
12311 if (lo != hi)
12312 {
12313 int64_t test;
12314
12315 mask >>= 1;
12316 test = (value & mask);
12317
12318 if (test == 0 || test == mask)
12319 count = hi;
12320 else
12321 count = lo;
12322 }
12323
12324 return count;
12325 }
12326
12327 /* Bit operations. */
12328 /* N.B register args may not be SP. */
12329
12330 /* 32 bit count leading sign bits. */
12331 static void
12332 cls32 (sim_cpu *cpu)
12333 {
12334 unsigned rn = INSTR (9, 5);
12335 unsigned rd = INSTR (4, 0);
12336
12337 /* N.B. the result needs to exclude the leading bit. */
12338 aarch64_set_reg_u64
12339 (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12340 }
12341
12342 /* 64 bit count leading sign bits. */
12343 static void
12344 cls64 (sim_cpu *cpu)
12345 {
12346 unsigned rn = INSTR (9, 5);
12347 unsigned rd = INSTR (4, 0);
12348
12349 /* N.B. the result needs to exclude the leading bit. */
12350 aarch64_set_reg_u64
12351 (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12352 }
12353
12354 /* 32 bit count leading zero bits. */
12355 static void
12356 clz32 (sim_cpu *cpu)
12357 {
12358 unsigned rn = INSTR (9, 5);
12359 unsigned rd = INSTR (4, 0);
12360 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12361
12362   /* If the sign (top) bit is set then the count is 0.  */
12363 if (pick32 (value, 31, 31))
12364 aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12365 else
12366 aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12367 }
12368
12369 /* 64 bit count leading zero bits. */
12370 static void
12371 clz64 (sim_cpu *cpu)
12372 {
12373 unsigned rn = INSTR (9, 5);
12374 unsigned rd = INSTR (4, 0);
12375 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12376
12377   /* If the sign (top) bit is set then the count is 0.  */
12378 if (pick64 (value, 63, 63))
12379 aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12380 else
12381 aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12382 }
12383
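/* On GCC/Clang hosts __builtin_clz computes the same leading zero
count without a search; a guarded sketch, since the builtin is not
standard C and is undefined at zero. Illustrative only, unused by
the decoder. */
#ifdef __GNUC__
static uint32_t
clz32_ref (uint32_t value)
{
/* __builtin_clz (0) is undefined, so zero is handled explicitly. */
return value == 0 ? 32 : (uint32_t) __builtin_clz (value);
}
#endif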
12384 /* 32 bit reverse bits. */
12385 static void
12386 rbit32 (sim_cpu *cpu)
12387 {
12388 unsigned rn = INSTR (9, 5);
12389 unsigned rd = INSTR (4, 0);
12390 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12391 uint32_t result = 0;
12392 int i;
12393
12394 for (i = 0; i < 32; i++)
12395 {
12396 result <<= 1;
12397 result |= (value & 1);
12398 value >>= 1;
12399 }
12400 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12401 }
12402
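/* The loop above takes 32 iterations; the classic mask-and-swap
formulation reverses the bits in five steps. A sketch for comparison
only -- rbit32_ref is a hypothetical name and is not called anywhere. */
static uint32_t
rbit32_ref (uint32_t v)
{
v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); /* Swap adjacent bits. */
v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); /* Swap bit pairs. */
v = ((v >> 4) & 0x0f0f0f0f) | ((v & 0x0f0f0f0f) << 4); /* Swap nibbles. */
v = ((v >> 8) & 0x00ff00ff) | ((v & 0x00ff00ff) << 8); /* Swap bytes. */
return (v >> 16) | (v << 16); /* Swap half words. */
}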
12403 /* 64 bit reverse bits. */
12404 static void
12405 rbit64 (sim_cpu *cpu)
12406 {
12407 unsigned rn = INSTR (9, 5);
12408 unsigned rd = INSTR (4, 0);
12409 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12410 uint64_t result = 0;
12411 int i;
12412
12413 for (i = 0; i < 64; i++)
12414 {
12415 result <<= 1;
12416 result |= (value & 1UL);
12417 value >>= 1;
12418 }
12419 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12420 }
12421
12422 /* 32 bit reverse bytes. */
12423 static void
12424 rev32 (sim_cpu *cpu)
12425 {
12426 unsigned rn = INSTR (9, 5);
12427 unsigned rd = INSTR (4, 0);
12428 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12429 uint32_t result = 0;
12430 int i;
12431
12432 for (i = 0; i < 4; i++)
12433 {
12434 result <<= 8;
12435 result |= (value & 0xff);
12436 value >>= 8;
12437 }
12438 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12439 }
12440
12441 /* 64 bit reverse bytes. */
12442 static void
12443 rev64 (sim_cpu *cpu)
12444 {
12445 unsigned rn = INSTR (9, 5);
12446 unsigned rd = INSTR (4, 0);
12447 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12448 uint64_t result = 0;
12449 int i;
12450
12451 for (i = 0; i < 8; i++)
12452 {
12453 result <<= 8;
12454 result |= (value & 0xffULL);
12455 value >>= 8;
12456 }
12457 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12458 }
12459
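/* Architecturally, REV32 on a 64 bit register byte-reverses each of
its two 32 bit words independently; the 64 bit case 10 dispatch in
dexDataProc1Source below reuses rev32 and so only produces the
reversed low word. A sketch of the full-width behaviour under the
hypothetical name rev32_64; it is not wired into the decoder. */
static void
rev32_64 (sim_cpu *cpu)
{
unsigned rn = INSTR (9, 5);
unsigned rd = INSTR (4, 0);
uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
uint64_t result = 0;
int w, i;

/* Byte-reverse the low and high 32 bit words separately. */
for (w = 0; w < 2; w++)
{
uint32_t word = (uint32_t) (value >> (w * 32));
uint32_t rev = 0;

for (i = 0; i < 4; i++)
{
rev <<= 8;
rev |= (word & 0xff);
word >>= 8;
}
result |= ((uint64_t) rev) << (w * 32);
}
aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
}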
12460 /* 32 bit reverse shorts. */
12461 /* N.B. this reverses the order of the bytes in each half word. */
12462 static void
12463 revh32 (sim_cpu *cpu)
12464 {
12465 unsigned rn = INSTR (9, 5);
12466 unsigned rd = INSTR (4, 0);
12467 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12468 uint32_t result = 0;
12469 int i;
12470
12471 for (i = 0; i < 2; i++)
12472 {
12473 result <<= 8;
12474 result |= (value & 0x00ff00ff);
12475 value >>= 8;
12476 }
12477 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12478 }
12479
12480 /* 64 bit reverse shorts. */
12481 /* N.B. this reverses the order of the bytes in each half word. */
12482 static void
12483 revh64 (sim_cpu *cpu)
12484 {
12485 unsigned rn = INSTR (9, 5);
12486 unsigned rd = INSTR (4, 0);
12487 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12488 uint64_t result = 0;
12489 int i;
12490
12491 for (i = 0; i < 2; i++)
12492 {
12493 result <<= 8;
12494 result |= (value & 0x00ff00ff00ff00ffULL);
12495 value >>= 8;
12496 }
12497 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12498 }
12499
12500 static void
12501 dexDataProc1Source (sim_cpu *cpu)
12502 {
12503 /* instr[30] = 1
12504 instr[28,21] = 11010110
12505 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12506 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12507 instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12508 instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
12509 000010 ==> REV (REV32 for 64 bit), 000011 ==> REV (64 bit only),
12510 000100 ==> CLZ, 000101 ==> CLS
12511 ow ==> UNALLOC
12512 instr[9,5] = rn : may not be SP
12513 instr[4,0] = rd : may not be SP. */
12514
12515 uint32_t S = INSTR (29, 29);
12516 uint32_t opcode2 = INSTR (20, 16);
12517 uint32_t opcode = INSTR (15, 10);
12518 uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12519
12520 if (S == 1)
12521 HALT_UNALLOC;
12522
12523 if (opcode2 != 0)
12524 HALT_UNALLOC;
12525
12526 if (opcode & 0x38)
12527 HALT_UNALLOC;
12528
12529 switch (dispatch)
12530 {
12531 case 0: rbit32 (cpu); return;
12532 case 1: revh32 (cpu); return;
12533 case 2: rev32 (cpu); return;
12534 case 4: clz32 (cpu); return;
12535 case 5: cls32 (cpu); return;
12536 case 8: rbit64 (cpu); return;
12537 case 9: revh64 (cpu); return;
12538 case 10: rev32 (cpu); return;
12539 case 11: rev64 (cpu); return;
12540 case 12:clz64 (cpu); return;
12541 case 13:cls64 (cpu); return;
12542 default: HALT_UNALLOC;
12543 }
12544 }
12545
12546 /* Variable shift.
12547 Shifts by count supplied in register.
12548 N.B. register args may not be SP.
12549 These all use the shifted auxiliary function for simplicity
12550 and clarity. Writing the actual shift inline would avoid a
12551 branch and so be faster, but would also necessitate getting
12552 signs right; a sketch of that inline variant follows asrv32. */
12553
12554 /* 32 bit arithmetic shift right. */
12555 static void
12556 asrv32 (sim_cpu *cpu)
12557 {
12558 unsigned rm = INSTR (20, 16);
12559 unsigned rn = INSTR (9, 5);
12560 unsigned rd = INSTR (4, 0);
12561
12562 aarch64_set_reg_u64
12563 (cpu, rd, NO_SP,
12564 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12565 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12566 }
12567
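/* The inline variant alluded to above, as a sketch for the 32 bit
arithmetic case: it relies on the host compiler implementing signed
right shift arithmetically (near-universal, but implementation-defined
in C), which is exactly the "getting signs right" caveat. Hypothetical
name, unused by the decoder. */
static void
asrv32_inline (sim_cpu *cpu)
{
unsigned rm = INSTR (20, 16);
unsigned rn = INSTR (9, 5);
unsigned rd = INSTR (4, 0);
int32_t value = (int32_t) aarch64_get_reg_u32 (cpu, rn, NO_SP);
uint32_t amount = aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f;

/* A signed >> replicates the sign bit on arithmetic-shift hosts. */
aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) (value >> amount));
}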
12568 /* 64 bit arithmetic shift right. */
12569 static void
12570 asrv64 (sim_cpu *cpu)
12571 {
12572 unsigned rm = INSTR (20, 16);
12573 unsigned rn = INSTR (9, 5);
12574 unsigned rd = INSTR (4, 0);
12575
12576 aarch64_set_reg_u64
12577 (cpu, rd, NO_SP,
12578 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12579 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12580 }
12581
12582 /* 32 bit logical shift left. */
12583 static void
12584 lslv32 (sim_cpu *cpu)
12585 {
12586 unsigned rm = INSTR (20, 16);
12587 unsigned rn = INSTR (9, 5);
12588 unsigned rd = INSTR (4, 0);
12589
12590 aarch64_set_reg_u64
12591 (cpu, rd, NO_SP,
12592 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12593 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12594 }
12595
12596 /* 64 bit logical shift left. */
12597 static void
12598 lslv64 (sim_cpu *cpu)
12599 {
12600 unsigned rm = INSTR (20, 16);
12601 unsigned rn = INSTR (9, 5);
12602 unsigned rd = INSTR (4, 0);
12603
12604 aarch64_set_reg_u64
12605 (cpu, rd, NO_SP,
12606 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12607 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12608 }
12609
12610 /* 32 bit logical shift right. */
12611 static void
12612 lsrv32 (sim_cpu *cpu)
12613 {
12614 unsigned rm = INSTR (20, 16);
12615 unsigned rn = INSTR (9, 5);
12616 unsigned rd = INSTR (4, 0);
12617
12618 aarch64_set_reg_u64
12619 (cpu, rd, NO_SP,
12620 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12621 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12622 }
12623
12624 /* 64 bit logical shift right. */
12625 static void
12626 lsrv64 (sim_cpu *cpu)
12627 {
12628 unsigned rm = INSTR (20, 16);
12629 unsigned rn = INSTR (9, 5);
12630 unsigned rd = INSTR (4, 0);
12631
12632 aarch64_set_reg_u64
12633 (cpu, rd, NO_SP,
12634 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12635 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12636 }
12637
12638 /* 32 bit rotate right. */
12639 static void
12640 rorv32 (sim_cpu *cpu)
12641 {
12642 unsigned rm = INSTR (20, 16);
12643 unsigned rn = INSTR (9, 5);
12644 unsigned rd = INSTR (4, 0);
12645
12646 aarch64_set_reg_u64
12647 (cpu, rd, NO_SP,
12648 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12649 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12650 }
12651
12652 /* 64 bit rotate right. */
12653 static void
12654 rorv64 (sim_cpu *cpu)
12655 {
12656 unsigned rm = INSTR (20, 16);
12657 unsigned rn = INSTR (9, 5);
12658 unsigned rd = INSTR (4, 0);
12659
12660 aarch64_set_reg_u64
12661 (cpu, rd, NO_SP,
12662 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12663 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12664 }
12665
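/* Rotation has no C operator; shifted32 presumably composes it from
two shifts. As a sketch, the usual branch-free expression, written so
that a rotate by zero never shifts by the full width (undefined in C).
ror32_ref is a hypothetical helper, unused by the decoder. */
static uint32_t
ror32_ref (uint32_t value, unsigned n)
{
n &= 31;
return (value >> n) | (value << ((32 - n) & 31));
}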
12666
12667 /* Divide. */
12668
12669 /* 32 bit signed divide. */
12670 static void
12671 sdiv32 (sim_cpu *cpu)
12672 {
12673 unsigned rm = INSTR (20, 16);
12674 unsigned rn = INSTR (9, 5);
12675 unsigned rd = INSTR (4, 0);
12676 /* N.B. the pseudo-code does the divide using 64 bit data. */
12677 /* N.B. integer division in C99 truncates towards zero, as required. */
12678 int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12679 int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12680
12681 aarch64_set_reg_s64 (cpu, rd, NO_SP,
12682 divisor ? ((int32_t) (dividend / divisor)) : 0);
12683 }
12684
12685 /* 64 bit signed divide. */
12686 static void
12687 sdiv64 (sim_cpu *cpu)
12688 {
12689 unsigned rm = INSTR (20, 16);
12690 unsigned rn = INSTR (9, 5);
12691 unsigned rd = INSTR (4, 0);
12692 int64_t dividend = aarch64_get_reg_s64 (cpu, rn, NO_SP);
12693 int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
12694 /* C99 division truncates towards zero as required, but INT64_MIN / -1
12695 overflows; doing division by -1 as an unsigned negation makes that
12696 case wrap to INT64_MIN instead of trapping the host. */
12697 aarch64_set_reg_s64 (cpu, rd, NO_SP, divisor == 0 ? 0
12698 : divisor == -1 ? -(uint64_t) dividend : dividend / divisor);
12699 }
12700
12701 /* 32 bit unsigned divide. */
12702 static void
12703 udiv32 (sim_cpu *cpu)
12704 {
12705 unsigned rm = INSTR (20, 16);
12706 unsigned rn = INSTR (9, 5);
12707 unsigned rd = INSTR (4, 0);
12708
12709 /* N.B. the pseudo-code does the divide using 64 bit data. */
12710 uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12711 uint64_t divisor = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12712
12713 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12714 divisor ? (uint32_t) (dividend / divisor) : 0);
12715 }
12716
12717 /* 64 bit unsigned divide. */
12718 static void
12719 udiv64 (sim_cpu *cpu)
12720 {
12721 unsigned rm = INSTR (20, 16);
12722 unsigned rn = INSTR (9, 5);
12723 unsigned rd = INSTR (4, 0);
12724
12725 /* N.B. unsigned division truncates (rounds towards zero) by definition. */
12726 uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12727
12728 aarch64_set_reg_u64
12729 (cpu, rd, NO_SP,
12730 divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12731 }
12732
12733 static void
12734 dexDataProc2Source (sim_cpu *cpu)
12735 {
12736 /* assert instr[30] == 0
12737 instr[28,21] == 11010110
12738 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12739 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12740 instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
12741 001000 ==> LSLV, 001001 ==> LSRV
12742 001010 ==> ASRV, 001011 ==> RORV
12743 ow ==> UNALLOC. */
12744
12745 uint32_t dispatch;
12746 uint32_t S = INSTR (29, 29);
12747 uint32_t opcode = INSTR (15, 10);
12748
12749 if (S == 1)
12750 HALT_UNALLOC;
12751
12752 if (opcode & 0x34)
12753 HALT_UNALLOC;
12754
12755 dispatch = ( (INSTR (31, 31) << 3)
12756 | (uimm (opcode, 3, 3) << 2)
12757 | uimm (opcode, 1, 0));
12758 switch (dispatch)
12759 {
12760 case 2: udiv32 (cpu); return;
12761 case 3: sdiv32 (cpu); return;
12762 case 4: lslv32 (cpu); return;
12763 case 5: lsrv32 (cpu); return;
12764 case 6: asrv32 (cpu); return;
12765 case 7: rorv32 (cpu); return;
12766 case 10: udiv64 (cpu); return;
12767 case 11: sdiv64 (cpu); return;
12768 case 12: lslv64 (cpu); return;
12769 case 13: lsrv64 (cpu); return;
12770 case 14: asrv64 (cpu); return;
12771 case 15: rorv64 (cpu); return;
12772 default: HALT_UNALLOC;
12773 }
12774 }
12775
12776
12777 /* Multiply. */
12778
12779 /* 32 bit multiply and add. */
12780 static void
12781 madd32 (sim_cpu *cpu)
12782 {
12783 unsigned rm = INSTR (20, 16);
12784 unsigned ra = INSTR (14, 10);
12785 unsigned rn = INSTR (9, 5);
12786 unsigned rd = INSTR (4, 0);
12787
12788 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12789 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12790 aarch64_get_reg_u32 (cpu, ra, NO_SP)
12791 + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12792 * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12793 }
12794
12795 /* 64 bit multiply and add. */
12796 static void
12797 madd64 (sim_cpu *cpu)
12798 {
12799 unsigned rm = INSTR (20, 16);
12800 unsigned ra = INSTR (14, 10);
12801 unsigned rn = INSTR (9, 5);
12802 unsigned rd = INSTR (4, 0);
12803
12804 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12805 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12806 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12807 + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12808 * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12809 }
12810
12811 /* 32 bit multiply and sub. */
12812 static void
12813 msub32 (sim_cpu *cpu)
12814 {
12815 unsigned rm = INSTR (20, 16);
12816 unsigned ra = INSTR (14, 10);
12817 unsigned rn = INSTR (9, 5);
12818 unsigned rd = INSTR (4, 0);
12819
12820 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12821 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12822 aarch64_get_reg_u32 (cpu, ra, NO_SP)
12823 - aarch64_get_reg_u32 (cpu, rn, NO_SP)
12824 * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12825 }
12826
12827 /* 64 bit multiply and sub. */
12828 static void
12829 msub64 (sim_cpu *cpu)
12830 {
12831 unsigned rm = INSTR (20, 16);
12832 unsigned ra = INSTR (14, 10);
12833 unsigned rn = INSTR (9, 5);
12834 unsigned rd = INSTR (4, 0);
12835
12836 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12837 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12838 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12839 - aarch64_get_reg_u64 (cpu, rn, NO_SP)
12840 * aarch64_get_reg_u64 (cpu, rm, NO_SP));
12841 }
12842
12843 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit. */
12844 static void
12845 smaddl (sim_cpu *cpu)
12846 {
12847 unsigned rm = INSTR (20, 16);
12848 unsigned ra = INSTR (14, 10);
12849 unsigned rn = INSTR (9, 5);
12850 unsigned rd = INSTR (4, 0);
12851
12852 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12853 obtain a 64 bit product. */
12854 aarch64_set_reg_s64
12855 (cpu, rd, NO_SP,
12856 aarch64_get_reg_s64 (cpu, ra, NO_SP)
12857 + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12858 * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12859 }
12860
12861 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
12862 static void
12863 smsubl (sim_cpu *cpu)
12864 {
12865 unsigned rm = INSTR (20, 16);
12866 unsigned ra = INSTR (14, 10);
12867 unsigned rn = INSTR (9, 5);
12868 unsigned rd = INSTR (4, 0);
12869
12870 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12871 obtain a 64 bit product. */
12872 aarch64_set_reg_s64
12873 (cpu, rd, NO_SP,
12874 aarch64_get_reg_s64 (cpu, ra, NO_SP)
12875 - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12876 * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12877 }
12878
12879 /* Integer Multiply/Divide. */
12880
12881 /* First some macros and a helper function. */
12882 /* Macros to test or access elements of 64 bit words. */
12883
12884 /* Mask used to access lo 32 bits of 64 bit unsigned int. */
12885 #define LOW_WORD_MASK ((1ULL << 32) - 1)
12886 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int. */
12887 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
12888 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int. */
12889 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
12890
12891 /* Offset of sign bit in 64 bit signed integer. */
12892 #define SIGN_SHIFT_U64 63
12893 /* The sign bit itself -- also identifies the minimum negative int value. */
12894 #define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
12895 /* Return true if a 64 bit signed int presented as an unsigned int is the
12896 most negative value. */
12897 #define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
12898 /* Return true (non-zero) if a 64 bit signed int presented as an unsigned
12899 int has its sign bit set. */
12900 #define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
12901 /* Return 1L or -1L according to whether a 64 bit signed int presented as
12902 an unsigned int has its sign bit set or not. */
12903 #define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
12904 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int. */
12905 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
12906
12907 /* Multiply two 64 bit ints and return
12908 the hi 64 bits of the 128 bit product. */
12909
12910 static uint64_t
12911 mul64hi (uint64_t value1, uint64_t value2)
12912 {
12913 uint64_t resultmid1;
12914 uint64_t result;
12915 uint64_t value1_lo = lowWordToU64 (value1);
12916 uint64_t value1_hi = highWordToU64 (value1) ;
12917 uint64_t value2_lo = lowWordToU64 (value2);
12918 uint64_t value2_hi = highWordToU64 (value2);
12919
12920 /* Cross-multiply and collect results. */
12921 uint64_t xproductlo = value1_lo * value2_lo;
12922 uint64_t xproductmid1 = value1_lo * value2_hi;
12923 uint64_t xproductmid2 = value1_hi * value2_lo;
12924 uint64_t xproducthi = value1_hi * value2_hi;
12925 uint64_t carry = 0;
12926 /* Start accumulating 64 bit results. */
12927 /* Drop bottom half of lowest cross-product. */
12928 uint64_t resultmid = xproductlo >> 32;
12929 /* Add in middle products. */
12930 resultmid = resultmid + xproductmid1;
12931
12932 /* Check for overflow. */
12933 if (resultmid < xproductmid1)
12934 /* Carry over 1 into top cross-product. */
12935 carry++;
12936
12937 resultmid1 = resultmid + xproductmid2;
12938
12939 /* Check for overflow. */
12940 if (resultmid1 < xproductmid2)
12941 /* Carry over 1 into top cross-product. */
12942 carry++;
12943
12944 /* Drop lowest 32 bits of middle cross-product. */
12945 result = resultmid1 >> 32;
12946
12947 /* Add in the top cross-product and any carry. */
12948 result += xproducthi + carry;
12949
12950 return result;
12951 }
12952
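/* Where the host compiler provides the non-standard unsigned __int128
type (advertised by GCC/Clang via __SIZEOF_INT128__), the
cross-multiplication above can be checked against a direct 128 bit
product. Illustrative only; nothing calls mul64hi_ref. */
#ifdef __SIZEOF_INT128__
static uint64_t
mul64hi_ref (uint64_t value1, uint64_t value2)
{
/* Widen, multiply, and keep the high 64 bits. */
return (uint64_t) (((unsigned __int128) value1 * value2) >> 64);
}
#endif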
12953 /* Signed multiply high, source, source2 :
12954 64 bit, dest <-- high 64-bit of result. */
12955 static void
12956 smulh (sim_cpu *cpu)
12957 {
12958 uint64_t uresult;
12959 int64_t result;
12960 unsigned rm = INSTR (20, 16);
12961 unsigned rn = INSTR (9, 5);
12962 unsigned rd = INSTR (4, 0);
12963 GReg ra = INSTR (14, 10);
12964 int64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12965 int64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12966 uint64_t uvalue1;
12967 uint64_t uvalue2;
12968 int64_t signum = 1;
12969
12970 if (ra != R31)
12971 HALT_UNALLOC;
12972
12973 /* Convert to unsigned and use the unsigned mul64hi routine,
12974 then fix the sign up afterwards. */
12975 if (value1 < 0)
12976 {
12977 signum *= -1L;
12978 uvalue1 = -(uint64_t) value1; /* Unsigned negate avoids INT64_MIN overflow. */
12979 }
12980 else
12981 {
12982 uvalue1 = value1;
12983 }
12984
12985 if (value2 < 0)
12986 {
12987 signum *= -1L;
12988 uvalue2 = -(uint64_t) value2; /* Unsigned negate avoids INT64_MIN overflow. */
12989 }
12990 else
12991 {
12992 uvalue2 = value2;
12993 }
12994
12995 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12996 uresult = mul64hi (uvalue1, uvalue2);
12997 /* Negating the 128 bit product inverts its high half, with a carry
12998 in only when its low half (uvalue1 * uvalue2) is zero. */
12999 result = signum < 0 ? ~uresult + (uvalue1 * uvalue2 == 0) : uresult;
13000 aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
13001 }
13002
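/* An alternative formulation of the signed high half, as a sketch:
in two's complement the signed high word is the unsigned high word
minus each operand whenever the other is negative, which avoids the
sign/magnitude juggling above. smulh_ref is a hypothetical helper,
not used by the decoder. */
static int64_t
smulh_ref (int64_t a, int64_t b)
{
uint64_t hi = mul64hi ((uint64_t) a, (uint64_t) b);

/* Correction terms follow from a_signed = a_unsigned - 2^64 when
a < 0, and likewise for b. */
if (a < 0)
hi -= (uint64_t) b;
if (b < 0)
hi -= (uint64_t) a;

return (int64_t) hi;
}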
13003 /* Unsigned multiply add long -- source, source2 :
13004 32 bit, source3 : 64 bit. */
13005 static void
13006 umaddl (sim_cpu *cpu)
13007 {
13008 unsigned rm = INSTR (20, 16);
13009 unsigned ra = INSTR (14, 10);
13010 unsigned rn = INSTR (9, 5);
13011 unsigned rd = INSTR (4, 0);
13012
13013 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13014 /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13015 obtain a 64 bit product. */
13016 aarch64_set_reg_u64
13017 (cpu, rd, NO_SP,
13018 aarch64_get_reg_u64 (cpu, ra, NO_SP)
13019 + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13020 * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13021 }
13022
13023 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
13024 static void
13025 umsubl (sim_cpu *cpu)
13026 {
13027 unsigned rm = INSTR (20, 16);
13028 unsigned ra = INSTR (14, 10);
13029 unsigned rn = INSTR (9, 5);
13030 unsigned rd = INSTR (4, 0);
13031
13032 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13033 /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13034 obtain a 64 bit product. */
13035 aarch64_set_reg_u64
13036 (cpu, rd, NO_SP,
13037 aarch64_get_reg_u64 (cpu, ra, NO_SP)
13038 - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13039 * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13040 }
13041
13042 /* Unsigned multiply high, source, source2 :
13043 64 bit, dest <-- high 64-bit of result. */
13044 static void
13045 umulh (sim_cpu *cpu)
13046 {
13047 unsigned rm = INSTR (20, 16);
13048 unsigned rn = INSTR (9, 5);
13049 unsigned rd = INSTR (4, 0);
13050 GReg ra = INSTR (14, 10);
13051
13052 if (ra != R31)
13053 HALT_UNALLOC;
13054
13055 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13056 aarch64_set_reg_u64 (cpu, rd, NO_SP,
13057 mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13058 aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13059 }
13060
13061 static void
13062 dexDataProc3Source (sim_cpu *cpu)
13063 {
13064 /* assert instr[28,24] == 11011. */
13065 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13066 instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13067 instr[23,21] = op31 : 111 ==> UNALLOC, ow ==> ok
13068 instr[15] = o0 : 0/1 ==> ok
13069 instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB, (32/64 bit)
13070 0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13071 0100 ==> SMULH, (64 bit only)
13072 1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13073 1100 ==> UMULH (64 bit only)
13074 ow ==> UNALLOC. */
13075
13076 uint32_t dispatch;
13077 uint32_t size = INSTR (31, 31);
13078 uint32_t op54 = INSTR (30, 29);
13079 uint32_t op31 = INSTR (23, 21);
13080 uint32_t o0 = INSTR (15, 15);
13081
13082 if (op54 != 0)
13083 HALT_UNALLOC;
13084
13085 if (size == 0)
13086 {
13087 if (op31 != 0)
13088 HALT_UNALLOC;
13089
13090 if (o0 == 0)
13091 madd32 (cpu);
13092 else
13093 msub32 (cpu);
13094 return;
13095 }
13096
13097 dispatch = (op31 << 1) | o0;
13098
13099 switch (dispatch)
13100 {
13101 case 0: madd64 (cpu); return;
13102 case 1: msub64 (cpu); return;
13103 case 2: smaddl (cpu); return;
13104 case 3: smsubl (cpu); return;
13105 case 4: smulh (cpu); return;
13106 case 10: umaddl (cpu); return;
13107 case 11: umsubl (cpu); return;
13108 case 12: umulh (cpu); return;
13109 default: HALT_UNALLOC;
13110 }
13111 }
13112
13113 static void
13114 dexDPReg (sim_cpu *cpu)
13115 {
13116 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13117 assert group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13118 bits [28:24:21] of a DPReg are the secondary dispatch vector. */
13119 uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13120
13121 switch (group2)
13122 {
13123 case DPREG_LOG_000:
13124 case DPREG_LOG_001:
13125 dexLogicalShiftedRegister (cpu); return;
13126
13127 case DPREG_ADDSHF_010:
13128 dexAddSubtractShiftedRegister (cpu); return;
13129
13130 case DPREG_ADDEXT_011:
13131 dexAddSubtractExtendedRegister (cpu); return;
13132
13133 case DPREG_ADDCOND_100:
13134 {
13135 /* This set bundles a variety of different operations. */
13136 /* Check for. */
13137 /* 1) add/sub w carry. */
13138 uint32_t mask1 = 0x1FE00000U;
13139 uint32_t val1 = 0x1A000000U;
13140 /* 2) cond compare register/immediate. */
13141 uint32_t mask2 = 0x1FE00000U;
13142 uint32_t val2 = 0x1A400000U;
13143 /* 3) cond select. */
13144 uint32_t mask3 = 0x1FE00000U;
13145 uint32_t val3 = 0x1A800000U;
13146 /* 4) data proc 1/2 source. */
13147 uint32_t mask4 = 0x1FE00000U;
13148 uint32_t val4 = 0x1AC00000U;
13149
13150 if ((aarch64_get_instr (cpu) & mask1) == val1)
13151 dexAddSubtractWithCarry (cpu);
13152
13153 else if ((aarch64_get_instr (cpu) & mask2) == val2)
13154 CondCompare (cpu);
13155
13156 else if ((aarch64_get_instr (cpu) & mask3) == val3)
13157 dexCondSelect (cpu);
13158
13159 else if ((aarch64_get_instr (cpu) & mask4) == val4)
13160 {
13161 /* Bit 30 is clear for data proc 2 source
13162 and set for data proc 1 source. */
13163 if (aarch64_get_instr (cpu) & (1U << 30))
13164 dexDataProc1Source (cpu);
13165 else
13166 dexDataProc2Source (cpu);
13167 }
13168
13169 else
13170 /* Should not reach here. */
13171 HALT_NYI;
13172
13173 return;
13174 }
13175
13176 case DPREG_3SRC_110:
13177 dexDataProc3Source (cpu); return;
13178
13179 case DPREG_UNALLOC_101:
13180 HALT_UNALLOC;
13181
13182 case DPREG_3SRC_111:
13183 dexDataProc3Source (cpu); return;
13184
13185 default:
13186 /* Should never reach here. */
13187 HALT_NYI;
13188 }
13189 }
13190
13191 /* Unconditional Branch immediate.
13192 Offset is a PC-relative byte offset in the range +/- 128MiB.
13193 The offset is assumed to be raw from the decode, i.e. the
13194 simulator is expected to scale it from a word offset to a byte offset. */
13195
13196 /* Unconditional branch. */
13197 static void
13198 buc (sim_cpu *cpu, int32_t offset)
13199 {
13200 aarch64_set_next_PC_by_offset (cpu, offset);
13201 }
13202
13203 static unsigned stack_depth = 0;
13204
13205 /* Unconditional branch and link -- writes return PC to LR. */
13206 static void
13207 bl (sim_cpu *cpu, int32_t offset)
13208 {
13209 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13210 aarch64_save_LR (cpu);
13211 aarch64_set_next_PC_by_offset (cpu, offset);
13212
13213 if (TRACE_BRANCH_P (cpu))
13214 {
13215 ++ stack_depth;
13216 TRACE_BRANCH (cpu,
13217 " %*scall %" PRIx64 " [%s]"
13218 " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13219 stack_depth, " ", aarch64_get_next_PC (cpu),
13220 aarch64_get_func (CPU_STATE (cpu),
13221 aarch64_get_next_PC (cpu)),
13222 aarch64_get_reg_u64 (cpu, 0, NO_SP),
13223 aarch64_get_reg_u64 (cpu, 1, NO_SP),
13224 aarch64_get_reg_u64 (cpu, 2, NO_SP)
13225 );
13226 }
13227 }
13228
13229 /* Unconditional Branch register.
13230 Branch/return address is in source register. */
13231
13232 /* Unconditional branch. */
13233 static void
13234 br (sim_cpu *cpu)
13235 {
13236 unsigned rn = INSTR (9, 5);
13237 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13238 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13239 }
13240
13241 /* Unconditional branch and link -- writes return PC to LR. */
13242 static void
13243 blr (sim_cpu *cpu)
13244 {
13245 unsigned rn = INSTR (9, 5);
13246
13247 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13248 /* The pseudo code in the spec says we update LR before fetching
13249 the value from rn. */
13250 aarch64_save_LR (cpu);
13251 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13252
13253 if (TRACE_BRANCH_P (cpu))
13254 {
13255 ++ stack_depth;
13256 TRACE_BRANCH (cpu,
13257 " %*scall %" PRIx64 " [%s]"
13258 " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13259 stack_depth, " ", aarch64_get_next_PC (cpu),
13260 aarch64_get_func (CPU_STATE (cpu),
13261 aarch64_get_next_PC (cpu)),
13262 aarch64_get_reg_u64 (cpu, 0, NO_SP),
13263 aarch64_get_reg_u64 (cpu, 1, NO_SP),
13264 aarch64_get_reg_u64 (cpu, 2, NO_SP)
13265 );
13266 }
13267 }
13268
13269 /* Return -- the assembler will default the source to LR. This is
13270 functionally equivalent to br but, presumably, unlike br it side
13271 effects the branch predictor. */
13272 static void
13273 ret (sim_cpu *cpu)
13274 {
13275 unsigned rn = INSTR (9, 5);
13276 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13277
13278 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13279 if (TRACE_BRANCH_P (cpu))
13280 {
13281 TRACE_BRANCH (cpu,
13282 " %*sreturn [result: %" PRIx64 "]",
13283 stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13284 -- stack_depth;
13285 }
13286 }
13287
13288 /* NOP -- we implement this and call it from the decode in case we
13289 want to intercept it later. */
13290
13291 static void
13292 nop (sim_cpu *cpu)
13293 {
13294 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13295 }
13296
13297 /* Data synchronization barrier. */
13298
13299 static void
13300 dsb (sim_cpu *cpu)
13301 {
13302 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13303 }
13304
13305 /* Data memory barrier. */
13306
13307 static void
13308 dmb (sim_cpu *cpu)
13309 {
13310 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13311 }
13312
13313 /* Instruction synchronization barrier. */
13314
13315 static void
13316 isb (sim_cpu *cpu)
13317 {
13318 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13319 }
13320
13321 static void
13322 dexBranchImmediate (sim_cpu *cpu)
13323 {
13324 /* assert instr[30,26] == 00101
13325 instr[31] ==> 0 == B, 1 == BL
13326 instr[25,0] == imm26 branch offset counted in words. */
13327
13328 uint32_t top = INSTR (31, 31);
13329 /* We have a 26 bit signed word offset which we need to pass to the
13330 execute routine as a signed byte offset. */
13331 int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13332
13333 if (top)
13334 bl (cpu, offset);
13335 else
13336 buc (cpu, offset);
13337 }
13338
13339 /* Control Flow. */
13340
13341 /* Conditional branch
13342
13343 Offset is a PC-relative byte offset in the range +/- 1MiB. Pos is
13344 a bit position in the range 0 .. 63 (used by the tbz/tbnz tests below).
13345
13346 cc is a CondCode enum value as pulled out of the decode.
13347
13348 N.B. any offset register (source) can only be Xn or Wn. */
13349
13350 static void
13351 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13352 {
13353 /* The test returns TRUE if CC is met. */
13354 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13355 if (testConditionCode (cpu, cc))
13356 aarch64_set_next_PC_by_offset (cpu, offset);
13357 }
13358
13359 /* 32 bit branch on register non-zero. */
13360 static void
13361 cbnz32 (sim_cpu *cpu, int32_t offset)
13362 {
13363 unsigned rt = INSTR (4, 0);
13364
13365 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13366 if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13367 aarch64_set_next_PC_by_offset (cpu, offset);
13368 }
13369
13370 /* 64 bit branch on register non-zero. */
13371 static void
13372 cbnz (sim_cpu *cpu, int32_t offset)
13373 {
13374 unsigned rt = INSTR (4, 0);
13375
13376 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13377 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13378 aarch64_set_next_PC_by_offset (cpu, offset);
13379 }
13380
13381 /* 32 bit branch on register zero. */
13382 static void
13383 cbz32 (sim_cpu *cpu, int32_t offset)
13384 {
13385 unsigned rt = INSTR (4, 0);
13386
13387 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13388 if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13389 aarch64_set_next_PC_by_offset (cpu, offset);
13390 }
13391
13392 /* 64 bit branch on register zero. */
13393 static void
13394 cbz (sim_cpu *cpu, int32_t offset)
13395 {
13396 unsigned rt = INSTR (4, 0);
13397
13398 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13399 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13400 aarch64_set_next_PC_by_offset (cpu, offset);
13401 }
13402
13403 /* Branch on register bit test non-zero -- one size fits all. */
13404 static void
13405 tbnz (sim_cpu *cpu, uint32_t pos, int32_t offset)
13406 {
13407 unsigned rt = INSTR (4, 0);
13408
13409 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13410 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13411 aarch64_set_next_PC_by_offset (cpu, offset);
13412 }
13413
13414 /* Branch on register bit test zero -- one size fits all. */
13415 static void
13416 tbz (sim_cpu *cpu, uint32_t pos, int32_t offset)
13417 {
13418 unsigned rt = INSTR (4, 0);
13419
13420 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13421 if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13422 aarch64_set_next_PC_by_offset (cpu, offset);
13423 }
13424
13425 static void
13426 dexCompareBranchImmediate (sim_cpu *cpu)
13427 {
13428 /* instr[30,25] = 01 1010
13429 instr[31] = size : 0 ==> 32, 1 ==> 64
13430 instr[24] = op : 0 ==> CBZ, 1 ==> CBNZ
13431 instr[23,5] = simm19 branch offset counted in words
13432 instr[4,0] = rt */
13433
13434 uint32_t size = INSTR (31, 31);
13435 uint32_t op = INSTR (24, 24);
13436 int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13437
13438 if (size == 0)
13439 {
13440 if (op == 0)
13441 cbz32 (cpu, offset);
13442 else
13443 cbnz32 (cpu, offset);
13444 }
13445 else
13446 {
13447 if (op == 0)
13448 cbz (cpu, offset);
13449 else
13450 cbnz (cpu, offset);
13451 }
13452 }
13453
13454 static void
13455 dexTestBranchImmediate (sim_cpu *cpu)
13456 {
13457 /* instr[31] = b5 : bit 5 of test bit idx
13458 instr[30,25] = 01 1011
13459 instr[24] = op : 0 ==> TBZ, 1 ==> TBNZ
13460 instr[23,19] = b40 : bits 4 to 0 of test bit idx
13461 instr[18,5] = simm14 : signed offset counted in words
13462 instr[4,0] = uimm5 */
13463
13464 uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13465 int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13466
13467 NYI_assert (30, 25, 0x1b);
13468
13469 if (INSTR (24, 24) == 0)
13470 tbz (cpu, pos, offset);
13471 else
13472 tbnz (cpu, pos, offset);
13473 }
13474
13475 static void
13476 dexCondBranchImmediate (sim_cpu *cpu)
13477 {
13478 /* instr[31,25] = 010 1010
13479 instr[24] = op1 : op1:op0 == 00 ==> B.cond, ow ==> UNALLOC
13480 instr[23,5] = simm19 : signed offset counted in words
13481 instr[4] = op0
13482 instr[3,0] = cond */
13483
13484 int32_t offset;
13485 uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13486
13487 NYI_assert (31, 25, 0x2a);
13488
13489 if (op != 0)
13490 HALT_UNALLOC;
13491
13492 offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13493
13494 bcc (cpu, offset, INSTR (3, 0));
13495 }
13496
13497 static void
13498 dexBranchRegister (sim_cpu *cpu)
13499 {
13500 /* instr[31,25] = 110 1011
13501 instr[24,21] = op : 0 ==> BR, 1 ==> BLR, 2 ==> RET, 4 ==> ERET, 5 ==> DRPS
13502 instr[20,16] = op2 : must be 11111
13503 instr[15,10] = op3 : must be 000000
13504 instr[4,0] = op4 : must be 00000. */
13505
13506 uint32_t op = INSTR (24, 21);
13507 uint32_t op2 = INSTR (20, 16);
13508 uint32_t op3 = INSTR (15, 10);
13509 uint32_t op4 = INSTR (4, 0);
13510
13511 NYI_assert (31, 25, 0x6b);
13512
13513 if (op2 != 0x1F || op3 != 0 || op4 != 0)
13514 HALT_UNALLOC;
13515
13516 if (op == 0)
13517 br (cpu);
13518
13519 else if (op == 1)
13520 blr (cpu);
13521
13522 else if (op == 2)
13523 ret (cpu);
13524
13525 else
13526 {
13527 /* ERET and DRPS require 0b11111 for rn = instr [9,5]. */
13528 /* anything else is unallocated. */
13529 uint32_t rn = INSTR (9, 5);
13530
13531 if (rn != 0x1f)
13532 HALT_UNALLOC;
13533
13534 if (op == 4 || op == 5)
13535 HALT_NYI;
13536
13537 HALT_UNALLOC;
13538 }
13539 }
13540
13541 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13542 but this may not be available. So instead we define the values we need
13543 here. */
13544 #define AngelSVC_Reason_Open 0x01
13545 #define AngelSVC_Reason_Close 0x02
13546 #define AngelSVC_Reason_Write 0x05
13547 #define AngelSVC_Reason_Read 0x06
13548 #define AngelSVC_Reason_IsTTY 0x09
13549 #define AngelSVC_Reason_Seek 0x0A
13550 #define AngelSVC_Reason_FLen 0x0C
13551 #define AngelSVC_Reason_Remove 0x0E
13552 #define AngelSVC_Reason_Rename 0x0F
13553 #define AngelSVC_Reason_Clock 0x10
13554 #define AngelSVC_Reason_Time 0x11
13555 #define AngelSVC_Reason_System 0x12
13556 #define AngelSVC_Reason_Errno 0x13
13557 #define AngelSVC_Reason_GetCmdLine 0x15
13558 #define AngelSVC_Reason_HeapInfo 0x16
13559 #define AngelSVC_Reason_ReportException 0x18
13560 #define AngelSVC_Reason_Elapsed 0x30
13561
13562
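/* A sketch of the guest-side convention the handler below assumes:
w0 carries the reason code, x1 points at a parameter block and the
result is returned in x0. The struct mirrors the three reads made by
the AngelSVC_Reason_Write case (ptr + 0, + 8, + 16); the type name is
illustrative and nothing references it. */
struct angel_write_block
{
uint64_t fd; /* File descriptor. */
uint64_t buf; /* Address of the data to write. */
uint64_t len; /* Number of bytes to write. */
};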
13563 static void
13564 handle_halt (sim_cpu *cpu, uint32_t val)
13565 {
13566 uint64_t result = 0;
13567
13568 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13569 if (val != 0xf000)
13570 {
13571 TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13572 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13573 sim_stopped, SIM_SIGTRAP);
13574 }
13575
13576 /* We have encountered an Angel SVC call. See if we can process it. */
13577 switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13578 {
13579 case AngelSVC_Reason_HeapInfo:
13580 {
13581 /* Get the values. */
13582 uint64_t stack_top = aarch64_get_stack_start (cpu);
13583 uint64_t heap_base = aarch64_get_heap_start (cpu);
13584
13585 /* Get the pointer */
13586 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13587 ptr = aarch64_get_mem_u64 (cpu, ptr);
13588
13589 /* Fill in the memory block. */
13590 /* Start addr of heap. */
13591 aarch64_set_mem_u64 (cpu, ptr + 0, heap_base);
13592 /* End addr of heap. */
13593 aarch64_set_mem_u64 (cpu, ptr + 8, stack_top);
13594 /* Lowest stack addr. */
13595 aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13596 /* Initial stack addr. */
13597 aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13598
13599 TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13600 }
13601 break;
13602
13603 case AngelSVC_Reason_Open:
13604 {
13605 /* Get the pointer */
13606 /* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK); */
13607 /* FIXME: For now we just assume that we will only be asked
13608 to open the standard file descriptors. */
13609 static int fd = 0;
13610 result = fd ++;
13611
13612 TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13613 }
13614 break;
13615
13616 case AngelSVC_Reason_Close:
13617 {
13618 uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13619 TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13620 result = 0;
13621 }
13622 break;
13623
13624 case AngelSVC_Reason_Errno:
13625 result = 0;
13626 TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13627 break;
13628
13629 case AngelSVC_Reason_Clock:
13630 result =
13631 #ifdef CLOCKS_PER_SEC
13632 (CLOCKS_PER_SEC >= 100)
13633 ? (clock () / (CLOCKS_PER_SEC / 100))
13634 : ((clock () * 100) / CLOCKS_PER_SEC)
13635 #else
13636 /* Presume unix... clock() returns microseconds. */
13637 (clock () / 10000)
13638 #endif
13639 ;
13640 TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13641 break;
13642
13643 case AngelSVC_Reason_GetCmdLine:
13644 {
13645 /* Get the pointer */
13646 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13647 ptr = aarch64_get_mem_u64 (cpu, ptr);
13648
13649 /* FIXME: No command line for now. */
13650 aarch64_set_mem_u64 (cpu, ptr, 0);
13651 TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13652 }
13653 break;
13654
13655 case AngelSVC_Reason_IsTTY:
13656 result = 1;
13657 TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13658 break;
13659
13660 case AngelSVC_Reason_Write:
13661 {
13662 /* Get the pointer */
13663 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13664 /* Get the write control block. */
13665 uint64_t fd = aarch64_get_mem_u64 (cpu, ptr);
13666 uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13667 uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13668
13669 TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13670 PRIx64 " on descriptor %" PRIx64,
13671 len, buf, fd);
13672
13673 if (len > 1280)
13674 {
13675 TRACE_SYSCALL (cpu,
13676 " AngelSVC: Write: Suspiciously long write: %ld",
13677 (long) len);
13678 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13679 sim_stopped, SIM_SIGBUS);
13680 }
13681 else if (fd == 1)
13682 {
13683 printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13684 }
13685 else if (fd == 2)
13686 {
13687 TRACE (cpu, 0, "\n");
13688 sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13689 (int) len, aarch64_get_mem_ptr (cpu, buf));
13690 TRACE (cpu, 0, "\n");
13691 }
13692 else
13693 {
13694 TRACE_SYSCALL (cpu,
13695 " AngelSVC: Write: Unexpected file handle: %d",
13696 (int) fd);
13697 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13698 sim_stopped, SIM_SIGABRT);
13699 }
13700 }
13701 break;
13702
13703 case AngelSVC_Reason_ReportException:
13704 {
13705 /* Get the pointer */
13706 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13707 /* ptr = aarch64_get_mem_u64 (cpu, ptr); */
13708 uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13709 uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13710
13711 TRACE_SYSCALL (cpu,
13712 "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13713 type, state);
13714
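/* Type 0x20026 is ADP_Stopped_ApplicationExit, i.e. a normal exit. */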
13715 if (type == 0x20026)
13716 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13717 sim_exited, state);
13718 else
13719 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13720 sim_stopped, SIM_SIGINT);
13721 }
13722 break;
13723
13724 case AngelSVC_Reason_Read:
13725 case AngelSVC_Reason_FLen:
13726 case AngelSVC_Reason_Seek:
13727 case AngelSVC_Reason_Remove:
13728 case AngelSVC_Reason_Time:
13729 case AngelSVC_Reason_System:
13730 case AngelSVC_Reason_Rename:
13731 case AngelSVC_Reason_Elapsed:
13732 default:
13733 TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13734 aarch64_get_reg_u32 (cpu, 0, NO_SP));
13735 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13736 sim_stopped, SIM_SIGTRAP);
13737 }
13738
13739 aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13740 }
13741
13742 static void
13743 dexExcpnGen (sim_cpu *cpu)
13744 {
13745 /* instr[31:24] = 11010100
13746 instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13747 010 ==> HLT, 101 ==> DBG GEN EXCPN
13748 instr[20,5] = imm16
13749 instr[4,2] = opc2 000 ==> OK, ow ==> UNALLOC
13750 instr[1,0] = LL : discriminates opc */
13751
13752 uint32_t opc = INSTR (23, 21);
13753 uint32_t imm16 = INSTR (20, 5);
13754 uint32_t opc2 = INSTR (4, 2);
13755 uint32_t LL;
13756
13757 NYI_assert (31, 24, 0xd4);
13758
13759 if (opc2 != 0)
13760 HALT_UNALLOC;
13761
13762 LL = INSTR (1, 0);
13763
13764 /* We only implement HLT and BRK for now. */
13765 if (opc == 1 && LL == 0)
13766 {
13767 TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13768 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13769 sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13770 }
13771
13772 if (opc == 2 && LL == 0)
13773 handle_halt (cpu, imm16);
13774
13775 else if (opc == 0 || opc == 5)
13776 HALT_NYI;
13777
13778 else
13779 HALT_UNALLOC;
13780 }
13781
13782 /* Stub for accessing system registers. */
13783
13784 static uint64_t
13785 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13786 unsigned crm, unsigned op2)
13787 {
13788 if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13789 /* DCZID_EL0 - the Data Cache Zero ID register.
13790 We do not support DC ZVA at the moment, so
13791 we return a value with the disable bit set.
13792 We implement support for the DCZID register since
13793 it is used by the C library's memset function. */
13794 return ((uint64_t) 1) << 4;
13795
13796 if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13797 /* Cache Type Register. */
13798 return 0x80008000UL;
13799
13800 if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13801 /* TPIDR_EL0 - thread pointer id. */
13802 return aarch64_get_thread_id (cpu);
13803
13804 if (op1 == 3 && crm == 4 && op2 == 0)
13805 return aarch64_get_FPCR (cpu);
13806
13807 if (op1 == 3 && crm == 4 && op2 == 1)
13808 return aarch64_get_FPSR (cpu);
13809
13810 else if (op1 == 3 && crm == 2 && op2 == 0)
13811 return aarch64_get_CPSR (cpu);
13812
13813 HALT_NYI;
13814 }
13815
13816 static void
13817 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13818 unsigned crm, unsigned op2, uint64_t val)
13819 {
13820 if (op1 == 3 && crm == 4 && op2 == 0)
13821 aarch64_set_FPCR (cpu, val);
13822
13823 else if (op1 == 3 && crm == 4 && op2 == 1)
13824 aarch64_set_FPSR (cpu, val);
13825
13826 else if (op1 == 3 && crm == 2 && op2 == 0)
13827 aarch64_set_CPSR (cpu, val);
13828
13829 else
13830 HALT_NYI;
13831 }
13832
13833 static void
13834 do_mrs (sim_cpu *cpu)
13835 {
13836 /* instr[31:20] = 1101 0101 0011
13837 instr[19] = op0
13838 instr[18,16] = op1
13839 instr[15,12] = CRn
13840 instr[11,8] = CRm
13841 instr[7,5] = op2
13842 instr[4,0] = Rt */
13843 unsigned sys_op0 = INSTR (19, 19) + 2;
13844 unsigned sys_op1 = INSTR (18, 16);
13845 unsigned sys_crn = INSTR (15, 12);
13846 unsigned sys_crm = INSTR (11, 8);
13847 unsigned sys_op2 = INSTR (7, 5);
13848 unsigned rt = INSTR (4, 0);
13849
13850 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13851 aarch64_set_reg_u64 (cpu, rt, NO_SP,
13852 system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
13853 }
13854
13855 static void
13856 do_MSR_immediate (sim_cpu *cpu)
13857 {
13858 /* instr[31:19] = 1101 0101 0000 0
13859 instr[18,16] = op1
13860 instr[15,12] = 0100
13861 instr[11,8] = CRm
13862 instr[7,5] = op2
13863 instr[4,0] = 1 1111 */
13864
13865 unsigned op1 = INSTR (18, 16);
13866 /*unsigned crm = INSTR (11, 8);*/
13867 unsigned op2 = INSTR (7, 5);
13868
13869 NYI_assert (31, 19, 0x1AA0);
13870 NYI_assert (15, 12, 0x4);
13871 NYI_assert (4, 0, 0x1F);
13872
13873 if (op1 == 0)
13874 {
13875 if (op2 == 5)
13876 HALT_NYI; /* set SPSel. */
13877 else
13878 HALT_UNALLOC;
13879 }
13880 else if (op1 == 3)
13881 {
13882 if (op2 == 6)
13883 HALT_NYI; /* set DAIFset. */
13884 else if (op2 == 7)
13885 HALT_NYI; /* set DAIFclr. */
13886 else
13887 HALT_UNALLOC;
13888 }
13889 else
13890 HALT_UNALLOC;
13891 }
13892
13893 static void
13894 do_MSR_reg (sim_cpu *cpu)
13895 {
13896 /* instr[31:20] = 1101 0101 0001
13897 instr[19] = op0
13898 instr[18,16] = op1
13899 instr[15,12] = CRn
13900 instr[11,8] = CRm
13901 instr[7,5] = op2
13902 instr[4,0] = Rt */
13903
13904 unsigned sys_op0 = INSTR (19, 19) + 2;
13905 unsigned sys_op1 = INSTR (18, 16);
13906 unsigned sys_crn = INSTR (15, 12);
13907 unsigned sys_crm = INSTR (11, 8);
13908 unsigned sys_op2 = INSTR (7, 5);
13909 unsigned rt = INSTR (4, 0);
13910
13911 NYI_assert (31, 20, 0xD51);
13912
13913 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13914 system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
13915 aarch64_get_reg_u64 (cpu, rt, NO_SP));
13916 }
13917
13918 static void
13919 do_SYS (sim_cpu *cpu)
13920 {
13921 /* instr[31,19] = 1101 0101 0000 1
13922 instr[18,16] = op1
13923 instr[15,12] = CRn
13924 instr[11,8] = CRm
13925 instr[7,5] = op2
13926 instr[4,0] = Rt */
13927 NYI_assert (31, 19, 0x1AA1);
13928
13929 /* FIXME: For now we just silently accept system ops. */
13930 }
13931
13932 static void
13933 dexSystem (sim_cpu *cpu)
13934 {
13935 /* instr[31:22] = 1101 01010 0
13936 instr[21] = L
13937 instr[20,19] = op0
13938 instr[18,16] = op1
13939 instr[15,12] = CRn
13940 instr[11,8] = CRm
13941 instr[7,5] = op2
13942 instr[4,0] = uimm5 */
13943
13944 /* We are interested in HINT, DSB, DMB and ISB
13945
13946 Hint #0 encodes NOOP (this is the only hint we care about)
13947 L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
13948 CRm != 0000 OR (CRm == 0000 AND (op2 == 000 OR op2 > 101))
13949
13950 DSB, DMB, ISB are data synchronization barrier, data memory
13951 barrier and instruction synchronization barrier, respectively,
13952 where
13953 L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
13954 op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
13955 CRm<3:2> ==> domain, CRm<1:0> ==> types,
13956 domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
13957 10 ==> InnerShareable, 11 ==> FullSystem
13958 types : 01 ==> Reads, 10 ==> Writes,
13959 11 ==> All, 00 ==> All (domain == FullSystem). */
13960
13961 unsigned rt = INSTR (4, 0);
13962
13963 NYI_assert (31, 22, 0x354);
13964
13965 switch (INSTR (21, 12))
13966 {
13967 case 0x032:
13968 if (rt == 0x1F)
13969 {
13970 /* NOP has CRm != 0000 OR
13971 (CRm == 0000 AND (op2 == 000 OR op2 > 101)). */
13972 uint32_t crm = INSTR (11, 8);
13973 uint32_t op2 = INSTR (7, 5);
13974
13975 if (crm != 0 || (op2 == 0 || op2 > 5))
13976 {
13977 /* Actually call nop method so we can reimplement it later. */
13978 nop (cpu);
13979 return;
13980 }
13981 }
13982 HALT_NYI;
13983
13984 case 0x033:
13985 {
13986 uint32_t op2 = INSTR (7, 5);
13987
13988 switch (op2)
13989 {
13990 case 2: HALT_NYI;
13991 case 4: dsb (cpu); return;
13992 case 5: dmb (cpu); return;
13993 case 6: isb (cpu); return;
13994 default: HALT_UNALLOC;
13995 }
13996 }
13997
13998 case 0x3B0:
13999 case 0x3B4:
14000 case 0x3BD:
14001 do_mrs (cpu);
14002 return;
14003
14004 case 0x0B7:
14005 do_SYS (cpu); /* DC is an alias of SYS. */
14006 return;
14007
14008 default:
14009 if (INSTR (21, 20) == 0x1)
14010 do_MSR_reg (cpu);
14011 else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
14012 do_MSR_immediate (cpu);
14013 else
14014 HALT_NYI;
14015 return;
14016 }
14017 }
14018
14019 static void
14020 dexBr (sim_cpu *cpu)
14021 {
14022 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
14023 assert group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
14024 bits [31,29] of a BrExSys are the secondary dispatch vector. */
14025 uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
14026
14027 switch (group2)
14028 {
14029 case BR_IMM_000:
14030 return dexBranchImmediate (cpu);
14031
14032 case BR_IMMCMP_001:
14033 /* Compare has bit 25 clear while test has it set. */
14034 if (!INSTR (25, 25))
14035 dexCompareBranchImmediate (cpu);
14036 else
14037 dexTestBranchImmediate (cpu);
14038 return;
14039
14040 case BR_IMMCOND_010:
14041 /* This is a conditional branch if bit 25 is clear, otherwise
14042 unallocated. */
14043 if (!INSTR (25, 25))
14044 dexCondBranchImmediate (cpu);
14045 else
14046 HALT_UNALLOC;
14047 return;
14048
14049 case BR_UNALLOC_011:
14050 HALT_UNALLOC;
14051
14052 case BR_IMM_100:
14053 dexBranchImmediate (cpu);
14054 return;
14055
14056 case BR_IMMCMP_101:
14057 /* Compare has bit 25 clear while test has it set. */
14058 if (!INSTR (25, 25))
14059 dexCompareBranchImmediate (cpu);
14060 else
14061 dexTestBranchImmediate (cpu);
14062 return;
14063
14064 case BR_REG_110:
14065 /* Unconditional branch reg has bit 25 set. */
14066 if (INSTR (25, 25))
14067 dexBranchRegister (cpu);
14068
14069 /* This includes Excpn Gen, System and unalloc operations.
14070 We need to decode the Excpn Gen operation BRK so we can plant
14071 debugger entry points.
14072 Excpn Gen operations have instr [24] = 0.
14073 We need to decode at least one of the System operations, NOP,
14074 which is an alias for HINT #0.
14075 System operations have instr [24,22] = 100. */
14076 else if (INSTR (24, 24) == 0)
14077 dexExcpnGen (cpu);
14078
14079 else if (INSTR (24, 22) == 4)
14080 dexSystem (cpu);
14081
14082 else
14083 HALT_UNALLOC;
14084
14085 return;
14086
14087 case BR_UNALLOC_111:
14088 HALT_UNALLOC;
14089
14090 default:
14091 /* Should never reach here. */
14092 HALT_NYI;
14093 }
14094 }
14095
14096 static void
14097 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14098 {
14099 /* We need to check if gdb wants in here. */
14100 /* checkBreak (cpu); */
14101
14102 uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14103
14104 switch (group)
14105 {
14106 case GROUP_PSEUDO_0000: dexPseudo (cpu); break;
14107 case GROUP_LDST_0100: dexLdSt (cpu); break;
14108 case GROUP_DPREG_0101: dexDPReg (cpu); break;
14109 case GROUP_LDST_0110: dexLdSt (cpu); break;
14110 case GROUP_ADVSIMD_0111: dexAdvSIMD0 (cpu); break;
14111 case GROUP_DPIMM_1000: dexDPImm (cpu); break;
14112 case GROUP_DPIMM_1001: dexDPImm (cpu); break;
14113 case GROUP_BREXSYS_1010: dexBr (cpu); break;
14114 case GROUP_BREXSYS_1011: dexBr (cpu); break;
14115 case GROUP_LDST_1100: dexLdSt (cpu); break;
14116 case GROUP_DPREG_1101: dexDPReg (cpu); break;
14117 case GROUP_LDST_1110: dexLdSt (cpu); break;
14118 case GROUP_ADVSIMD_1111: dexAdvSIMD1 (cpu); break;
14119
14120 case GROUP_UNALLOC_0001:
14121 case GROUP_UNALLOC_0010:
14122 case GROUP_UNALLOC_0011:
14123 HALT_UNALLOC;
14124
14125 default:
14126 /* Should never reach here. */
14127 HALT_NYI;
14128 }
14129 }
14130
14131 static bfd_boolean
14132 aarch64_step (sim_cpu *cpu)
14133 {
14134 uint64_t pc = aarch64_get_PC (cpu);
14135
14136 if (pc == TOP_LEVEL_RETURN_PC)
14137 return FALSE;
14138
14139 aarch64_set_next_PC (cpu, pc + 4);
14140
14141 /* Code is always little-endian. */
14142 sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14143 & aarch64_get_instr (cpu), pc, 4);
14144 aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14145
14146 TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14147 aarch64_get_instr (cpu));
14148 TRACE_DISASM (cpu, pc);
14149
14150 aarch64_decode_and_execute (cpu, pc);
14151
14152 return TRUE;
14153 }
14154
14155 void
14156 aarch64_run (SIM_DESC sd)
14157 {
14158 sim_cpu *cpu = STATE_CPU (sd, 0);
14159
14160 while (aarch64_step (cpu))
14161 {
14162 aarch64_update_PC (cpu);
14163
14164 if (sim_events_tick (sd))
14165 sim_events_process (sd);
14166 }
14167
14168 sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14169 sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14170 }
14171
14172 void
14173 aarch64_init (sim_cpu *cpu, uint64_t pc)
14174 {
14175 uint64_t sp = aarch64_get_stack_start (cpu);
14176
14177 /* Install SP, FP and PC and set LR to -20
14178 so we can detect a top-level return. */
14179 aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14180 aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14181 aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14182 aarch64_set_next_PC (cpu, pc);
14183 aarch64_update_PC (cpu);
14184 aarch64_init_LIT_table ();
14185 }